Coverage for src/wiktextract/extractor/en/form_descriptions.py: 76%

1323 statements  

coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1# Code for parsing linguistic form descriptions and tags for word senses 

2# (both the word entry head - initial part and parenthesized parts - 

3# and tags at the beginning of word senses) 

4# 

5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

6 

7import functools 

8import re 

9import unicodedata 

10from typing import ( 

11 Any, 

12 Literal, 

13 Optional, 

14 Sequence, 

15 Union, 

16) 

17 

18import Levenshtein 

19from nltk import TweetTokenizer # type:ignore[import-untyped] 

20 

21from ...datautils import data_append, data_extend, split_at_comma_semi 

22from ...tags import ( 

23 alt_of_tags, 

24 form_of_tags, 

25 head_final_bantu_langs, 

26 head_final_bantu_map, 

27 head_final_numeric_langs, 

28 head_final_other_langs, 

29 head_final_other_map, 

30 head_final_semitic_langs, 

31 head_final_semitic_map, 

32 uppercase_tags, 

33 valid_tags, 

34 xlat_descs_map, 

35 xlat_head_map, 

36 xlat_tags_map, 

37) 

38from ...topics import topic_generalize_map, valid_topics 

39from ...wxr_context import WiktextractContext 

40from .english_words import ( 

41 english_words, 

42 not_english_words, 

43 potentially_english_words, 

44) 

45from .form_descriptions_known_firsts import known_firsts 

46from .taxondata import known_species 

47from .type_utils import ( 

48 AltOf, 

49 FormData, 

50 LinkageData, 

51 SenseData, 

52 SoundData, 

53 TranslationData, 

54 WordData, 

55) 

56 

57# Tokenizer for classify_desc() 

58tokenizer = TweetTokenizer() 

59 

60# These are ignored as the value of a related form in form head. 

61IGNORED_RELATED: set[str] = set( 

62 [ 

63 "-", 

64 "־", 

65 "᠆", 

66 "‐", 

67 "‑", 

68 "‒", 

69 "–", 

70 "—", 

71 "―", 

72 "−", 

73 "⸺", 

74 "⸻", 

75 "﹘", 

76 "﹣", 

77 "-", 

78 "?", 

79 "(none)", 

80 ] 

81) 

82 

83 

84# First words of unicodedata.name() that indicate scripts that cannot be 

85# accepted in romanizations or english (i.e., should be considered "other" 

86# in classify_desc()). 

87non_latin_scripts: list[str] = [ 

88 "ADLAM", 

89 "ARABIC", 

90 "ARABIC-INDIC", 

91 "ARMENIAN", 

92 "BALINESE", 

93 "BENGALI", 

94 "BRAHMI", 

95 "BRAILLE", 

96 "CANADIAN", 

97 "CHAKMA", 

98 "CHAM", 

99 "CHEROKEE", 

100 "CJK", 

101 "COPTIC", 

102 "COUNTING ROD", 

103 "CUNEIFORM", 

104 "CYRILLIC", 

105 "DOUBLE-STRUCK", 

106 "EGYPTIAN", 

107 "ETHIOPIC", 

108 "EXTENDED ARABIC-INDIC", 

109 "GEORGIAN", 

110 "GLAGOLITIC", 

111 "GOTHIC", 

112 "GREEK", 

113 "GUJARATI", 

114 "GURMUKHI", 

115 "HANGUL", 

116 "HANIFI ROHINGYA", 

117 "HEBREW", 

118 "HIRAGANA", 

119 "JAVANESE", 

120 "KANNADA", 

121 "KATAKANA", 

122 "KAYAH LI", 

123 "KHMER", 

124 "KHUDAWADI", 

125 "LAO", 

126 "LEPCHA", 

127 "LIMBU", 

128 "MALAYALAM", 

129 "MEETEI", 

130 "MYANMAR", 

131 "NEW TAI LUE", 

132 "NKO", 

133 "OL CHIKI", 

134 "OLD PERSIAN", 

135 "OLD SOUTH ARABIAN", 

136 "ORIYA", 

137 "OSMANYA", 

138 "PHOENICIAN", 

139 "SAURASHTRA", 

140 "SHARADA", 

141 "SINHALA", 

142 "SUNDANESE", 

143 "SYLOTI", 

144 "TAI THAM", 

145 "TAKRI", 

146 "TAMIL", 

147 "TELUGU", 

148 "THAANA", 

149 "THAI", 

150 "TIBETAN", 

151 "TIFINAGH", 

152 "TIRHUTA", 

153 "UGARITIC", 

154 "WARANG CITI", 

155 "YI", 

156] 

157non_latin_scripts_re = re.compile( 

158 r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b" 

159) 

160 

161# Sanity check xlat_head_map values 

162for k, v in xlat_head_map.items(): 

163 if v.startswith("?"): 

164 v = v[1:] 

165 for tag in v.split(): 

166 if tag not in valid_tags:  [coverage: 166 ↛ 167, condition was never true]

167 print( 

168 "WARNING: xlat_head_map[{}] contains unrecognized tag {}".format( 

169 k, tag 

170 ) 

171 ) 

172 

173# Regexp for finding nested translations from translation items (these are 

174# used in, e.g., year/English/Translations/Arabic). This is actually used 

175# in page.py. 

176nested_translations_re = re.compile( 

177 r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format( 

178 "|".join( 

179 re.escape(x.removeprefix("?")) 

180 for x in sorted(xlat_head_map.values(), key=len, reverse=True) 

181 if x and not x.startswith("class-") 

182 ) 

183 ) 

184) 

185 

186# Regexp that matches head tag specifiers. Used to match tags from end of 

187# translations and linkages 

188head_final_re_text = r"( -)?( ({}))+".format( 

189 "|".join( 

190 re.escape(x) 

191 for x in 

192 # The sort is to put longer ones first, preferring them in 

193 # the regexp match 

194 sorted(xlat_head_map.keys(), key=len, reverse=True) 

195 ) 

196) 

197head_final_re = re.compile(head_final_re_text + "$") 

198 

199# Regexp used to match head tag specifiers at end of a form for certain 

200# Bantu languages (particularly Swahili and similar languages). 

201head_final_bantu_re_text = r" ({})".format( 

202 "|".join(re.escape(x) for x in head_final_bantu_map.keys()) 

203) 

204head_final_bantu_re = re.compile(head_final_bantu_re_text + "$") 

205 

206# Regexp used to match head tag specifiers at end of a form for certain 

207# Semitic languages (particularly Arabic and similar languages). 

208head_final_semitic_re_text = r" ({})".format( 

209 "|".join(re.escape(x) for x in head_final_semitic_map.keys()) 

210) 

211head_final_semitic_re = re.compile(head_final_semitic_re_text + "$") 

212 

213# Regexp used to match head tag specifiers at end of a form for certain 

214# other languages (e.g., Lithuanian, Finnish, French). 

215head_final_other_re_text = r" ({})".format( 

216 "|".join(re.escape(x) for x in head_final_other_map.keys()) 

217) 

218head_final_other_re = re.compile(head_final_other_re_text + "$") 

219 

220# Regexp for splitting heads. See parse_word_head(). 

221head_split_re_text = ( 

222 "(" 

223 + head_final_re_text 

224 + "|" 

225 + head_final_bantu_re_text 

226 + "|" 

227 + head_final_semitic_re_text 

228 + "|" 

229 + head_final_other_re_text 

230 + ")?( or |[,;]+)" 

231) 

232head_split_re = re.compile(head_split_re_text) 

233head_split_re_parens = 0 

234for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text): 

235 head_split_re_parens += m.group(0).count("(") 
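# Note (assumption, not stated in this excerpt): the loop above counts the
# unescaped "(" characters, i.e. the capturing groups, in head_split_re_text,
# presumably so that callers such as parse_word_head() know how many groups
# re.split() will insert between the split fields.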

236 

237# Parenthesized parts that are ignored in translations 

238tr_ignored_parens: set[str] = set( 

239 [ 

240 "please verify", 

241 "(please verify)", 

242 "transliteration needed", 

243 "(transliteration needed)", 

244 "in words with back vowel harmony", 

245 "(in words with back vowel harmony)", 

246 "in words with front vowel harmony", 

247 "(in words with front vowel harmony)", 

248 "see below", 

249 "see usage notes below", 

250 ] 

251) 

252tr_ignored_parens_re = re.compile( 

253 r"^(" 

254 + "|".join(re.escape(x) for x in tr_ignored_parens) 

255 + ")$" 

256 + r"|^(Can we clean up|Can we verify|for other meanings see " 

257 r"lit\. )" 

258) 

259 

260# Translations that are ignored 

261ignored_translations: set[str] = set( 

262 [ 

263 "[script needed]", 

264 "please add this translation if you can", 

265 ] 

266) 

267 

268# Put english text into the "note" field in a translation if it contains one 

269# of these words 

270tr_note_re = re.compile( 

271 r"(\b(article|definite|indefinite|superlative|comparative|pattern|" 

272 r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|" 

273 r"postposition|postp|action|actions|articles|" 

274 r"adverb|adverbs|noun|nouns|verb|verbs|before|" 

275 r"after|placed|prefix|suffix|used with|translated|" 

276 r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|" 

277 r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|" 

278 r"conjugation|declension|class|category|plural|singular|positive|" 

279 r"seldom used|formal|informal|familiar|unspoken|spoken|written|" 

280 r"indicative|progressive|conditional|potential|" 

281 r"accusative|adessive|inessive|superessive|elative|allative|" 

282 r"dialect|dialects|object|subject|predicate|movies|recommended|language|" 

283 r"locative|continuous|simple|continuousness|gerund|subjunctive|" 

284 r"periphrastically|no equivalent|not used|not always used|" 

285 r"used only with|not applicable|use the|signifying|wordplay|pronounced|" 

286 r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|" 

287 r"may be replaced|stricter sense|for nonhumans|" 

288 r"sense:|used:|in full:|informally used|followed by|" 

289 r"not restricted to|pertaining to|or optionally with|are optional|" 

290 r"in conjunction with|in compounds|depending on the relationship|" 

291 r"person addressed|one person|multiple persons|may be replaced with|" 

292 r"optionally completed with|in the phrase|in response to|" 

293 r"before a|before an|preceded by|verbs ending|very common|after a verb|" 

294 r"with verb|with uncountable|with the objects|with stative|" 

295 r"can be replaced by|often after|used before|used after|" 

296 r"used in|clipping of|spoken|somewhat|capitalized|" 

297 r"short form|shortening of|shortened form|initialism of|" 

298 r"said to|rare:|rarer also|is rarer|negatively connoted|" 

299 r"previously mentioned|uncountable noun|countable noun|" 

300 r"countable nouns|uncountable nouns|" 

301 r"with predicative|with -|with imperfect|with a negated|" 

302 r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|" 

303 r'"|' 

304 r"general term|after a vowel|before a vowel|" 

305 r"form|regular|irregular|alternative)" 

306 r")($|[) ])|^(" 

307 # Following are only matched at the beginning of the string 

308 r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:|e\.g\.,|cf\.|compare|such as|" 

309 r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|" 

310 r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|" 

311 r"mainly|from|for|also|also:|acronym|" 

312 r"\+|with) " 

313) 

314# \b does not work at the end??? 

315 

316# Related forms matching this regexp will be considered suspicious if the 

317# page title does not also match one of these. 

318suspicious_related_re = re.compile( 

319 r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)" 

320 r"|[][:=<>&#*|]" 

321 r"| \d+$" 

322) 

323 

324# Word forms (head forms, translations, etc) that will be considered ok and 

325# silently accepted even if they would otherwise trigger a suspicious 

326# form warning. 

327ok_suspicious_forms: set[str] = set( 

328 [ 

329 "but en or", # "golden goal"/English/Tr/French 

330 "cœur en or", # "heart of gold"/Eng/Tr/French 

331 "en or", # golden/Eng/Tr/French 

332 "men du", # jet/Etym2/Noun/Tr/Cornish 

333 "parachute en or", # "golden parachute"/Eng/Tr/French 

334 "vieil or", # "old gold"/Eng/Tr/French 

335 # "all that glitters is not gold"/Eng/Tr/French 

336 "tout ce qui brille n’est pas or", 

337 "μη αποκλειστικό or", # inclusive or/Eng/Tr/Greek 

338 "period or full stop", 

339 ] 

340) 

341 

342 

343# Replacements to be done in classify_desc before tokenizing. This is a 

344# workaround for shortcomings in TweetTokenizer. 

345tokenizer_fixup_map = { 

346 r"a.m.": "AM", 

347 r"p.m.": "PM", 

348} 

349tokenizer_fixup_re = re.compile( 

350 r"\b(" 

351 + "|".join( 

352 re.escape(x) 

353 for x in sorted( 

354 tokenizer_fixup_map.keys(), key=lambda x: len(x), reverse=True 

355 ) 

356 ) 

357 + r")" 

358) 

359 
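# Illustrative sketch (not part of the original module): the fixup map is
# presumably applied with re.sub() before tokenizing, so that TweetTokenizer
# does not split "a.m."/"p.m." apart; the exact call site in classify_desc()
# is outside this excerpt.
#
#   fixed = re.sub(
#       tokenizer_fixup_re,
#       lambda m: tokenizer_fixup_map[m.group(1)],
#       "at 3 p.m. sharp",
#   )
#   # fixed == "at 3 PM sharp"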

360# Unknown tags starting with these words will be silently ignored. 

361ignored_unknown_starts: set[str] = set( 

362 [ 

363 "originally", 

364 "e.g.", 

365 "c.f.", 

366 "supplanted by", 

367 "supplied by", 

368 ] 

369) 

370 

371ignored_unknown_starts_re = re.compile( 

372 r"^(" 

373 + "|".join( 

374 re.escape(x) 

375 for x in sorted(ignored_unknown_starts, key=lambda x: -len(x)) 

376 ) 

377 + ") " 

378) 

379 

380# If an unknown sequence starts with one of these, it will continue as an 

381# unknown sequence until the end, unless it turns out to have a replacement. 

382allowed_unknown_starts: set[str] = set( 

383 [ 

384 "Relating", 

385 "accompanied", 

386 "added", 

387 "after", 

388 "answering", 

389 "as", 

390 "based", 

391 "before", 

392 "conjugated", 

393 "conjunction", 

394 "construed", 

395 "especially", 

396 "expression:", 

397 "figurative:", 

398 "followed", 

399 "for", 

400 "forms", 

401 "from", 

402 "governs", 

403 "in", 

404 "indicating", 

405 "modifying", 

406 "normally", 

407 "not", 

408 "of", 

409 "preceding", 

410 "prefixed", 

411 "referring", 

412 "relating", 

413 "revived", 

414 "said", 

415 "since", 

416 "takes", 

417 "used", 

418 "with", 

419 "With", 

420 "without", 

421 ] 

422) 

423# Allow the ignored unknown starts without complaining 

424allowed_unknown_starts.update(ignored_unknown_starts) 

425 

426# Full unknown tags that will be ignored in decode_tags() 

427# XXX this is unused, ask Tatu where the contents are now

428ignored_unknown_tags: set[str] = set([]) 

429 

430# Head endings that are mapped to tags 

431head_end_map = { 

432 " 1st conj.": "conjugation-1", 

433 " 2nd conj.": "conjugation-2", 

434 " 3rd conj.": "conjugation-3", 

435 " 4th conj.": "conjugation-4", 

436 " 5th conj.": "conjugation-5", 

437 " 6th conj.": "conjugation-6", 

438 " 7th conj.": "conjugation-7", 

439} 

440head_end_re = re.compile( 

441 r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$" 

442) 

443 
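# Illustrative sketch (not part of the original module): head_end_re matches
# one of the head_end_map keys, leading space included, at the end of a head
# string; how the match is consumed happens elsewhere (presumably in
# parse_word_head(), outside this excerpt).
#
#   m = re.search(head_end_re, "amare 1st conj.")
#   # m.group(1) == " 1st conj."
#   # head_end_map[m.group(1)] == "conjugation-1"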

444 

445# Dictionary of language-specific parenthesized head part starts that 

446# either introduce new tags or modify previous tags. The value for each 

447# language is a dictionary that maps the first word of the head part to 

448# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous 

449# tags or a space-separated string of tags to remove, and ``add_tags`` should 

450# be a string of tags to add. 

451lang_specific_head_map: dict[ 

452 str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]] 

453] = { 

454 "Danish": { 

455 # prefix: (rem_tags space separate string/True, add_tags s-sep str) 

456 "c": ("neuter", "common-gender"), 

457 "n": ("common-gender", "neuter"), 

458 "pl": ("singular neuter common-gender", "plural"), 

459 "sg": ("plural neuter common-gender", "singular"), 

460 }, 

461} 

462 

463 

464# Regular expression used to strip additional stuff from the end of alt_of and 

465# form_of. 

466alt_of_form_of_clean_re = re.compile( 

467 r"(?s)(" 

468 + "|".join( 

469 [ 

470 r":", 

471 r'[“"]', 

472 r";", 

473 r" \(", 

474 r" - ", 

475 r" ־ ", 

476 r" ᠆ ", 

477 r" ‐ ", 

478 r" ‑ ", 

479 r" ‒ ", 

480 r" – ", 

481 r" — ", 

482 r" ― ", 

483 r" − ", 

484 r" ⸺ ", 

485 r" ⸻ ", 

486 r" ﹘ ", 

487 r" ﹣ ", 

488 r" - ", 

489 r" \+ ", 

490 r" \(with ", 

491 r" with -ra/-re", 

492 r"\. Used ", 

493 r"\. Also ", 

494 r"\. Since ", 

495 r"\. A ", 

496 r"\.\. A ", 

497 r"\. An ", 

498 r"\.\. An ", 

499 r"\. an ", 

500 r"\. The ", 

501 r"\. Spanish ", 

502 r"\. Language ", 

503 r"\. former name of ", 

504 r"\. AIM", 

505 r"\. OT", 

506 r"\. Not ", 

507 r"\. Now ", 

508 r"\. Nowadays ", 

509 r"\. Early ", 

510 r"\. ASEAN", 

511 r"\. UN", 

512 r"\. IMF", 

513 r"\. WHO", 

514 r"\. WIPO", 

515 r"\. AC", 

516 r"\. DC", 

517 r"\. DNA", 

518 r"\. RNA", 

519 r"\. SOB", 

520 r"\. IMO", 

521 r"\. Behavior", 

522 r"\. Income ", 

523 r"\. More ", 

524 r"\. Most ", 

525 r"\. Only ", 

526 r"\. Also ", 

527 r"\. From ", 

528 r"\. Of ", 

529 r"\.\. Of ", 

530 r"\. To ", 

531 r"\. For ", 

532 r"\. If ", 

533 r"\. Praenominal ", 

534 r"\. This ", 

535 r"\. Replaced ", 

536 r"\. CHCS is the ", 

537 r"\. Equivalent ", 

538 r"\. Initialism ", 

539 r"\. Note ", 

540 r"\. Alternative ", 

541 r"\. Compare ", 

542 r"\. Cf\. ", 

543 r"\. Comparable ", 

544 r"\. Involves ", 

545 r"\. Sometimes ", 

546 r"\. Commonly ", 

547 r"\. Often ", 

548 r"\. Typically ", 

549 r"\. Possibly ", 

550 r"\. Although ", 

551 r"\. Rare ", 

552 r"\. Instead ", 

553 r"\. Integrated ", 

554 r"\. Distinguished ", 

555 r"\. Given ", 

556 r"\. Found ", 

557 r"\. Was ", 

558 r"\. In ", 

559 r"\. It ", 

560 r"\.\. It ", 

561 r"\. One ", 

562 r"\. Any ", 

563 r"\. They ", 

564 r"\. Members ", 

565 r"\. Each ", 

566 r"\. Original ", 

567 r"\. Especially ", 

568 r"\. Usually ", 

569 r"\. Known ", 

570 r"\.\. Known ", 

571 r"\. See ", 

572 r"\. see ", 

573 r"\. target was not ", 

574 r"\. Popular ", 

575 r"\. Pedantic ", 

576 r"\. Positive ", 

577 r"\. Society ", 

578 r"\. Plan ", 

579 r"\. Environmentally ", 

580 r"\. Affording ", 

581 r"\. Encompasses ", 

582 r"\. Expresses ", 

583 r"\. Indicates ", 

584 r"\. Text ", 

585 r"\. Large ", 

586 r"\. Sub-sorting ", 

587 r"\. Sax", 

588 r"\. First-person ", 

589 r"\. Second-person ", 

590 r"\. Third-person ", 

591 r"\. 1st ", 

592 r"\. 2nd ", 

593 r"\. 3rd ", 

594 r"\. Term ", 

595 r"\. Northeastern ", 

596 r"\. Northwestern ", 

597 r"\. Southeast ", 

598 r"\. Egyptian ", 

599 r"\. English ", 

600 r"\. Cape Province was split into ", 

601 r"\. Pañcat", 

602 r"\. of the ", 

603 r"\. is ", 

604 r"\. after ", 

605 r"\. or ", 

606 r"\. chromed", 

607 r"\. percussion", 

608 r"\. with his ", 

609 r"\. a\.k\.a\. ", 

610 r"\. comparative form ", 

611 r"\. singular ", 

612 r"\. plural ", 

613 r"\. present ", 

614 r"\. his ", 

615 r"\. her ", 

616 r"\. equivalent ", 

617 r"\. measuring ", 

618 r"\. used in ", 

619 r"\. cutely ", 

620 r"\. Protects", 

621 r'\. "', 

622 r"\.^", 

623 r"\. \+ ", 

624 r"\., ", 

625 r". — ", 

626 r", a ", 

627 r", an ", 

628 r", the ", 

629 r", obsolete ", 

630 r", possessed", # 'd/English 

631 r", imitating", # 1/English 

632 r", derived from", 

633 r", called ", 

634 r", especially ", 

635 r", slang for ", 

636 r" corresponding to ", 

637 r" equivalent to ", 

638 r" popularized by ", 

639 r" denoting ", 

640 r" in its various senses\.", 

641 r" used by ", 

642 r" but not for ", 

643 r" since ", 

644 r" i\.e\. ", 

645 r" i\. e\. ", 

646 r" e\.g\. ", 

647 r" eg\. ", 

648 r" etc\. ", 

649 r"\[http", 

650 r" — used as ", 

651 r" by K\. Forsyth ", 

652 r" by J\. R\. Allen ", 

653 r" by S\. Ferguson ", 

654 r" by G\. Donaldson ", 

655 r" May refer to ", 

656 r" An area or region ", 

657 ] 

658 ) 

659 + r").*$" 

660) 

661 

662 

663class ValidNode: 

664 """Node in the valid_sequences tree. Each node is part of a chain 

665 or chains that form sequences built out of keys in key->tags 

666 maps like xlat_tags, etc. The ValidNode's 'word' is the key 

667 by which it is referred to in the root dict or a `children` dict, 

668 `end` marks that the node is the end-terminus of a sequence (but 

669 it can still continue if the sequence is shared by the start of 

670 other sequences: "nominative$" and "nominative plural$" for example), 

671 `tags` and `topics` are the dicts containing tag and topic strings 

672 for terminal nodes (end==True).""" 

673 

674 __slots__ = ( 

675 "end", 

676 "tags", 

677 "topics", 

678 "children", 

679 ) 

680 

681 def __init__( 

682 self, 

683 end=False, 

684 tags: Optional[list[str]] = None, 

685 topics: Optional[list[str]] = None, 

686 children: Optional[dict[str, "ValidNode"]] = None, 

687 ) -> None: 

688 self.end = end 

689 self.tags: list[str] = tags or [] 

690 self.topics: list[str] = topics or [] 

691 self.children: dict[str, "ValidNode"] = children or {} 

692 

693 

694def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None: 

695 """Helper function for building trees of valid tags/sequences during 

696 initialization.""" 

697 assert isinstance(tree, ValidNode) 

698 assert isinstance(desc, str) 

699 assert v is None or isinstance(v, str) 

700 node = tree 

701 

702 # Build the tree structure: each node has children nodes 

703 # whose names are denoted by their dict key. 

704 for w in desc.split(" "): 

705 if w in node.children: 

706 node = node.children[w] 

707 else: 

708 new_node = ValidNode() 

709 node.children[w] = new_node 

710 node = new_node 

711 if not node.end: 

712 node.end = True 

713 if not v: 

714 return None # Terminate early because there are no tags 

715 

716 tagslist = [] 

717 topicslist = [] 

718 for vv in v.split(): 

719 if vv in valid_tags: 

720 tagslist.append(vv) 

721 elif vv in valid_topics:  [coverage: 721 ↛ 724, condition was always true]

722 topicslist.append(vv) 

723 else: 

724 print( 

725 "WARNING: tag/topic {!r} maps to unknown {!r}".format(desc, vv) 

726 ) 

727 topics = " ".join(topicslist) 

728 tags = " ".join(tagslist) 

729 # Changed to "_tags" and "_topics" to avoid possible key-collisions. 

730 if topics: 

731 node.topics.extend([topics]) 

732 if tags: 

733 node.tags.extend([tags]) 

734 
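# Illustrative sketch (not part of the original module): each space-separated
# key becomes a chain of ValidNode children, and the value string is attached
# to the terminal node.  Assuming "nominative" and "plural" are in valid_tags:
#
#   tree = ValidNode()
#   add_to_valid_tree(tree, "nominative plural", "nominative plural")
#   node = tree.children["nominative"].children["plural"]
#   # node.end is True and node.tags == ["nominative plural"]
#   # (the whole value is stored as a single space-joined string)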

735 

736def add_to_valid_tree1( 

737 tree: ValidNode, 

738 k: str, 

739 v: Union[list[str], tuple[str, ...], str], 

740 valid_values: Union[set[str], dict[str, Any]], 

741) -> list[str]: 

742 assert isinstance(tree, ValidNode) 

743 assert isinstance(k, str) 

744 assert v is None or isinstance(v, (list, tuple, str)) 

745 assert isinstance(valid_values, (set, dict)) 

746 if not v:  [coverage: 746 ↛ 747, condition was never true]

747 add_to_valid_tree(valid_sequences, k, None) 

748 return [] 

749 elif isinstance(v, str): 

750 v = [v] 

751 q = [] 

752 for vv in v: 

753 assert isinstance(vv, str) 

754 add_to_valid_tree(valid_sequences, k, vv) 

755 vvs = vv.split() 

756 for x in vvs: 

757 q.append(x) 

758 # return each individual tag 

759 return q 

760 

761 

762def add_to_valid_tree_mapping( 

763 tree: ValidNode, 

764 mapping: Union[dict[str, Union[list[str], str]], dict[str, str]], 

765 valid_values: Union[set[str], dict[str, Any]], 

766 recurse: bool, 

767) -> None: 

768 assert isinstance(tree, ValidNode) 

769 assert isinstance(mapping, dict) 

770 assert isinstance(valid_values, (set, dict)) 

771 assert recurse in (True, False) 

772 for k, v in mapping.items(): 

773 assert isinstance(k, str) 

774 assert isinstance(v, (list, str)) 

775 if isinstance(v, str): 

776 q = add_to_valid_tree1(tree, k, [v], valid_values) 

777 else: 

778 q = add_to_valid_tree1(tree, k, v, valid_values) 

779 if recurse: 

780 visited = set() 

781 while q: 

782 v = q.pop() 

783 if v in visited: 

784 continue 

785 visited.add(v) 

786 if v not in mapping: 

787 continue 

788 vv = mapping[v] 

789 qq = add_to_valid_tree1(tree, k, vv, valid_values) 

790 q.extend(qq) 

791 

792 

793# Tree of sequences considered to be tags (includes sequences that are 

794# mapped to something that becomes one or more valid tags) 

795valid_sequences = ValidNode() 

796sequences_with_slashes: set[str] = set() 

797for tag in valid_tags: 

798 # The basic tags used in our tag system; some are a bit weird, but it is

799 # easier to implement this with a few 'false' positives than to filter out

800 # stuff no one else uses.

801 if "/" in tag: 

802 sequences_with_slashes.add(tag) 

803 add_to_valid_tree(valid_sequences, tag, tag) 

804for tag in uppercase_tags: 

805 hyphenated = re.sub(r"\s+", "-", tag) 

806 if "/" in tag: 

807 sequences_with_slashes.add(tag) 

808 add_to_valid_tree(valid_sequences, tag, hyphenated) 

809 

810# xlat_tags_map! 

811add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False) 

812for k in xlat_tags_map: 

813 if "/" in k: 

814 sequences_with_slashes.add(k) 

815# Add topics to the same table, with all generalized topics also added 

816for topic in valid_topics: 

817 assert " " not in topic 

818 if "/" in topic: 818 ↛ 819line 818 didn't jump to line 819 because the condition on line 818 was never true

819 sequences_with_slashes.add(topic) 

820 add_to_valid_tree(valid_sequences, topic, topic) 

821# Let each original topic value stand alone. These are not generally on 

822# valid_topics. We add the original topics with spaces replaced by hyphens. 

823for topic in topic_generalize_map.keys(): 

824 hyphenated = re.sub(r"\s+", "-", topic) 

825 if "/" in topic: 825 ↛ 826line 825 didn't jump to line 826 because the condition on line 825 was never true

826 sequences_with_slashes.add(topic) 

827 add_to_valid_tree(valid_sequences, topic, hyphenated) 

828# Add canonicalized/generalized topic values 

829add_to_valid_tree_mapping( 

830 valid_sequences, topic_generalize_map, valid_topics, True 

831) 

832 

833# Regex used to divide a decode candidate into parts that shouldn't 

834# have their slashes turned into spaces 

835slashes_re = re.compile( 

836 r"(" + "|".join((re.escape(s) for s in sequences_with_slashes)) + r")" 

837) 

838 

839# Regexp used to find "words" from word heads and linguistic descriptions 

840word_pattern = ( 

841 r"[^ ,;()\u200e]+|" 

842 r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|" 

843 r"[\u2800-\u28ff]|" # Braille characters 

844 r"\(([^()]|\([^()]*\))*\)" 

845) 

846 

847word_re_global = re.compile(word_pattern) 

848 
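# Illustrative sketch (not part of the original module): word_pattern picks up
# plain words, single Braille characters, and whole parenthesized groups as
# single "words".  Because the last alternative contains a capturing group,
# finditer() with m.group(0) is clearer than findall():
#
#   [m.group(0) for m in re.finditer(word_pattern, "foo bar (baz qux)")]
#   # -> ['foo', 'bar', '(baz qux)']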

849 

850def distw(titleparts: Sequence[str], word: str) -> float: 

851 """Computes how distinct ``word`` is from the most similar word in 

852 ``titleparts``. Returns 1 if words completely distinct, 0 if 

853 identical, or otherwise something in between.""" 

854 assert isinstance(titleparts, (list, tuple)) 

855 assert isinstance(word, str) 

856 w = min( 

857 Levenshtein.distance(word, tw) / max(len(tw), len(word)) 

858 for tw in titleparts 

859 ) 

860 return w 

861 
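# Illustrative sketch (not part of the original module): distw() is a
# Levenshtein distance normalized by the longer of the two lengths, minimized
# over the title parts:
#
#   distw(["example"], "example")             # -> 0.0 (identical)
#   distw(["abc"], "xyz")                     # -> 1.0 (completely distinct)
#   distw(["walrus", "walruses"], "walrus")   # -> 0.0 (closest part matches)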

862 

863def map_with( 

864 ht: dict[str, str | list[str]] | dict[str, str], 

865 lst: Sequence[str], 

866) -> list[str]: 

867 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or 

868 more alternatives each, and returns a combined list of alternatives.""" 

869 assert isinstance(ht, dict) 

870 assert isinstance(lst, (list, tuple)) 

871 ret = [] 

872 for x in lst: 

873 assert isinstance(x, str) 

874 x = x.strip() 

875 x = ht.get(x, x) 

876 if isinstance(x, str):  [coverage: 876 ↛ 879, condition was always true]

877 if x:  [coverage: 877 ↛ 872, condition was always true]

878 ret.append(x) 

879 elif isinstance(x, (list, tuple)): 

880 ret.extend(x) 

881 else: 

882 raise RuntimeError("map_with unexpected value: {!r}".format(x)) 

883 return ret 

884 
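# Illustrative sketch (not part of the original module): map_with() strips
# each alternative and looks it up in the mapping; string values stay single
# items, list/tuple values are spliced in, unmapped items are kept as-is:
#
#   map_with({"a": ["x", "y"]}, ["a", " b "])
#   # -> ['x', 'y', 'b']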

885 

886TagList = list[str] 

887PosPathStep = tuple[int, TagList, TagList] 

888 

889 

890def check_unknown( 

891 from_i: int, 

892 to_i: int, 

893 i: int, 

894 wordlst: Sequence[str], 

895 allow_any: bool, 

896 no_unknown_starts: bool, 

897) -> list[PosPathStep]: 

898 """Check if the current section from_i->to_i is actually unknown 

899 or if it needs some special handling. We already presupposed that 

900 this is UNKNOWN; this is just called to see what *kind* of UNKNOWN.""" 

901 assert isinstance(to_i, int) 

902 assert isinstance(from_i, int) 

903 assert isinstance(i, int) 

904 # Adds unknown tag if needed. Returns new last_i 

905 # print("check_unknown to_i={} from_i={} i={}" 

906 # .format(to_i, from_i, i)) 

907 if from_i >= to_i: 

908 return [] 

909 words = wordlst[from_i:to_i] 

910 tag = " ".join(words) 

911 assert tag 

912 # print(f"{tag=}") 

913 if re.match(ignored_unknown_starts_re, tag): 

914 # Tags with this start are to be ignored 

915 return [(from_i, ["UNKNOWN"], [])] 

916 if tag in ignored_unknown_tags:  [coverage: 916 ↛ 917, condition was never true]

917 return [] # One of the tags listed as to be ignored 

918 if tag in ("and", "or"): 

919 return [] 

920 if ( 

921 not allow_any 

922 and not words[0].startswith("~") 

923 and ( 

924 no_unknown_starts 

925 or words[0] not in allowed_unknown_starts 

926 or len(words) <= 1 

927 ) 

928 ): 

929 # print("ERR allow_any={} words={}" 

930 # .format(allow_any, words)) 

931 return [ 

932 (from_i, ["UNKNOWN"], ["error-unknown-tag"]) 

933 ] # Add ``tag`` here to include 

934 else: 

935 return [(from_i, ["UNKNOWN"], [tag])] 

936 
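# Illustrative sketch (not part of the original module): check_unknown()
# decides how an unmatched stretch of words is reported.  With the tables
# defined above ("totally" matches neither ignored_unknown_starts_re nor
# allowed_unknown_starts):
#
#   check_unknown(0, 2, 2, ["totally", "unknownish"],
#                 allow_any=False, no_unknown_starts=False)
#   # -> [(0, ['UNKNOWN'], ['error-unknown-tag'])]
#
#   check_unknown(0, 2, 2, ["totally", "unknownish"],
#                 allow_any=True, no_unknown_starts=False)
#   # -> [(0, ['UNKNOWN'], ['totally unknownish'])]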

937 

938def add_new1( 

939 node: ValidNode, 

940 i: int, 

941 start_i: int, 

942 last_i: int, 

943 new_paths: list[list[PosPathStep]], 

944 new_nodes: list[tuple[ValidNode, int, int]], 

945 pos_paths: list[list[list[PosPathStep]]], 

946 wordlst: list[str], 

947 allow_any: bool, 

948 no_unknown_starts: bool, 

949 max_last_i: int, 

950) -> int: 

951 assert isinstance(new_paths, list) 

952 # print("add_new: start_i={} last_i={}".format(start_i, last_i)) 

953 # print("$ {} last_i={} start_i={}" 

954 # .format(w, last_i, start_i)) 

955 max_last_i = max(max_last_i, last_i) # if last_i has grown 

956 if (node, start_i, last_i) not in new_nodes: 

957 new_nodes.append((node, start_i, last_i)) 

958 if node.end: 

959 # We can see a terminal point in the search tree. 

960 u = check_unknown( 

961 last_i, start_i, i, wordlst, allow_any, no_unknown_starts 

962 ) 

963 # Create new paths candidates based on different past possible 

964 # paths; pos_path[last_i] contains possible paths, so add this 

965 # new one at the beginning(?) 

966 # The list comprehension inside the parens generates an iterable 

967 # of lists, so this is .extend( [(last_i...)], [(last_i...)], ... ) 

968 # XXX: this is becoming impossible to annotate, nodes might 

969 # need to become classed objects and not just dicts, or at least 

970 # a TypedDict with a "children" node 

971 new_paths.extend( 

972 [(last_i, node.tags, node.topics)] + u + x 

973 for x in pos_paths[last_i] 

974 ) 

975 max_last_i = i + 1 

976 return max_last_i 

977 

978 

979@functools.lru_cache(maxsize=65536) 

980def decode_tags( 

981 src: str, 

982 allow_any=False, 

983 no_unknown_starts=False, 

984) -> tuple[list[tuple[str, ...]], list[str]]: 

985 tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts) 

986 # print(f"decode_tags: {src=}, {tagsets=}") 

987 

988 # Insert retry-code here that modifies the text source 

989 if ( 

990 any(s.startswith("error-") for tagset in tagsets for s in tagset) 

991 # I hate Python's *nested* list comprehension syntax ^ 

992 or any(s.startswith("error-") for s in topics) 

993 ): 

994 new_tagsets: list[tuple[str, ...]] = [] 

995 new_topics: list[str] = [] 

996 

997 if "/" in src: 

998 # slashes_re contains valid key entries with slashes; we're going 

999 # to skip them by splitting the string and skipping handling every 

1000 # second entry, which contains the splitting group like "masculine/ 

1001 # feminine" style keys. 

1002 split_parts = re.split(slashes_re, src) 

1003 new_parts: list[str] = [] 

1004 if len(split_parts) > 1: 

1005 for i, s in enumerate(split_parts): 

1006 if i % 2 == 0: 

1007 new_parts.append(s.replace("/", " ")) 

1008 else: 

1009 new_parts.append(s) 

1010 new_src = "".join(new_parts) 

1011 else: 

1012 new_src = src 

1013 new_tagsets, new_topics = decode_tags1( 

1014 new_src, allow_any, no_unknown_starts 

1015 ) 

1016 elif " or " in src or " and " in src: 

1017 # Annoying kludge. 

1018 new_src = src.replace(" and ", " ") 

1019 new_src = new_src.replace(" or ", " ") 

1020 new_tagsets, new_topics = decode_tags1( 

1021 new_src, allow_any, no_unknown_starts 

1022 ) 

1023 # print(f"{new_tagsets=}") 

1024 

1025 if new_tagsets or new_topics: 

1026 old_errors = sum( 

1027 1 for tagset in tagsets for s in tagset if s.startswith("error") 

1028 ) 

1029 old_errors += sum(1 for s in topics if s.startswith("error")) 

1030 new_errors = sum( 

1031 1 

1032 for new_tagset in new_tagsets 

1033 for s in new_tagset 

1034 if s.startswith("error") 

1035 ) 

1036 new_errors += sum(1 for s in new_topics if s.startswith("error")) 

1037 

1038 if new_errors <= old_errors:  [coverage: 1038 ↛ 1041, condition was always true]

1039 return new_tagsets, new_topics 

1040 

1041 return tagsets, topics 

1042 
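# Illustrative sketch (not part of the original module): decode_tags() maps a
# free-form description to canonical tag sets plus topics.  The exact output
# depends on the tag tables imported above; with the standard tables something
# like the following is expected:
#
#   decode_tags("first-person singular present")
#   # -> ([('first-person', 'present', 'singular')], [])
#
#   decode_tags("totally unknownish")
#   # -> ([('error-unknown-tag',)], [])   # unmatched text becomes an error tag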

1043 

1044def decode_tags1( 

1045 src: str, 

1046 allow_any=False, 

1047 no_unknown_starts=False, 

1048) -> tuple[list[tuple[str, ...]], list[str]]: 

1049 """Decodes tags, doing some canonicalizations. This returns a list of 

1050 lists of tags and a list of topics.""" 

1051 assert isinstance(src, str) 

1052 

1053 # print("decode_tags: src={!r}".format(src)) 

1054 

1055 pos_paths: list[list[list[PosPathStep]]] = [[[]]] 

1056 wordlst: list[str] = [] 

1057 max_last_i = 0 # pre-initialized here so that it can be used as a ref 

1058 

1059 add_new = functools.partial( 

1060 add_new1, # pre-set parameters and references for function 

1061 pos_paths=pos_paths, 

1062 wordlst=wordlst, 

1063 allow_any=allow_any, 

1064 no_unknown_starts=no_unknown_starts, 

1065 max_last_i=max_last_i, 

1066 ) 

1067 # First split the tags at commas and semicolons. Their significance is that 

1068 # a multi-word sequence cannot continue across them. 

1069 parts = split_at_comma_semi(src, extra=[";", ":"]) 

1070 

1071 for part in parts: 

1072 max_last_i = len(wordlst) # "how far have we gone?" 

1073 lst1 = part.split() 

1074 if not lst1: 

1075 continue 

1076 wordlst.extend(lst1) 

1077 cur_nodes: list[tuple[ValidNode, int, int]] = [] # Currently seen 

1078 for w in lst1: 

1079 i = len(pos_paths) - 1 

1080 new_nodes: list[tuple[ValidNode, int, int]] = [] 

1081 # replacement nodes for next loop 

1082 new_paths: list[list[PosPathStep]] = [] 

1083 # print("ITER i={} w={} max_last_i={} wordlst={}" 

1084 # .format(i, w, max_last_i, wordlst)) 

1085 node: ValidNode 

1086 start_i: int 

1087 last_i: int 

1088 for node, start_i, last_i in cur_nodes: 

1089 # ValidNodes are part of a search tree that checks if a 

1090 # phrase is found in xlat_tags_map and other text->tags dicts. 

1091 if w in node.children: 

1092 # the phrase continues down the tree 

1093 # print("INC", w) 

1094 max_last_i = add_new( 

1095 node.children[w], 

1096 i, 

1097 start_i, 

1098 last_i, 

1099 new_paths, 

1100 new_nodes, 

1101 ) 

1102 if node.end: 

1103 # we've hit an end point, the tags and topics have already 

1104 # been gathered at some point, don't do anything with the 

1105 # old stuff 

1106 if w in valid_sequences.children: 

1107 # This starts a *new* possible section 

1108 max_last_i = add_new( 

1109 valid_sequences.children[w], # root-> 

1110 i, 

1111 i, 

1112 i, 

1113 new_paths, 

1114 new_nodes, 

1115 ) 

1116 if w not in node.children and not node.end: 

1117 # print("w not in node and $: i={} last_i={} wordlst={}" 

1118 # .format(i, last_i, wordlst)) 

1119 # If i == last_i == 0, for example (beginning) 

1120 if ( 

1121 i == last_i 

1122 or no_unknown_starts 

1123 or wordlst[last_i] not in allowed_unknown_starts 

1124 ): 

1125 # print("NEW", w) 

1126 if w in valid_sequences.children: 

1127 # Start new sequences here 

1128 max_last_i = add_new( 

1129 valid_sequences.children[w], 

1130 i, 

1131 i, 

1132 last_i, 

1133 new_paths, 

1134 new_nodes, 

1135 ) 

1136 if not new_nodes: 

1137 # This is run at the start when i == max_last_i == 0, 

1138 # which is what populates the first node in new_nodes. 

1139 # Some initial words cause the rest to be interpreted as unknown 

1140 # print("not new nodes: i={} last_i={} wordlst={}" 

1141 # .format(i, max_last_i, wordlst)) 

1142 if ( 

1143 i == max_last_i 

1144 or no_unknown_starts 

1145 or wordlst[max_last_i] not in allowed_unknown_starts 

1146 ): 

1147 # print("RECOVER w={} i={} max_last_i={} wordlst={}" 

1148 # .format(w, i, max_last_i, wordlst)) 

1149 if w in valid_sequences.children: 

1150 max_last_i = add_new( 

1151 # new sequence from root 

1152 valid_sequences.children[w], 

1153 i, 

1154 i, 

1155 max_last_i, 

1156 new_paths, 

1157 new_nodes, 

1158 ) 

1159 cur_nodes = new_nodes # Completely replace nodes! 

1160 # 2023-08-18, fix to improve performance 

1161 # Decode tags does a big search of the best-shortest matching 

1162 # sequences of tags, but the original algorithm didn't have 

1163 # any culling happen during operation, so in a case with 

1164 # a lot of tags (for example, big blocks of text inserted 

1165 # somewhere by mistake that is processed by decode_tags), 

1166 # it would lead to exponential growth of new_paths contents. 

1167 # This culling, using the same weighting algorithm code as 

1168 # in the original is just applied to new_paths before it is 

1169 # added to pos_paths. Basically it's "take the 10 best paths". 

1170 # This *can* cause bugs if it gets stuck in a local minimum 

1171 # or something, but this whole process is one-dimensional 

1172 # and not that complex, so hopefully it works out... 

1173 pw = [] 

1174 path: list[PosPathStep] 

1175 for path in new_paths: 

1176 weight = len(path) 

1177 if any(x[1] == ["UNKNOWN"] for x in path): 

1178 weight += 100 # Penalize unknown paths 

1179 pw.append((weight, path)) 

1180 new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]] 

1181 pos_paths.append(new_paths) 

1182 

1183 # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}" 

1184 # .format(max_last_i, len(wordlst), len(pos_paths))) 

1185 

1186 if cur_nodes: 

1187 # print("END HAVE_NODES") 

1188 for node, start_i, last_i in cur_nodes: 

1189 if node.end: 

1190 # print("$ END start_i={} last_i={}" 

1191 # .format(start_i, last_i)) 

1192 for path in pos_paths[start_i]: 

1193 pos_paths[-1].append( 

1194 [(last_i, node.tags, node.topics)] + path 

1195 ) 

1196 else: 

1197 # print("UNK END start_i={} last_i={} wordlst={}" 

1198 # .format(start_i, last_i, wordlst)) 

1199 u = check_unknown( 

1200 last_i, 

1201 len(wordlst), 

1202 len(wordlst), 

1203 wordlst, 

1204 allow_any, 

1205 no_unknown_starts, 

1206 ) 

1207 if pos_paths[start_i]: 

1208 for path in pos_paths[start_i]: 

1209 pos_paths[-1].append(u + path) 

1210 else: 

1211 pos_paths[-1].append(u) 

1212 else: 

1213 # Check for a final unknown tag 

1214 # print("NO END NODES max_last_i={}".format(max_last_i)) 

1215 paths = pos_paths[max_last_i] or [[]] 

1216 u = check_unknown( 

1217 max_last_i, 

1218 len(wordlst), 

1219 len(wordlst), 

1220 wordlst, 

1221 allow_any, 

1222 no_unknown_starts, 

1223 ) 

1224 if u: 

1225 # print("end max_last_i={}".format(max_last_i)) 

1226 for path in list(paths): # Copy in case it is the last pos 

1227 pos_paths[-1].append(u + path) 

1228 

1229 # import json 

1230 # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True)) 

1231 

1232 if not pos_paths[-1]: 

1233 # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src)) 

1234 return [], [] 

1235 

1236 # Find the best path 

1237 pw = [] 

1238 for path in pos_paths[-1]: 

1239 weight = len(path) 

1240 if any(x[1] == ["UNKNOWN"] for x in path): 

1241 weight += 100 # Penalize unknown paths 

1242 pw.append((weight, path)) 

1243 path = min(pw)[1] 

1244 

1245 # Convert the best path to tagsets and topics 

1246 tagsets: list[list[str]] = [[]] 

1247 topics: list[str] = [] 

1248 for i, tagspec, topicspec in path: 

1249 if len(tagsets or "") > 16: 

1250 # ctx.error("Too many tagsets! This is probably exponential", 

1251 # sortid="form_descriptions/20230818") 

1252 return [("error-unknown-tag", "error-exponential-tagsets")], [] 

1253 if tagspec == ["UNKNOWN"]: 

1254 new_tagsets = [] 

1255 for x in tagsets: 

1256 new_tagsets.append(x + topicspec) 

1257 tagsets = new_tagsets 

1258 continue 

1259 if tagspec: 

1260 new_tagsets = [] 

1261 for x in tagsets: 

1262 for t in tagspec: 

1263 if t:  [coverage: 1263 ↛ 1270, condition was always true]

1264 new_tags = list(x) 

1265 for tag in t.split(): 

1266 if tag not in new_tags: 

1267 new_tags.append(tag) 

1268 new_tagsets.append(new_tags) 

1269 else: 

1270 new_tagsets.append(x) 

1271 tagsets = new_tagsets 

1272 if topicspec: 

1273 for t in topicspec: 

1274 for topic in t.split(): 

1275 if topic not in topics: 

1276 topics.append(topic) 

1277 

1278 # print("unsorted tagsets:", tagsets) 

1279 ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets)) 

1280 # topics = list(sorted(set(topics))) XXX tests expect not sorted 

1281 # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics)) 

1282 # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST 

1283 # of tags. Turning topics into a tuple breaks tests, turning the tuples 

1284 # inside tagsets into lists breaks tests, I'm leaving them mismatched 

1285 # for now. XXX 

1286 return ret_tagsets, topics 

1287 

1288 

1289def parse_head_final_tags( 

1290 wxr: WiktextractContext, lang: str, form: str 

1291) -> tuple[str, list[str]]: 

1292 """Parses tags that are allowed at the end of a form head from the end 

1293 of the form. This can also be used for parsing the final gender etc tags 

1294 from translations and linkages.""" 

1295 assert isinstance(wxr, WiktextractContext) 

1296 assert isinstance(lang, str) # Should be language that "form" is for 

1297 assert isinstance(form, str) 

1298 

1299 # print("parse_head_final_tags: lang={} form={!r}".format(lang, form)) 

1300 

1301 # Make sure there are no double spaces in the form as this code does not 

1302 # handle them otherwise. 

1303 form = re.sub(r"\s+", " ", form.strip()) 

1304 if not form: 

1305 return form, [] 

1306 

1307 origform = form 

1308 

1309 tags = [] 

1310 

1311 # If parsing for certain Bantu languages (e.g., Swahili), handle 

1312 # some extra head-final tags first 

1313 if lang in head_final_bantu_langs: 

1314 m = re.search(head_final_bantu_re, form) 

1315 if m is not None: 

1316 tagkeys = m.group(1) 

1317 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr]  [coverage: 1317 ↛ 1332, condition was always true]

1318 form = form[: m.start()] 

1319 v = head_final_bantu_map[tagkeys] 

1320 if v.startswith("?"):  [coverage: 1320 ↛ 1321, condition was never true]

1321 v = v[1:] 

1322 wxr.wtp.debug( 

1323 "suspicious suffix {!r} in language {}: {}".format( 

1324 tagkeys, lang, origform 

1325 ), 

1326 sortid="form_descriptions/1028", 

1327 ) 

1328 tags.extend(v.split()) 

1329 

1330 # If parsing for certain Semitic languages (e.g., Arabic), handle 

1331 # some extra head-final tags first 

1332 if lang in head_final_semitic_langs: 

1333 m = re.search(head_final_semitic_re, form) 

1334 if m is not None: 

1335 tagkeys = m.group(1) 

1336 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr]  [coverage: 1336 ↛ 1351, condition was always true]

1337 form = form[: m.start()] 

1338 v = head_final_semitic_map[tagkeys] 

1339 if v.startswith("?"):  [coverage: 1339 ↛ 1340, condition was never true]

1340 v = v[1:] 

1341 wxr.wtp.debug( 

1342 "suspicious suffix {!r} in language {}: {}".format( 

1343 tagkeys, lang, origform 

1344 ), 

1345 sortid="form_descriptions/1043", 

1346 ) 

1347 tags.extend(v.split()) 

1348 

1349 # If parsing for certain other languages (e.g., Lithuanian, 

1350 # French, Finnish), handle some extra head-final tags first 

1351 if lang in head_final_other_langs: 

1352 m = re.search(head_final_other_re, form) 

1353 if m is not None: 

1354 tagkeys = m.group(1) 

1355 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr]  [coverage: 1355 ↛ 1360, condition was always true]

1356 form = form[: m.start()] 

1357 tags.extend(head_final_other_map[tagkeys].split(" ")) 

1358 

1359 # Handle normal head-final tags 

1360 m = re.search(head_final_re, form) 

1361 if m is not None: 

1362 tagkeys = m.group(3) 

1363 # Only replace tags ending with numbers in languages that have 

1364 # head-final numeric tags (e.g., Bantu classes); also, don't replace 

1365 # tags if the main title ends with them (then presume they are part 

1366 # of the word) 

1367 # print("head_final_tags form={!r} tagkeys={!r} lang={}" 

1368 # .format(form, tagkeys, lang)) 

1369 tagkeys_contains_digit = re.search(r"\d", tagkeys) 

1370 if ( 

1371 (not tagkeys_contains_digit or lang in head_final_numeric_langs) 

1372 and not wxr.wtp.title.endswith(" " + tagkeys) # type:ignore[union-attr] 

1373 and 

1374 # XXX the above test does not capture when the whole word is a 

1375 # xlat_head_map key, so I added the below test to complement 

1376 # it; does this break anything? 

1377 not wxr.wtp.title == tagkeys 

1378 ): # defunct/English, 

1379 # "more defunct" -> "more" ["archaic"] 

1380 if not tagkeys_contains_digit or lang in head_final_numeric_langs:  [coverage: 1380 ↛ 1394, condition was always true]

1381 form = form[: m.start()] 

1382 v = xlat_head_map[tagkeys] 

1383 if v.startswith("?"):  [coverage: 1383 ↛ 1384, condition was never true]

1384 v = v[1:] 

1385 wxr.wtp.debug( 

1386 "suspicious suffix {!r} in language {}: {}".format( 

1387 tagkeys, lang, origform 

1388 ), 

1389 sortid="form_descriptions/1077", 

1390 ) 

1391 tags.extend(v.split()) 

1392 

1393 # Generate warnings about words ending in " or" after processing 

1394 if ( 

1395 (form.endswith(" or") and not origform.endswith(" or")) 

1396 or re.search( 

1397 r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|" 

1398 r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)" 

1399 r"($|/| (f|m|sg|pl|anim|inan))", 

1400 form, 

1401 ) 

1402 or form.endswith(" du") 

1403 ): 

1404 if form not in ok_suspicious_forms: 

1405 wxr.wtp.debug( 

1406 "suspicious unhandled suffix in {}: {!r}, originally {!r}".format( 

1407 lang, form, origform 

1408 ), 

1409 sortid="form_descriptions/1089", 

1410 ) 

1411 

1412 # print("parse_head_final_tags: form={!r} tags={}".format(form, tags)) 

1413 return form, tags 

1414 

1415 

1416def quote_kept_parens(s: str) -> str: 

1417 """Changes certain parenthesized expressions so that they won't be 

1418 interpreted as parentheses. This is used for parts that are kept as 

1419 part of the word, such as "rear admiral (upper half)".""" 

1420 return re.sub( 

1421 r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|" 

1422 r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)", 

1423 r"__lpar__\1__rpar__", 

1424 s, 

1425 ) 

1426 

1427 

1428def quote_kept_ruby( 

1429 wxr: WiktextractContext, 

1430 ruby_tuples: list[ 

1431 tuple[ 

1432 str, 

1433 str, 

1434 ] 

1435 ], 

1436 s: str, 

1437) -> str: 

1438 if len(ruby_tuples) < 1: 

1439 wxr.wtp.debug( 

1440 "quote_kept_ruby called with no ruby", 

1441 sortid="form_description/1114/20230517", 

1442 ) 

1443 return s 

1444 ks = [] 

1445 rs = [] 

1446 for k, r in ruby_tuples: 

1447 ks.append(re.escape(k)) 

1448 rs.append(re.escape(r)) 

1449 if not (ks and rs): 

1450 wxr.wtp.debug( 

1451 f"empty column in ruby_tuples: {ruby_tuples}", 

1452 sortid="form_description/1124/20230606", 

1453 ) 

1454 return s 

1455 newm = re.compile( 

1456 r"({})\s*\(\s*({})\s*\)".format("|".join(ks), "|".join(rs)) 

1457 ) 

1458 rub_re = re.compile( 

1459 r"({})".format( 

1460 r"|".join( 

1461 r"{}\(*{}\)*".format( 

1462 re.escape(k), 

1463 re.escape(r), 

1464 ) 

1465 for k, r in ruby_tuples 

1466 ) 

1467 ) 

1468 ) 

1469 

1470 def paren_replace(m: re.Match) -> str: 

1471 return re.sub(newm, r"\1__lrub__\2__rrub__", m.group(0)) 

1472 

1473 return re.sub(rub_re, paren_replace, s) 

1474 

1475 

1476def unquote_kept_parens(s: str) -> str: 

1477 """Conerts the quoted parentheses back to normal parentheses.""" 

1478 return re.sub(r"__lpar__(.*?)__rpar__", r"(\1)", s) 

1479 
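# Illustrative sketch (not part of the original module): quote_kept_parens()
# and unquote_kept_parens() form a round trip that protects a small whitelist
# of parenthesized strings from being parsed as parentheses:
#
#   s = quote_kept_parens("rear admiral (upper half)")
#   # s == "rear admiral __lpar__upper half__rpar__"
#   unquote_kept_parens(s)
#   # -> "rear admiral (upper half)"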

1480 

1481def add_romanization( 

1482 wxr: WiktextractContext, 

1483 data: WordData, 

1484 roman: str, 

1485 text: str, 

1486 is_reconstruction: bool, 

1487 head_group: Optional[int], 

1488 ruby: Sequence[tuple[str, str]], 

1489) -> None: 

1490 tags_lst = ["romanization"] 

1491 m = re.match(r"([^:]+):(.+)", roman) 

1492 # This function's purpose is to intercept broken romanizations, 

1493 # like "Yale: hēnpyeng" style tags. Most romanization styles 

1494 # are already present as tags, so we can use decode_tags to find 

1495 # them. 

1496 if m:  [coverage: 1496 ↛ 1497, condition was never true]

1497 tagsets, topics = decode_tags(m.group(1)) 

1498 if tagsets: 

1499 for tags in tagsets: 

1500 tags_lst.extend(tags) 

1501 roman = m.group(2) 

1502 add_related( 

1503 wxr, 

1504 data, 

1505 tags_lst, 

1506 [roman], 

1507 text, 

1508 True, 

1509 is_reconstruction, 

1510 head_group, 

1511 ruby, 

1512 ) 

1513 

1514 

1515def add_related( 

1516 wxr: WiktextractContext, 

1517 data: WordData, 

1518 tags_lst: Union[list[str], tuple[str, ...]], 

1519 related_list: list[str], 

1520 origtext: str, 

1521 add_all_canonicals: bool, 

1522 is_reconstruction: bool, 

1523 head_group: Optional[int], 

1524 ruby_data: Optional[Sequence[tuple[str, str]]] = None, 

1525) -> Optional[list[tuple[str, ...]]]: 

1526 """Internal helper function for some post-processing entries for related 

1527 forms (e.g., in word head). This returns a list of lists of tags to be 

1528 added to following related forms or None (cf. walrus/English word head, 

1529 parenthesized part starting with "both").""" 

1530 assert isinstance(wxr, WiktextractContext) 

1531 assert isinstance(tags_lst, (list, tuple)) 

1532 for x in tags_lst: 

1533 assert isinstance(x, str) 

1534 assert isinstance(related_list, (list, tuple)) 

1535 assert isinstance(origtext, str) 

1536 assert add_all_canonicals in (True, False) 

1537 assert isinstance(ruby_data, (list, tuple)) or ruby_data is None 

1538 if ruby_data is None:  [coverage: 1538 ↛ 1539, condition was never true]

1539 ruby_data = [] 

1540 related = " ".join(related_list) 

1541 # print("add_related: tags_lst={} related={}".format(tags_lst, related)) 

1542 if related == "[please provide]":  [coverage: 1542 ↛ 1543, condition was never true]

1543 return None 

1544 if related in IGNORED_RELATED:  [coverage: 1544 ↛ 1545, condition was never true]

1545 return None 

1546 if is_reconstruction and related.startswith("*") and len(related) > 1: 

1547 related = related[1:] 

1548 

1549 # Get title word, with any reconstruction prefix removed 

1550 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title) # type:ignore[arg-type] 

1551 

1552 def check_related(related: str) -> None: 

1553 # Warn about some suspicious related forms 

1554 m = re.search(suspicious_related_re, related) 

1555 if (m and m.group(0) not in titleword) or ( 

1556 related in ("f", "m", "n", "c") and len(titleword) >= 3 

1557 ): 

1558 if "eumhun" in tags_lst: 1558 ↛ 1559line 1558 didn't jump to line 1559 because the condition on line 1558 was never true

1559 return 

1560 if "cangjie-input" in tags_lst: 1560 ↛ 1561line 1560 didn't jump to line 1561 because the condition on line 1560 was never true

1561 return 

1562 if "class" in tags_lst: 1562 ↛ 1563line 1562 didn't jump to line 1563 because the condition on line 1562 was never true

1563 return 

1564 if wxr.wtp.section == "Korean" and re.search(  [coverage: 1564 ↛ 1568, condition was never true]

1565 r"^\s*\w*>\w*\s*$", related 

1566 ): 

1567 # ignore Korean "i>ni" / "라>나" values 

1568 return 

1569 if (  [coverage: 1569 ↛ 1576, condition was never true]

1570 wxr.wtp.section == "Burmese" 

1571 and "romanization" in tags_lst 

1572 and re.search(r":", related) 

1573 ): 

1574 # ignore Burmese with ":", that is used in Burmese 

1575 # transliteration of "း", the high-tone visarga. 

1576 return 

1577 wxr.wtp.debug( 

1578 "suspicious related form tags {}: {!r} in {!r}".format( 

1579 tags_lst, related, origtext 

1580 ), 

1581 sortid="form_descriptions/1147", 

1582 ) 

1583 

1584 following_tagsets = None # Tagsets to add to following related forms 

1585 roman = None 

1586 tagsets1: list[tuple[str, ...]] = [tuple()] 

1587 topics1: list[str] = [] 

1588 

1589 m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related) 

1590 if m: 

1591 paren = m.group(1) 

1592 related = related[m.end() :] 

1593 m = re.match(r"^(all|both) (.*)", paren) 

1594 if m:  [coverage: 1594 ↛ 1595, condition was never true]

1595 tagsets1, topics1 = decode_tags(m.group(2)) 

1596 following_tagsets = tagsets1 

1597 else: 

1598 tagsets1, topics1 = decode_tags(paren) 

1599 else: 

1600 m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related) 

1601 if m: 

1602 paren = m.group(1) 

1603 if paren.startswith("U+"):  [coverage: 1603 ↛ 1604, condition was never true]

1604 related = related[: m.start()] 

1605 else: 

1606 cls = classify_desc(paren) 

1607 if (  [coverage: 1607 ↛ 1614, condition was always true]

1608 cls in ("romanization", "english") 

1609 and classify_desc(related[: m.start()]) == "other" 

1610 ): 

1611 roman = paren 

1612 related = related[: m.start()] 

1613 else: 

1614 related = related[: m.start()] 

1615 tagsets1, topics1 = decode_tags(paren) 

1616 if related and related.startswith("{{"):  [coverage: 1616 ↛ 1617, condition was never true]

1617 wxr.wtp.debug( 

1618 "{{ in word head form - possible Wiktionary error: {!r}".format( 

1619 related 

1620 ), 

1621 sortid="form_descriptions/1177", 

1622 ) 

1623 return None # Likely Wiktionary coding error 

1624 related = unquote_kept_parens(related) 

1625 # Split related by "/" (e.g., grande/Spanish) superlative in head 

1626 # Do not split if / in word title, see π//Japanese 

1627 if len(related) > 5 and "/" not in wxr.wtp.title: # type:ignore[operator] 

1628 alts = split_at_comma_semi(related, separators=["/"]) 

1629 else: 

1630 alts = [related] 

1631 if ruby_data:  [coverage: 1631 ↛ 1633, condition was never true]

1632 # prepare some regex stuff in advance 

1633 ks, rs = [], [] 

1634 for k, r in ruby_data: 

1635 ks.append(re.escape(k)) 

1636 rs.append(re.escape(r)) 

1637 splitter = r"((?:{})__lrub__(?:{})__rrub__)".format( 

1638 "|".join(ks), "|".join(rs) 

1639 ) 

1640 for related in alts: 

1641 ruby: list[tuple[str, str]] = [] 

1642 if ruby_data:  [coverage: 1642 ↛ 1643, condition was never true]

1643 new_related = [] 

1644 rub_split = re.split(splitter, related) 

1645 for s in rub_split: 

1646 m = re.match(r"(.+)__lrub__(.+)__rrub__", s) 

1647 if m: 

1648 # add ruby with (\1, \2) 

1649 ruby.append((m.group(1), m.group(2))) 

1650 new_related.append(m.group(1)) 

1651 else: 

1652 new_related.append(s) 

1653 related = "".join(new_related) 

1654 tagsets2, topics2 = decode_tags(" ".join(tags_lst)) 

1655 for tags1 in tagsets1: 

1656 assert isinstance(tags1, (list, tuple)) 

1657 for tags2 in tagsets2: 

1658 assert isinstance(tags1, (list, tuple)) 

1659 dt: LinkageData = {"word": related} 

1660 if roman: 

1661 dt["roman"] = roman 

1662 if ruby:  [coverage: 1662 ↛ 1663, condition was never true]

1663 dt["ruby"] = ruby 

1664 if "alt-of" in tags2: 1664 ↛ 1665line 1664 didn't jump to line 1665 because the condition on line 1664 was never true

1665 check_related(related) 

1666 data_extend(data, "tags", tags1) 

1667 data_extend(data, "tags", tags2) 

1668 data_extend(data, "topics", topics1) 

1669 data_extend(data, "topics", topics2) 

1670 data_append(data, "alt_of", dt) 

1671 elif "form-of" in tags2: 1671 ↛ 1672line 1671 didn't jump to line 1672 because the condition on line 1671 was never true

1672 check_related(related) 

1673 data_extend(data, "tags", tags1) 

1674 data_extend(data, "tags", tags2) 

1675 data_extend(data, "topics", topics1) 

1676 data_extend(data, "topics", topics2) 

1677 data_append(data, "form_of", dt) 

1678 elif "compound-of" in tags2: 1678 ↛ 1679line 1678 didn't jump to line 1679 because the condition on line 1678 was never true

1679 check_related(related) 

1680 data_extend(data, "tags", tags1) 

1681 data_extend(data, "tags", tags2) 

1682 data_extend(data, "topics", topics1) 

1683 data_extend(data, "topics", topics2) 

1684 data_append(data, "compound", related) 

1685 else: 

1686 lang = wxr.wtp.section or "LANG_MISSING" 

1687 related, final_tags = parse_head_final_tags( 

1688 wxr, lang, related 

1689 ) 

1690 # print("add_related: related={!r} tags1={!r} tags2={!r} " 

1691 # "final_tags={!r}" 

1692 # .format(related, tags1, tags2, final_tags)) 

1693 tags = list(tags1) + list(tags2) + list(final_tags) 

1694 check_related(related) 

1695 form: FormData = {"form": related} 

1696 if head_group: 

1697 form["head_nr"] = head_group 

1698 if roman: 

1699 form["roman"] = roman 

1700 if ruby: 1700 ↛ 1701line 1700 didn't jump to line 1701 because the condition on line 1700 was never true

1701 form["ruby"] = ruby 

1702 data_extend(form, "topics", topics1) 

1703 data_extend(form, "topics", topics2) 

1704 if topics1 or topics2: 1704 ↛ 1705line 1704 didn't jump to line 1705 because the condition on line 1704 was never true

1705 wxr.wtp.debug( 

1706 "word head form has topics: {}".format(form), 

1707 sortid="form_descriptions/1233", 

1708 ) 

1709 # Add tags from canonical form into the main entry 

1710 if "canonical" in tags: 

1711 if related in ("m", "f") and len(titleword) > 1: 1711 ↛ 1712line 1711 didn't jump to line 1712 because the condition on line 1711 was never true

1712 wxr.wtp.debug( 

1713 "probably incorrect canonical form " 

1714 "{!r} ignored (probably tag combination " 

1715 "missing from xlat_head_map)".format(related), 

1716 sortid="form_descriptions/1241", 

1717 ) 

1718 continue 

1719 if ( 

1720 related != titleword 

1721 or add_all_canonicals 

1722 or topics1 

1723 or topics2 

1724 or ruby 

1725 ): 

1726 data_extend(form, "tags", sorted(set(tags))) 

1727 else: 

1728 # We won't add canonical form here 

1729 filtered_tags = list( 

1730 x for x in tags if x != "canonical" 

1731 ) 

1732 data_extend(data, "tags", filtered_tags) 

1733 continue 

1734 else: 

1735 data_extend(form, "tags", sorted(set(tags))) 

1736 # Only insert if the form is not already there 

1737 for old in data.get("forms", ()): 

1738 if form == old: 1738 ↛ 1739line 1738 didn't jump to line 1739 because the condition on line 1738 was never true

1739 break 

1740 else: 

1741 data_append(data, "forms", form) 

1742 

1743 # If this form had pre-tags that started with "both" or "all", add those 

1744 # tags also to following related forms that don't have their own tags 

1745 # specified. 

1746 return following_tagsets 

1747 

1748 
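
For orientation, the branches above build plain dicts and attach them to ``data``: "alt_of" and "form_of" entries are LinkageData dicts keyed by "word" (with optional "roman"/"ruby"), "compound" entries store the bare string, and everything else becomes a FormData dict appended under "forms". A minimal sketch of the resulting shapes, with invented values:

    # Illustrative only; real values come from the parsed head line.
    form_entry = {
        "form": "perra",          # the related form
        "tags": ["feminine"],     # tags1 + tags2 + head-final tags, deduplicated
        # "roman", "ruby", "head_nr" and "topics" appear only when detected
    }
    alt_of_entry = {"word": "colour"}  # appended to data["alt_of"] alongside its tags
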

1749# Issue #967: in English word forms, forms are sometimes skipped because

1750# they are taggable words and their distw() is too big, like clipping from clip 

1751WORDS_WITH_FALSE_POSITIVE_TAGS: dict[str, list[str]] = { 

1752 "clip": ["clipping"], # XXX remember to change me back to clipping after 

1753 "English": ["English", "Englishes"], 

1754 "common": ["common", "commoner"], 

1755 # tests. 

1756} 

1757 

1758WORDS_WITH_FALSE_POSITIVE_FORMS: dict[str, list[str]] = { 

1759 "unaccountability": ["countable", "uncountable"], 

1760 "uncountability": ["countable", "uncountable"], 

1761} 

1762 

1763FALSE_POSITIVE_MISSING_FORMS: dict[str, list[str]] = {} 

1764 

1765FORM_ASSOCIATED_TAG_WORDS: set[str] = { 

1766 "participle", 

1767 "past", 

1768 "present", 

1769 "singular", 

1770 "plural", 

1771 "first-person", 

1772 "second-person", 

1773 "third-person", 

1774 "gerund", 

1775} 

1776 

1777 
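
These module-level tables are consulted later in parse_word_head with plain membership tests keyed by the page title (FORM_ASSOCIATED_TAG_WORDS is a flat set of tag words). Roughly, and with a hypothetical title and candidate word:

    # Sketch of the guard used further down for English entries:
    title = "clip"            # wxr.wtp.title
    candidate = "clipping"    # parts[i - 1]
    is_false_positive_tag = (
        title in WORDS_WITH_FALSE_POSITIVE_TAGS
        and candidate in WORDS_WITH_FALSE_POSITIVE_TAGS[title]
    )
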

1778def parse_word_head( 

1779 wxr: WiktextractContext, 

1780 pos: str, 

1781 text: str, 

1782 data: WordData, 

1783 is_reconstruction: bool, 

1784 head_group: Optional[int], 

1785 ruby=None, 

1786 links=None, 

1787) -> None: 

1788 """Parses the head line for a word for in a particular language and 

1789 part-of-speech, extracting tags and related forms.""" 

1790 assert isinstance(wxr, WiktextractContext) 

1791 assert isinstance(pos, str) 

1792 assert isinstance(text, str) 

1793 assert isinstance(data, dict) 

1794 assert isinstance(ruby, (list, tuple)) or ruby is None 

1795 if ruby is None: 

1796 ruby = [] 

1797 assert is_reconstruction in (True, False) 

1798 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text)) 

1799 # print(f"PARSE_WORD_HEAD: {data=}") 

1800 if links is None: 

1801 links = [] 

1802 

1803 if len(links) > 0: 

1804 # if we have link data (that is, links with stuff like commas and 

1805 # spaces), replace word_re with a modified local scope pattern

1806 # print(f"links {list((c, ord(c)) for link in links for c in link)=}") 

1807 word_re = re.compile( 

1808 r"\b" # In case we have forms that are longer and contain links 

1809 + 

1810 # or words as a substring... 

1811 r"\b|\b".join( 

1812 sorted((re.escape(s) for s in links), key=lambda x: -len(x)) 

1813 ) 

1814 + r"\b|" 

1815 + word_pattern 

1816 ) 

1817 else: 

1818 word_re = word_re_global 

1819 

1820 if "Lua execution error" in text or "Lua timeout error" in text: 1820 ↛ 1821line 1820 didn't jump to line 1821 because the condition on line 1820 was never true

1821 return 

1822 

1823 # In Aug 2021, some words had spurious Template:en at the end of head forms 

1824 # due to a Wiktionary error. 

1825 text = re.sub(r"\s+Template:[-a-zA-Z]+\s*$", "", text) 

1826 

1827 # Fix words with "superlative:" or "comparative:" at end of head 

1828 # e.g. grande/Spanish/Adj 

1829 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text) 

1830 

1831 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb 

1832 m = re.search(r", non-past ([^)]+ \([^)]+\))", text) 

1833 if m: 

1834 add_related( 

1835 wxr, 

1836 data, 

1837 ["non-past"], 

1838 [m.group(1)], 

1839 text, 

1840 True, 

1841 is_reconstruction, 

1842 head_group, 

1843 ruby, 

1844 ) 

1845 text = text[: m.start()] + text[m.end() :] 

1846 

1847 language = wxr.wtp.section 

1848 titleword = re.sub( 

1849 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE" 

1850 ) 

1851 titleparts = list( 

1852 m.group(0) 

1853 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE") 

1854 ) 

1855 if not titleparts: 1855 ↛ 1856line 1855 didn't jump to line 1856 because the condition on line 1855 was never true

1856 return 

1857 

1858 # Remove " or" from the end to prevent weird canonical forms 

1859 if text.endswith(" or"): 

1860 for tp in titleparts: 

1861 if text.endswith(tp): 1861 ↛ 1862line 1861 didn't jump to line 1862 because the condition on line 1861 was never true

1862 break 

1863 else: 

1864 text = text.removesuffix(" or").rstrip() 

1865 

1866 # Handle the part of the head that is not in parentheses. However, certain 

1867 # parenthesized parts are part of word, and those must be handled 

1868 # specially here. 

1869 if ruby: 1869 ↛ 1870line 1869 didn't jump to line 1870 because the condition on line 1869 was never true

1870 text = quote_kept_ruby(wxr, ruby, text) 

1871 base = text 

1872 base = quote_kept_parens(base) 

1873 base = remove_text_in_parentheses(base) 

1874 base = base.replace("?", "") # Removes uncertain articles etc 

1875 base = re.sub(r"\s+", " ", base) 

1876 base = re.sub(r" ([,;])", r"\1", base) 

1877 base = re.sub(r" • ", r" ", base) 

1878 # Many languages use • as a punctuation mark separating the base 

1879 # from the rest of the head. στάδιος/Ancient Greek, issue #176 

1880 base = base.strip() 

1881 

1882 # Check for certain endings in head (mostly for compatibility with weird 

1883 # heads, e.g. rata/Romanian "1st conj." at end) 

1884 m = re.search(head_end_re, base) 

1885 tags: Union[tuple[str, ...], list[str]] = [] 

1886 if m: 1886 ↛ 1887line 1886 didn't jump to line 1887 because the condition on line 1886 was never true

1887 tags = head_end_map[m.group(1).lower()].split() 

1888 data_extend(data, "tags", tags) 

1889 base = base[: m.start()] 

1890 

1891 # Special case: handle Hán Nôm readings for Vietnamese characters 

1892 m = re.match( 

1893 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base 

1894 ) 

1895 if m: 1895 ↛ 1896line 1895 didn't jump to line 1896 because the condition on line 1895 was never true

1896 tag, readings = m.groups() 

1897 tag = re.sub(r"\s+", "-", tag) 

1898 for reading in split_at_comma_semi(readings, skipped=links): 

1899 add_related( 

1900 wxr, 

1901 data, 

1902 [tag], 

1903 [reading], 

1904 text, 

1905 True, 

1906 is_reconstruction, 

1907 head_group, 

1908 ruby, 

1909 ) 

1910 return 

1911 

1912 # Special case: Hebrew " [pattern: nnn]" ending 

1913 m = re.search(r"\s+\[pattern: ([^]]+)\]", base) 

1914 if m: 1914 ↛ 1915line 1914 didn't jump to line 1915 because the condition on line 1914 was never true

1915 add_related( 

1916 wxr, 

1917 data, 

1918 ["class"], 

1919 [m.group(1)], 

1920 text, 

1921 True, 

1922 is_reconstruction, 

1923 head_group, 

1924 ruby, 

1925 ) 

1926 base = base[: m.start()] + base[m.end() :] 

1927 

1928 # Clean away some messy "Upload an image" template text used in 

1929 # American Sign Language: 

1930 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp 

1931 m = re.search(r"Upload .+ gif image.", base) 

1932 if m: 1932 ↛ 1933line 1932 didn't jump to line 1933 because the condition on line 1932 was never true

1933 base = base[: m.start()] + base[m.end() :] 

1934 

1935 # Split the head into alternatives. This is a complicated task, as 

1936 # we do not want to split on "or" or "," when immediately followed by more

1937 # head-final tags, but otherwise do want to split by them. 

1938 # 20230907 added "or" to this to handle 'true or false', titles with 'or' 

1939 if wxr.wtp.title and ("," in wxr.wtp.title or " or " in wxr.wtp.title): 

1940 # A kludge to handle article titles/phrases with commas. 

1941 # Preprocess splits to first capture the title, then handle 

1942 # all the others as usual. 

1943 presplits = re.split(r"({})".format(wxr.wtp.title), base) 

1944 splits = [] 

1945 for psplit in presplits: 

1946 if psplit == wxr.wtp.title: 

1947 splits.append(psplit) 

1948 else: 

1949 splits.extend(re.split(head_split_re, psplit)) 

1950 else: 

1951 # Do the normal split; previous only-behavior. 

1952 splits = re.split(head_split_re, base) 

1953 # print("SPLITS:", splits) 

1954 alts: list[str] = [] 

1955 # print("parse_word_head: splits:", splits, 

1956 # "head_split_re_parens:", head_split_re_parens) 

1957 for i in range( 

1958 0, len(splits) - head_split_re_parens, head_split_re_parens + 1 

1959 ): 

1960 v = splits[i] 

1961 ending = splits[i + 1] or "" # XXX is this correct??? 

1962 # print("parse_word_head alts v={!r} ending={!r} alts={}" 

1963 # .format(v, ending, alts)) 

1964 if alts and (v == "" and ending): 

1965 assert ending[0] == " " 

1966 alts[-1] += " or" + ending # ending starts with a space

1967 elif v or ending: 1967 ↛ 1957line 1967 didn't jump to line 1957 because the condition on line 1967 was always true

1968 alts.append((v or "") + (ending or "")) 

1969 last = splits[-1].strip() 

1970 conn = "" if len(splits) < 3 else splits[-2] 

1971 # print("parse_word_head alts last={!r} conn={!r} alts={}" 

1972 # .format(last, conn, alts)) 

1973 if ( 

1974 alts 

1975 and last 

1976 and ( 

1977 last.split()[0] in xlat_head_map 

1978 or ( 

1979 conn == " or " 

1980 and (alts[-1] + " or " + last).strip() in xlat_head_map 

1981 ) 

1982 ) 

1983 ): 

1984 alts[-1] += " or " + last 

1985 elif last: 

1986 alts.append(last) 

1987 

1988 # print("parse_word_head alts: {}".format(alts)) 

1989 # print(f"{base=}") 

1990 

1991 # Process the head alternatives 

1992 canonicals: list[tuple[list[str], list[str]]] = [] 

1993 mode: Optional[str] = None 

1994 for alt_i, alt in enumerate(alts): 

1995 alt = alt.strip() 

1996 if alt.startswith("compound form:"): 1996 ↛ 1997line 1996 didn't jump to line 1997 because the condition on line 1996 was never true

1997 mode = "compound-form" 

1998 alt = alt[14:].strip() 

1999 if mode == "compound-form": 1999 ↛ 2000line 1999 didn't jump to line 2000 because the condition on line 1999 was never true

2000 add_related( 

2001 wxr, 

2002 data, 

2003 ["in-compounds"], 

2004 [alt], 

2005 text, 

2006 True, 

2007 is_reconstruction, 

2008 head_group, 

2009 ruby, 

2010 ) 

2011 continue 

2012 # For non-first parts, see if it can be treated as tags-only 

2013 if alt_i == 0: 

2014 expanded_alts = [alt] 

2015 else: 

2016 expanded_alts = map_with(xlat_descs_map, [alt]) 

2017 # print("EXPANDED_ALTS:", expanded_alts) 

2018 tagsets: Optional[list[tuple[str, ...]]] 

2019 for alt in expanded_alts: 

2020 baseparts = list(m.group(0) for m in word_re.finditer(alt)) 

2021 if alt_i > 0: 

2022 tagsets, topics = decode_tags(" ".join(baseparts)) 

2023 if not any("error-unknown-tag" in x for x in tagsets): 

2024 data_extend(data, "topics", topics) 

2025 for tags1 in tagsets: 

2026 data_extend(data, "tags", tags1) 

2027 continue 

2028 

2029 alt, tags = parse_head_final_tags( 

2030 wxr, language or "MISSING_LANG", alt 

2031 ) 

2032 tags = list(tags) # Make sure we don't modify anything cached 

2033 tags.append("canonical") 

2034 if alt_i == 0 and "," in wxr.wtp.title: # type:ignore[operator] 

2035 # Kludge to handle article titles/phrases with commas. 

2036 # basepart's regex strips commas, which leads to a 

2037 # canonical form that is the title phrase without a comma. 

2038 # basepart in add_related is almost immediately joined with 

2039 # spaces anyhow. XXX not exactly sure why it's 

2040 # canonicals.append((tags, baseparts)) and not (tags, [alt]) 

2041 baseparts = [alt] 

2042 canonicals.append((tags, baseparts)) 

2043 for tags, baseparts in canonicals: 

2044 add_related( 

2045 wxr, 

2046 data, 

2047 tags, 

2048 baseparts, 

2049 text, 

2050 len(canonicals) > 1, 

2051 is_reconstruction, 

2052 head_group, 

2053 ruby, 

2054 ) 

2055 

2056 # Handle parenthesized descriptors for the word form and links to 

2057 # related words 

2058 text = quote_kept_parens(text) 

2059 parens = list( 

2060 m.group(2) 

2061 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text) 

2062 ) 

2063 parens.extend( 

2064 m.group(1) 

2065 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text) 

2066 ) 

2067 have_romanization = False 

2068 have_ruby = False 

2069 hiragana = "" 

2070 katakana = "" 

2071 for paren in parens: 

2072 paren = paren.strip() 

2073 if not paren: 2073 ↛ 2074line 2073 didn't jump to line 2074 because the condition on line 2073 was never true

2074 continue 

2075 if paren.startswith("see "): 

2076 continue 

2077 if paren.startswith("U+"): 2077 ↛ 2078line 2077 didn't jump to line 2078 because the condition on line 2077 was never true

2078 continue 

2079 # In some rare cases, strip a word that inflects from the form

2080 # description, e.g. "look through rose-tinted glasses"/English. 

2081 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren) 

2082 

2083 # If it starts with hiragana or katakana, treat as such form. Note 

2084 # that each hiragana/katakana character is in separate parentheses, 

2085 # so we must concatenate them. 

2086 try: 

2087 un = unicodedata.name(paren[0]).split()[0] 

2088 except ValueError: 

2089 un = "INVALID" 

2090 if un == "KATAKANA": 2090 ↛ 2091line 2090 didn't jump to line 2091 because the condition on line 2090 was never true

2091 katakana += paren 

2092 have_ruby = True 

2093 continue 

2094 if un == "HIRAGANA": 2094 ↛ 2095line 2094 didn't jump to line 2095 because the condition on line 2094 was never true

2095 hiragana += paren 

2096 have_ruby = True 

2097 continue 

2098 

2099 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes," 

2100 # in the middle of the parenthesized expression, e.g. 薄 

2101 def strokes_repl(m: re.Match) -> str: 

2102 strokes1, tags1, strokes2, tags2 = m.groups() 

2103 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]: 

2104 tags = tags.split(", ") 

2105 tags = list( 

2106 "Mainland China" if t == "Mainland" else t for t in tags 

2107 ) 

2108 tags.append("strokes") 

2109 add_related( 

2110 wxr, 

2111 data, 

2112 tags, 

2113 [strokes], 

2114 text, 

2115 True, 

2116 is_reconstruction, 

2117 head_group, 

2118 ruby, 

2119 ) 

2120 return ", " 

2121 

2122 paren = re.sub( 

2123 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ", 

2124 strokes_repl, 

2125 paren, 

2126 ) 

2127 

2128 descriptors = map_with(xlat_descs_map, [paren]) 

2129 new_desc = [] 

2130 for desc in descriptors: 

2131 new_desc.extend( 

2132 map_with( 

2133 xlat_tags_map, 

2134 split_at_comma_semi(desc, extra=[", or "], skipped=links), 

2135 ) 

2136 ) 

2137 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None 

2138 following_tags = None # Added to prev_tags from previous parenthesized 

2139 # part, e.g. walrus/English 

2140 # "(both nonstandard, proscribed, uncommon)" 

2141 for desc_i, desc in enumerate(new_desc): 

2142 # print("HEAD DESC: {!r}".format(desc)) 

2143 

2144 # Abort on certain descriptors (assume remaining values are 

2145 # examples or uninteresting, cf. gaan/Navajo, horior/Latin) 

2146 if re.match(r"^(per |e\.g\.$)", desc): 2146 ↛ 2147line 2146 didn't jump to line 2147 because the condition on line 2146 was never true

2147 break 

2148 

2149 # If it all consists of CJK characters, add it with the 

2150 # CJK tag. This is used at least for some Vietnamese 

2151 # words (e.g., ba/Vietnamese) 

2152 try: 

2153 if all(unicodedata.name(x).startswith("CJK ") for x in desc): 2153 ↛ 2154line 2153 didn't jump to line 2154 because the condition on line 2153 was never true

2154 add_related( 

2155 wxr, 

2156 data, 

2157 ["CJK"], 

2158 [desc], 

2159 text, 

2160 True, 

2161 is_reconstruction, 

2162 head_group, 

2163 ruby, 

2164 ) 

2165 continue 

2166 except ValueError: 

2167 pass 

2168 

2169 # Handle some special cases 

2170 splitdesc = desc.split() 

2171 if ( 2171 ↛ 2180line 2171 didn't jump to line 2180 because the condition on line 2171 was never true

2172 len(splitdesc) >= 3 

2173 and splitdesc[1] == "superlative" 

2174 and classify_desc(splitdesc[0]) != "tags" 

2175 and prev_tags 

2176 ): 

2177 # Handle the special case of second comparative after comma, 

2178 # followed by superlative without comma. E.g. 

2179 # mal/Portuguese/Adv 

2180 for ts in prev_tags: 

2181 add_related( 

2182 wxr, 

2183 data, 

2184 ts, 

2185 [splitdesc[0]], 

2186 text, 

2187 True, 

2188 is_reconstruction, 

2189 head_group, 

2190 ruby, 

2191 ) 

2192 desc = " ".join(splitdesc[1:]) 

2193 elif ( 2193 ↛ 2201line 2193 didn't jump to line 2201 because the condition on line 2193 was never true

2194 len(splitdesc) == 2 

2195 and splitdesc[0] in ("also", "and") 

2196 and prev_tags 

2197 and classify_desc(splitdesc[1]) != "tags" 

2198 ): 

2199 # Sometimes alternative forms are prefixed with "also" or 

2200 # "and" 

2201 for ts in prev_tags: 

2202 add_related( 

2203 wxr, 

2204 data, 

2205 ts, 

2206 [splitdesc[1]], 

2207 text, 

2208 True, 

2209 is_reconstruction, 

2210 head_group, 

2211 ruby, 

2212 ) 

2213 continue 

2214 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2214 ↛ 2215line 2214 didn't jump to line 2215 because the condition on line 2214 was never true

2215 continue 

2216 

2217 # If only one word, assume it is comma-separated alternative 

2218 # to the previous one 

2219 if " " not in desc: 

2220 cls = classify_desc(desc) 

2221 if cls != "tags": 

2222 if prev_tags: 2222 ↛ 2224line 2222 didn't jump to line 2224 because the condition on line 2222 was never true

2223 # Assume comma-separated alternative to previous one 

2224 for ts in prev_tags: 

2225 add_related( 

2226 wxr, 

2227 data, 

2228 ts, 

2229 [desc], 

2230 text, 

2231 True, 

2232 is_reconstruction, 

2233 head_group, 

2234 ruby, 

2235 ) 

2236 continue 

2237 elif distw(titleparts, desc) <= 0.5: 2237 ↛ 2240line 2237 didn't jump to line 2240 because the condition on line 2237 was never true

2238 # Similar to head word, assume a dialectal variation to 

2239 # the base form. Cf. go/Alemannic German/Verb 

2240 add_related( 

2241 wxr, 

2242 data, 

2243 ["alternative"], 

2244 [desc], 

2245 text, 

2246 True, 

2247 is_reconstruction, 

2248 head_group, 

2249 ruby, 

2250 ) 

2251 continue 

2252 elif ( 

2253 cls in ("romanization", "english") 

2254 and not have_romanization 

2255 and classify_desc(titleword) == "other" 

2256 and not ( 

2257 "categories" in data and desc in data["categories"] 

2258 ) 

2259 ): 

2260 # Assume it to be a romanization 

2261 add_romanization( 

2262 wxr, 

2263 data, 

2264 desc, 

2265 text, 

2266 is_reconstruction, 

2267 head_group, 

2268 ruby, 

2269 ) 

2270 have_romanization = True 

2271 continue 

2272 

2273 m = re.match(r"^(\d+) strokes?$", desc) 

2274 if m: 

2275 # Special case, used to give #strokes for Han characters 

2276 add_related( 

2277 wxr, 

2278 data, 

2279 ["strokes"], 

2280 [m.group(1)], 

2281 text, 

2282 True, 

2283 is_reconstruction, 

2284 head_group, 

2285 ruby, 

2286 ) 

2287 continue 

2288 

2289 # See if it is radical+strokes 

2290 m = re.match( 

2291 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF" 

2292 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)" 

2293 r"( in (Japanese|Chinese|traditional Chinese|" 

2294 r"simplified Chinese))?$", 

2295 desc, 

2296 ) 

2297 if m: 2297 ↛ 2300line 2297 didn't jump to line 2300 because the condition on line 2297 was never true

2298 # Special case, used to give radical + strokes for Han 

2299 # characters 

2300 radical_strokes = m.group(1) 

2301 lang = m.group(3) 

2302 t = ["radical+strokes"] 

2303 if lang: 

2304 t.extend(lang.split()) 

2305 add_related( 

2306 wxr, 

2307 data, 

2308 t, 

2309 [radical_strokes], 

2310 text, 

2311 True, 

2312 is_reconstruction, 

2313 head_group, 

2314 ruby, 

2315 ) 

2316 prev_tags = None 

2317 following_tags = None 

2318 continue 

2319 

2320 # See if it indicates historical Katakana orthography (←) or

2321 # just otherwise katakana/hiragana form 

2322 m = re.match(r"←\s*|kana\s+", desc) 

2323 if m: 2323 ↛ 2324line 2323 didn't jump to line 2324 because the condition on line 2323 was never true

2324 if desc.startswith("←"): 

2325 t1 = "historical " 

2326 else: 

2327 t1 = "" 

2328 x = desc[m.end() :] 

2329 if x.endswith("?"): 

2330 x = x[:-1] 

2331 # XXX should we add a tag indicating uncertainty? 

2332 if x: 

2333 name = unicodedata.name(x[0]) 

2334 if name.startswith("HIRAGANA "): 

2335 desc = t1 + "hiragana " + x 

2336 elif name.startswith("KATAKANA "): 

2337 desc = t1 + "katakana " + x 

2338 

2339 # See if it is "n strokes in Chinese" or similar 

2340 m = re.match( 

2341 r"(\d+) strokes in (Chinese|Japanese|" 

2342 r"traditional Chinese|simplified Chinese)$", 

2343 desc, 

2344 ) 

2345 if m: 2345 ↛ 2347line 2345 didn't jump to line 2347 because the condition on line 2345 was never true

2346 # Special case, used to give just strokes for some Han chars 

2347 strokes = m.group(1) 

2348 lang = m.group(2) 

2349 t = ["strokes"] 

2350 t.extend(lang.split()) 

2351 add_related( 

2352 wxr, 

2353 data, 

2354 t, 

2355 [strokes], 

2356 text, 

2357 True, 

2358 is_reconstruction, 

2359 head_group, 

2360 ruby, 

2361 ) 

2362 prev_tags = None 

2363 following_tags = None 

2364 continue 

2365 

2366 # American Sign Language has images (or requests for image) 

2367 # as heads, + this ASL gloss after. 

2368 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text) 

2369 if m2: 2369 ↛ 2370line 2369 didn't jump to line 2370 because the condition on line 2369 was never true

2370 add_related( 

2371 wxr, 

2372 data, 

2373 ["ASL-gloss"], 

2374 [m2.group(1)], 

2375 text, 

2376 True, 

2377 is_reconstruction, 

2378 head_group, 

2379 ruby, 

2380 ) 

2381 continue 

2382 

2383 parts = list(m.group(0) for m in re.finditer(word_re, desc)) 

2384 if not parts: 2384 ↛ 2385line 2384 didn't jump to line 2385 because the condition on line 2384 was never true

2385 prev_tags = None 

2386 following_tags = None 

2387 continue 

2388 

2389 # Check for certain language-specific header part starts that 

2390 # modify the tags of the following form

2391 if len(parts) == 2 and language in lang_specific_head_map: 2391 ↛ 2392line 2391 didn't jump to line 2392 because the condition on line 2391 was never true

2392 ht = lang_specific_head_map[language] 

2393 if parts[0] in ht: 

2394 rem_tags, add_tags = ht[parts[0]] 

2395 new_prev_tags1: list[list[str]] = [] 

2396 tags2: Union[tuple[str, ...], list[str]] 

2397 for tags2 in prev_tags or [()]: 

2398 if rem_tags is True: # Remove all old tags 

2399 tsets = set() 

2400 else: 

2401 tsets = set(tags2) - set(rem_tags.split()) 

2402 tsets = tsets | set(add_tags.split()) 

2403 tags = list(sorted(tsets)) 

2404 add_related( 

2405 wxr, 

2406 data, 

2407 tags, 

2408 [parts[1]], 

2409 text, 

2410 True, 

2411 is_reconstruction, 

2412 head_group, 

2413 ruby, 

2414 ) 

2415 new_prev_tags1.append(tags) 

2416 prev_tags = new_prev_tags1 

2417 following_tags = None 

2418 continue 

2419 

2420 # Handle the special case of descriptors that are parenthesized, 

2421 # e.g., (archaic or Scotland) 

2422 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc) 

2423 if m is not None and classify_desc(m.group(1)) == "tags": 2423 ↛ 2424line 2423 didn't jump to line 2424 because the condition on line 2423 was never true

2424 tagpart = m.group(1) 

2425 related = [m.group(2)] 

2426 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True) 

2427 if topics: 

2428 wxr.wtp.debug( 

2429 "parenthized head part {!r} contains topics: {}".format( 

2430 tagpart, topics 

2431 ), 

2432 sortid="form_descriptions/1647", 

2433 ) 

2434 elif m is not None and re.match(r"in the sense ", m.group(1)): 2434 ↛ 2437line 2434 didn't jump to line 2437 because the condition on line 2434 was never true

2435 # Handle certain ignored cases 

2436 # e.g. bord/Danish: in the sense "plank" 

2437 related = [m.group(2)] 

2438 tagsets = [()] 

2439 else: 

2440 # Normal parsing of the descriptor 

2441 alt_related = None 

2442 alt_tagsets = None 

2443 tagsets = None 

2444 for i in range(len(parts), 0, -1): 

2445 related = parts[i:] 

2446 tagparts = parts[:i] 

2447 # print(" i={} related={} tagparts={}" 

2448 # .format(i, related, tagparts)) 

2449 tagsets, topics = decode_tags( 

2450 " ".join(tagparts), no_unknown_starts=True 

2451 ) 

2452 # print("tagparts={!r} tagsets={} topics={} related={} " 

2453 # "alt_related={} distw={:.2f}" 

2454 # .format(tagparts, tagsets, topics, related, 

2455 # alt_related, 

2456 # distw(titleparts, parts[i - 1]))) 

2457 if ( 

2458 topics 

2459 or not tagsets 

2460 or any("error-unknown-tag" in x for x in tagsets) 

2461 ): 

2462 if alt_related is not None: 2462 ↛ 2464line 2462 didn't jump to line 2464 because the condition on line 2462 was never true

2463 # We already had a good division, so let's stop. 

2464 break 

2465 # Bad division, try deeper 

2466 continue 

2467 # print(f"{parts[i-1]=}, {parts=}") 

2468 if ( 

2469 i > 1 

2470 and len(parts[i - 1]) >= 4 

2471 and ( 

2472 distw(titleparts, parts[i - 1]) <= 0.4 

2473 or ( 

2474 wxr.wtp.section == "English" 

2475 and wxr.wtp.title 

2476 in WORDS_WITH_FALSE_POSITIVE_TAGS 

2477 and parts[i - 1] 

2478 in WORDS_WITH_FALSE_POSITIVE_TAGS[wxr.wtp.title] 

2479 ) 

2480 ) 

2481 # Fixes 'unaccountability' wiktext #1196 

2482 and not ( 

2483 wxr.wtp.section == "English" 

2484 and wxr.wtp.title in WORDS_WITH_FALSE_POSITIVE_FORMS 

2485 and parts[i - 1] 

2486 in WORDS_WITH_FALSE_POSITIVE_FORMS[wxr.wtp.title] 

2487 ) 

2488 # Fixes wiktextract #983, where "participle" 

2489 # was too close to "Martinize" and so this accepted 

2490 # ["participle", "Martinize"] as matching; this 

2491 # kludge prevents this from happening if titleparts 

2492 # is shorter than what would be 'related'. 

2493 # This breaks if we want to detect stuff that 

2494 # actually gets an extra space-separated word when 

2495 # 'inflected'. 

2496 and ( 

2497 len(titleparts) >= len(parts[i - 1 :]) 

2498 or "or" in parts[i - 1 :] 

2499 ) 

2500 ): 

2501 # print(f"Reached; {parts=}, {parts[i-1]=}") 

2502 alt_related = related 

2503 alt_tagsets = tagsets 

2504 continue 

2505 alt_related = None 

2506 alt_tagsets = None 

2507 break 

2508 else: 

2509 if alt_related is None: 2509 ↛ 2541line 2509 didn't jump to line 2541 because the condition on line 2509 was always true

2510 # Check if the parenthesized part is likely a 

2511 # romanization 

2512 if ( 2512 ↛ 2520line 2512 didn't jump to line 2520 because the condition on line 2512 was never true

2513 (have_ruby or classify_desc(base) == "other") 

2514 and classify_desc(paren) == "romanization" 

2515 and not ( 

2516 "categories" in data 

2517 and desc in data["categories"] 

2518 ) 

2519 ): 

2520 for r in split_at_comma_semi( 

2521 paren, extra=[" or "], skipped=links 

2522 ): 

2523 add_romanization( 

2524 wxr, 

2525 data, 

2526 r, 

2527 text, 

2528 is_reconstruction, 

2529 head_group, 

2530 ruby, 

2531 ) 

2532 have_romanization = True 

2533 continue 

2534 tagsets = [("error-unrecognized-head-form",)] 

2535 wxr.wtp.debug( 

2536 "unrecognized head form: {}".format(desc), 

2537 sortid="form_descriptions/1698", 

2538 ) 

2539 continue 

2540 

2541 if alt_related is not None: 2541 ↛ 2542line 2541 didn't jump to line 2542 because the condition on line 2541 was never true

2542 related = alt_related 

2543 tagsets = alt_tagsets 

2544 

2545 # print("FORM END: tagsets={} related={}".format(tagsets, related)) 

2546 # print("==================") 

2547 

2548 if ( 2548 ↛ 2569line 2548 didn't jump to line 2569 because the condition on line 2548 was never true

2549 len(related) <= 0 

2550 and wxr.wtp.section == "English" 

2551 and tagsets is not None 

2552 and len(tagsets) > 0 

2553 and not any( 

2554 s.startswith("error-") for tagset in tagsets for s in tagset 

2555 ) 

2556 and any( 

2557 s in FORM_ASSOCIATED_TAG_WORDS 

2558 for tagset in tagsets 

2559 for s in tagset 

2560 ) 

2561 and ( 

2562 wxr.wtp.title not in FALSE_POSITIVE_MISSING_FORMS 

2563 and not any( 

2564 rel in FALSE_POSITIVE_MISSING_FORMS[wxr.wtp.title or ""] 

2565 for rel in related 

2566 ) 

2567 ) 

2568 ): 

2569 wxr.wtp.debug( 

2570 f"Form tags without form: {desc=}, {tagsets=}", 

2571 sortid="form_description/20250107", 

2572 ) 

2573 if not tagsets: 2573 ↛ 2574line 2573 didn't jump to line 2574 because the condition on line 2573 was never true

2574 continue 

2575 

2576 # print(f"{alts=}, {related=}") 

2577 

2578 assert isinstance(related, (list, tuple)) 

2579 related_str = " ".join(related) 

2580 if "or" in titleparts: 

2581 alts = [related_str] 

2582 else: 

2583 alts = split_at_comma_semi( 

2584 related_str, separators=[r"\bor\b"], skipped=links 

2585 ) 

2586 # print(f"{related_str=}, {alts=}") 

2587 if not alts: 

2588 alts = [""] 

2589 for related_str in alts: 

2590 if related_str: 

2591 if prev_tags and ( 

2592 all( 

2593 all( 

2594 t in ["nonstandard", "dialectal"] 

2595 or valid_tags[t] == "dialect" 

2596 for t in tags 

2597 ) 

2598 for ts in tagsets 

2599 ) 

2600 or ( 

2601 any("participle" in ts for ts in prev_tags) 

2602 and all( 

2603 "attributive" in ts 

2604 or any(valid_tags[t] == "gender" for t in ts) 

2605 for ts in tagsets 

2606 ) 

2607 ) 

2608 ): 

2609 # Merged with previous tags. Don't update previous 

2610 # tags here; cf. burn/English/Verb 

2611 for tags_l in tagsets: 

2612 for ts in prev_tags: 

2613 tags_l1 = sorted(set(tags_l) | set(ts)) 

2614 add_related( 

2615 wxr, 

2616 data, 

2617 tags_l1, 

2618 [related_str], 

2619 text, 

2620 True, 

2621 is_reconstruction, 

2622 head_group, 

2623 ruby, 

2624 ) 

2625 else: 

2626 # Not merged with previous tags 

2627 for tags_l in tagsets: 

2628 if following_tags is not None: 2628 ↛ 2629line 2628 didn't jump to line 2629 because the condition on line 2628 was never true

2629 for ts in following_tags: 

2630 tags_l1 = list( 

2631 sorted(set(tags_l) | set(ts)) 

2632 ) 

2633 add_related( 

2634 wxr, 

2635 data, 

2636 tags_l1, 

2637 [related_str], 

2638 text, 

2639 True, 

2640 is_reconstruction, 

2641 head_group, 

2642 ruby, 

2643 ) 

2644 else: 

2645 ret = add_related( 

2646 wxr, 

2647 data, 

2648 tags_l, 

2649 [related_str], 

2650 text, 

2651 True, 

2652 is_reconstruction, 

2653 head_group, 

2654 ruby, 

2655 ) 

2656 if ret is not None: 2656 ↛ 2657line 2656 didn't jump to line 2657 because the condition on line 2656 was never true

2657 following_tags = ret 

2658 prev_tags = tagsets 

2659 else: 

2660 if desc_i < len(new_desc) - 1 and all( 2660 ↛ 2667line 2660 didn't jump to line 2667 because the condition on line 2660 was never true

2661 "participle" in ts or "infinitive" in ts 

2662 for ts in tagsets 

2663 ): 

2664 # Interpret it as a standalone form description 

2665 # in the middle, probably followed by forms or 

2666 # language-specific descriptors. cf. drikke/Danish 

2667 new_prev_tags2 = [] 

2668 for ts1 in prev_tags or [()]: 

2669 for ts2 in tagsets: 

2670 ts = tuple(sorted(set(ts1) | set(ts2))) 

2671 new_prev_tags2.append(ts) 

2672 prev_tags = new_prev_tags2 

2673 continue 

2674 for tags in tagsets: 

2675 data_extend(data, "tags", tags) 

2676 prev_tags = tagsets 

2677 following_tags = None 

2678 

2679 # Finally, if we collected hiragana/katakana, add them now

2680 if hiragana: 2680 ↛ 2681line 2680 didn't jump to line 2681 because the condition on line 2680 was never true

2681 add_related( 

2682 wxr, 

2683 data, 

2684 ["hiragana"], 

2685 [hiragana], 

2686 text, 

2687 True, 

2688 is_reconstruction, 

2689 head_group, 

2690 ruby, 

2691 ) 

2692 if katakana: 2692 ↛ 2693line 2692 didn't jump to line 2693 because the condition on line 2692 was never true

2693 add_related( 

2694 wxr, 

2695 data, 

2696 ["katakana"], 

2697 [katakana], 

2698 text, 

2699 True, 

2700 is_reconstruction, 

2701 head_group, 

2702 ruby, 

2703 ) 

2704 

2705 # XXX check if this is actually relevant, tags in word root data 

2706 # is extremely rare (not sure where they slip through). 

2707 tags = data.get("tags", []) # type:ignore 

2708 if len(tags) > 0: 

2709 # wxr.wtp.debug( 

2710 # f"Tags appear in word root data: {data['tags']=}", # type:ignore 

2711 # sortid="form_descriptions/2620/20240606", 

2712 # ) # Messes up tests. 

2713 data["tags"] = sorted(set(tags)) # type:ignore 

2714 

2715 
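A minimal calling sketch for parse_word_head above, assuming a WiktextractContext ``wxr`` already positioned on the page and language section being parsed (its setup is outside this module); the head text and keys below are illustrative:

    data = {}  # WordData dict, filled in place
    parse_word_head(
        wxr,                        # assumed to exist
        "noun",                     # part of speech
        "perro m (plural perros)",  # rendered head line
        data,
        False,                      # is_reconstruction
        None,                       # head_group
    )
    # Afterwards data may carry "forms", "tags" and "topics" extracted
    # from the head, depending on what the line contained.
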

2716def parse_sense_qualifier( 

2717 wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData] 

2718) -> None: 

2719 """Parses tags or topics for a sense or some other data. The values are 

2720 added into the dictionary ``data``.""" 

2721 assert isinstance(wxr, WiktextractContext) 

2722 assert isinstance(text, str) 

2723 assert isinstance(data, dict) 

2724 # print("parse_sense_qualifier:", text) 

2725 if re.match(r"\([^()]+\)$", text): 2725 ↛ 2726line 2725 didn't jump to line 2726 because the condition on line 2725 was never true

2726 text = text[1:-1] 

2727 if re.match(r'"[^"]+"$', text): 2727 ↛ 2728line 2727 didn't jump to line 2728 because the condition on line 2727 was never true

2728 text = text[1:-1] 

2729 lst = map_with(xlat_descs_map, [text]) 

2730 sense_tags: list[str] = [] 

2731 for text in lst: 

2732 for semi in split_at_comma_semi(text): 

2733 if not semi: 2733 ↛ 2734line 2733 didn't jump to line 2734 because the condition on line 2733 was never true

2734 continue 

2735 orig_semi = semi 

2736 idx = semi.find(":") 

2737 if idx >= 0: 2737 ↛ 2738line 2737 didn't jump to line 2738 because the condition on line 2737 was never true

2738 semi = semi[:idx] 

2739 cls = classify_desc(semi, allow_unknown_tags=True) 

2740 # print("parse_sense_qualifier: classify_desc: {} -> {}" 

2741 # .format(semi, cls)) 

2742 if cls == "tags": 

2743 tagsets, topics = decode_tags(semi) 

2744 data_extend(data, "topics", topics) 

2745 # XXX should think how to handle distinct options better, 

2746 # e.g., "singular and plural genitive"; that can't really be 

2747 # done without changing the calling convention of this function.

2748 # Should split sense if more than one category of tags differs. 

2749 for tags in tagsets: 

2750 sense_tags.extend(tags) 

2751 elif cls == "taxonomic": 2751 ↛ 2752line 2751 didn't jump to line 2752 because the condition on line 2751 was never true

2752 if re.match(r"×[A-Z]", semi): 

2753 sense_tags.append("extinct") 

2754 semi = semi[1:] 

2755 data["taxonomic"] = semi 

2756 elif cls == "english": 

2757 if "qualifier" in data and data["qualifier"] != orig_semi: 2757 ↛ 2758line 2757 didn't jump to line 2758 because the condition on line 2757 was never true

2758 data["qualifier"] += "; " + orig_semi 

2759 else: 

2760 data["qualifier"] = orig_semi 

2761 else: 

2762 wxr.wtp.debug( 

2763 "unrecognized sense qualifier: {}".format(text), 

2764 sortid="form_descriptions/1831", 

2765 ) 

2766 sense_tags = sorted(set(sense_tags)) 

2767 data_extend(data, "tags", sense_tags) 

2768 

2769 
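parse_sense_qualifier above takes a free-form qualifier string and folds it into a sense or linkage dict; a hedged calling sketch with an invented qualifier (``wxr`` assumed as before):

    sense = {}  # SenseData dict being built
    parse_sense_qualifier(wxr, "colloquial, Canada", sense)
    # Depending on how classify_desc() judges each piece, the text ends up in
    # sense["tags"]/sense["topics"], sense["taxonomic"], or a free-text
    # sense["qualifier"] string.
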

2770def parse_pronunciation_tags( 

2771 wxr: WiktextractContext, text: str, data: SoundData 

2772) -> None: 

2773 assert isinstance(wxr, WiktextractContext) 

2774 assert isinstance(text, str) 

2775 assert isinstance(data, dict) 

2776 text = text.strip() 

2777 if not text: 2777 ↛ 2778line 2777 didn't jump to line 2778 because the condition on line 2777 was never true

2778 return 

2779 cls = classify_desc(text) 

2780 notes = [] 

2781 if cls == "tags": 

2782 tagsets, topics = decode_tags(text) 

2783 data_extend(data, "topics", topics) 

2784 for tagset in tagsets: 

2785 for t in tagset: 

2786 if " " in t: 2786 ↛ 2787line 2786 didn't jump to line 2787 because the condition on line 2786 was never true

2787 notes.append(t) 

2788 else: 

2789 data_append(data, "tags", t) 

2790 else: 

2791 notes.append(text) 

2792 if notes: 

2793 data["note"] = "; ".join(notes) 

2794 

2795 
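parse_pronunciation_tags above follows the same pattern for pronunciation qualifiers; a small sketch with an invented input:

    sound = {}  # SoundData dict for one pronunciation entry
    parse_pronunciation_tags(wxr, "UK, dated", sound)
    # Recognized tag text extends sound["tags"]/sound["topics"]; anything
    # classify_desc() does not treat as tags is preserved in sound["note"].
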

2796def parse_translation_desc( 

2797 wxr: WiktextractContext, lang: str, text: str, tr: TranslationData 

2798) -> None: 

2799 assert isinstance(wxr, WiktextractContext) 

2800 assert isinstance(lang, str) # The language of ``text`` 

2801 assert isinstance(text, str) 

2802 assert isinstance(tr, dict) 

2803 # print("parse_translation_desc:", text) 

2804 

2805 # Process all parenthesized parts from the translation item 

2806 note = None 

2807 restore_beginning = "" 

2808 restore_end = "" 

2809 while True: 

2810 beginning = False 

2811 # See if we can find a parenthesized expression at the end 

2812 m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text) 

2813 if m: 

2814 par = m.group(1) 

2815 text = text[: m.start()] 

2816 if par.startswith(("literally ", "lit.")): 

2817 continue # Not useful for disambiguation in many idioms 

2818 else: 

2819 # See if we can find a parenthesized expression at the start 

2820 m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text) 

2821 if m: 

2822 par = m.group(1) 

2823 text = text[m.end() :] 

2824 beginning = True 

2825 if re.match(r"^(\d|\s|,| or | and )+$", par): 2825 ↛ 2830line 2825 didn't jump to line 2830 because the condition on line 2825 was never true

2826 # Looks like this beginning parenthesized expression only 

2827 # contains digits or their combinations. We assume such 

2828 # to be sense descriptions if no sense has been selected, 

2829 # or otherwise just ignore them. 

2830 if not tr.get("sense"): 

2831 tr["sense"] = par 

2832 continue 

2833 else: 

2834 # See if we can find a parenthesized expression in the middle. 

2835 # Romanizations are sometimes between word and gender marker, 

2836 # e.g. wife/English/Tr/Yiddish. 

2837 m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text) 

2838 if m: 

2839 par = m.group(1) 

2840 text = text[: m.start()] + text[m.end() :] 

2841 else: 

2842 # No more parenthesized expressions - break out of the loop 

2843 break 

2844 

2845 # Some cleanup of artifacts that may result from skipping some templates 

2846 # in earlier stages 

2847 if par.startswith(": "): 2847 ↛ 2848line 2847 didn't jump to line 2848 because the condition on line 2847 was never true

2848 par = par[2:] 

2849 if par.endswith(","): 2849 ↛ 2850line 2849 didn't jump to line 2850 because the condition on line 2849 was never true

2850 par = par[:-1] 

2851 if re.match(r'^[“"]([^“”"]*)[“”"]$', par): 2851 ↛ 2852line 2851 didn't jump to line 2852 because the condition on line 2851 was never true

2852 par = par[1:-1] 

2853 par = par.strip() 

2854 

2855 # Check for special script pronunciation followed by romanization, 

2856 # used in many Asian languages. 

2857 lst = par.split(", ") 

2858 if len(lst) == 2: 

2859 a, r = lst 

2860 if classify_desc(a) == "other": 

2861 cls = classify_desc(r) 

2862 # print("parse_translation_desc: r={} cls={}".format(r, cls)) 

2863 if cls == "romanization" or ( 

2864 cls == "english" and len(r.split()) == 1 and r[0].islower() 

2865 ): 

2866 if tr.get("alt") and tr.get("alt") != a: 2866 ↛ 2867line 2866 didn't jump to line 2867 because the condition on line 2866 was never true

2867 wxr.wtp.debug( 

2868 'more than one value in "alt": {} vs. {}'.format( 

2869 tr["alt"], a 

2870 ), 

2871 sortid="form_descriptions/1930", 

2872 ) 

2873 tr["alt"] = a 

2874 if tr.get("roman") and tr.get("roman") != r: 2874 ↛ 2875line 2874 didn't jump to line 2875 because the condition on line 2874 was never true

2875 wxr.wtp.debug( 

2876 'more than one value in "roman": ' 

2877 "{} vs. {}".format(tr["roman"], r), 

2878 sortid="form_descriptions/1936", 

2879 ) 

2880 tr["roman"] = r 

2881 continue 

2882 

2883 # Check for certain comma-separated tags combined with English text 

2884 # at the beginning or end of a comma-separated parenthesized list 

2885 while len(lst) > 1: 

2886 cls = classify_desc(lst[0]) 

2887 if cls == "tags": 2887 ↛ 2888line 2887 didn't jump to line 2888 because the condition on line 2887 was never true

2888 tagsets, topics = decode_tags(lst[0]) 

2889 for t in tagsets: 

2890 data_extend(tr, "tags", t) 

2891 data_extend(tr, "topics", topics) 

2892 lst = lst[1:] 

2893 continue 

2894 cls = classify_desc(lst[-1]) 

2895 if cls == "tags": 

2896 tagsets, topics = decode_tags(lst[-1]) 

2897 for t in tagsets: 

2898 data_extend(tr, "tags", t) 

2899 data_extend(tr, "topics", topics) 

2900 lst = lst[:-1] 

2901 continue 

2902 break 

2903 par = ", ".join(lst) 

2904 

2905 if not par: 2905 ↛ 2906line 2905 didn't jump to line 2906 because the condition on line 2905 was never true

2906 continue 

2907 if re.search(tr_ignored_parens_re, par): 2907 ↛ 2908line 2907 didn't jump to line 2908 because the condition on line 2907 was never true

2908 continue 

2909 if par.startswith("numeral:"): 

2910 par = par[8:].strip() 

2911 

2912 # Classify the part in parenthesis and process accordingly 

2913 cls = classify_desc(par) 

2914 # print("parse_translation_desc classify: {!r} -> {}" 

2915 # .format(par, cls)) 

2916 if par == text: 

2917 pass 

2918 if par == "f": 2918 ↛ 2919line 2918 didn't jump to line 2919 because the condition on line 2918 was never true

2919 data_append(tr, "tags", "feminine") 

2920 elif par == "m": 2920 ↛ 2921line 2920 didn't jump to line 2921 because the condition on line 2920 was never true

2921 data_append(tr, "tags", "masculine") 

2922 elif cls == "tags": 

2923 tagsets, topics = decode_tags(par) 

2924 for tags in tagsets: 

2925 data_extend(tr, "tags", tags) 

2926 data_extend(tr, "topics", topics) 

2927 elif cls == "english": 

2928 # If the text contains any of certain grammatical words, treat it 

2929 # as a "note" instead of "english" 

2930 if re.search(tr_note_re, par): 

2931 if par.endswith(":"): 2931 ↛ 2932line 2931 didn't jump to line 2932 because the condition on line 2931 was never true

2932 par = par[:-1] 

2933 if par not in ("see entry for forms",): 2933 ↛ 2809line 2933 didn't jump to line 2809 because the condition on line 2933 was always true

2934 if note: 2934 ↛ 2935line 2934 didn't jump to line 2935 because the condition on line 2934 was never true

2935 note = note + ";" + par 

2936 else: 

2937 note = par 

2938 else: 

2939 # There can be more than one parenthesized english item, see 

2940 # e.g. Aunt/English/Translations/Tamil 

2941 if "translation" in tr and "english" in tr: 

2942 tr["english"] += "; " + par # DEPRECATED for "translation" 

2943 tr["translation"] += "; " + par 

2944 else: 

2945 tr["english"] = par # DEPRECATED for "translation" 

2946 tr["translation"] = par 

2947 elif cls == "romanization": 

2948 # print("roman text={!r} text cls={}" 

2949 # .format(text, classify_desc(text))) 

2950 if classify_desc(text) in ( 

2951 "english", 

2952 "romanization", 

2953 ) and lang not in ("Egyptian",): 

2954 if beginning: 

2955 restore_beginning += "({}) ".format(par) 

2956 else: 

2957 restore_end = " ({})".format(par) + restore_end 

2958 else: 

2959 if tr.get("roman"): 2959 ↛ 2960line 2959 didn't jump to line 2960 because the condition on line 2959 was never true

2960 wxr.wtp.debug( 

2961 'more than one value in "roman": {} vs. {}'.format( 

2962 tr["roman"], par 

2963 ), 

2964 sortid="form_descriptions/2013", 

2965 ) 

2966 tr["roman"] = par 

2967 elif cls == "taxonomic": 2967 ↛ 2968line 2967 didn't jump to line 2968 because the condition on line 2967 was never true

2968 if tr.get("taxonomic"): 

2969 wxr.wtp.debug( 

2970 'more than one value in "taxonomic": {} vs. {}'.format( 

2971 tr["taxonomic"], par 

2972 ), 

2973 sortid="form_descriptions/2019", 

2974 ) 

2975 if re.match(r"×[A-Z]", par): 

2976 data_append(tr, "tags", "extinct") 

2977 par = par[1:] 

2978 tr["taxonomic"] = par 

2979 elif cls == "other": 2979 ↛ 2989line 2979 didn't jump to line 2989 because the condition on line 2979 was always true

2980 if tr.get("alt"): 2980 ↛ 2981line 2980 didn't jump to line 2981 because the condition on line 2980 was never true

2981 wxr.wtp.debug( 

2982 'more than one value in "alt": {} vs. {}'.format( 

2983 tr["alt"], par 

2984 ), 

2985 sortid="form_descriptions/2028", 

2986 ) 

2987 tr["alt"] = par 

2988 else: 

2989 wxr.wtp.debug( 

2990 "parse_translation_desc unimplemented cls {}: {}".format( 

2991 cls, par 

2992 ), 

2993 sortid="form_descriptions/2033", 

2994 ) 

2995 

2996 # Check for gender indications in suffix 

2997 text, final_tags = parse_head_final_tags(wxr, lang, text) 

2998 data_extend(tr, "tags", final_tags) 

2999 

3000 # Restore those parts that we did not want to remove (they are often 

3001 # optional words or words that are always used with the given translation) 

3002 text = restore_beginning + text + restore_end 

3003 

3004 if note: 

3005 tr["note"] = note.strip() 

3006 if text and text not in ignored_translations: 

3007 tr["word"] = text.strip() 

3008 

3009 # Sometimes gender seems to be at the end of "roman" field, see e.g. 

3010 # fire/English/Noun/Translations/Egyptian (for "oxidation reaction") 

3011 roman = tr.get("roman") 

3012 if roman: 

3013 if roman.endswith(" f"): 3013 ↛ 3014line 3013 didn't jump to line 3014 because the condition on line 3013 was never true

3014 data_append(tr, "tags", "feminine") 

3015 tr["roman"] = roman[:-2].strip() 

3016 elif roman.endswith(" m"): 3016 ↛ 3017line 3016 didn't jump to line 3017 because the condition on line 3016 was never true

3017 data_append(tr, "tags", "masculine") 

3018 tr["roman"] = roman[:-2].strip() 

3019 

3020 # If the word now has "translation" field but no "roman" field, and 

3021 # the word would be classified "other" (generally non-latin 

3022 # characters), and the value in "translation" is only one lowercase 

3023 # word, move it to "roman". This happens semi-frequently when the 

3024 # translation is transliterated the same as some English word. 

3025 roman = tr.get("roman") 

3026 english = tr.get("translation") 

3027 if english and not roman and "word" in tr: 

3028 cls = classify_desc(tr["word"]) 

3029 if cls == "other" and " " not in english and english[0].islower(): 

3030 del tr["translation"] 

3031 if "english" in tr: # DEPRECATED for "translation" 3031 ↛ 3033line 3031 didn't jump to line 3033 because the condition on line 3031 was always true

3032 del tr["english"] 

3033 tr["roman"] = english 

3034 

3035 # If the entry now has both tr["roman"] and tr["word"] and they have 

3036 # the same value, delete tr["roman"] (e.g., man/English/Translations 

3037 # Evenki) 

3038 if tr.get("word") and tr.get("roman") == tr.get("word"): 3038 ↛ 3039line 3038 didn't jump to line 3039 because the condition on line 3038 was never true

3039 del tr["roman"] 

3040 

3041 
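A sketch of how parse_translation_desc above is typically driven; the language and item text are invented, ``wxr`` assumed as before:

    tr = {}  # TranslationData dict for a single translation item
    parse_translation_desc(wxr, "Japanese", "犬 (inu) (archaic)", tr)
    # Parenthesized parts are peeled off one at a time: tag-like parts extend
    # tr["tags"]/tr["topics"], romanizations go to tr["roman"], English glosses
    # to tr["translation"] (and the deprecated tr["english"]), grammatical notes
    # to tr["note"], and the remaining text becomes tr["word"].
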

3042def parse_alt_or_inflection_of( 

3043 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str] 

3044) -> Optional[tuple[list[str], Optional[list[AltOf]]]]: 

3045 """Tries to parse an inflection-of or alt-of description. If successful, 

3046 this returns (tags, alt-of/inflection-of-dict). If the description cannot 

3047 be parsed, this returns None. This may also return (tags, None) when the 

3048 gloss describes a form (or some other tags were extracted from it), but 

3049 there was no alt-of/form-of/synonym-of word.""" 

3050 # print("parse_alt_or_inflection_of: {!r}".format(gloss)) 

3051 # Occasionally inflection_of/alt_of have "A(n) " etc. at the beginning. 

3052 

3053 # Never interpret a gloss that is equal to the word itself as a tag 

3054 # (e.g., instrumental/Romanian, instrumental/Spanish). 

3055 if gloss.lower() == wxr.wtp.title.lower() or ( # type:ignore[union-attr] 

3056 len(gloss) >= 5 and distw([gloss.lower()], wxr.wtp.title.lower()) < 0.2 # type:ignore[union-attr] 

3057 ): 

3058 return None 

3059 

3060 # First try parsing it as-is 

3061 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args) 

3062 if parsed is not None: 

3063 return parsed 

3064 

3065 # Next try parsing it with the first character converted to lowercase if 

3066 # it was previously uppercase. 

3067 if gloss and gloss[0].isupper(): 

3068 gloss = gloss[0].lower() + gloss[1:] 

3069 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args) 

3070 if parsed is not None: 

3071 return parsed 

3072 

3073 return None 

3074 

3075 
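Per its docstring, parse_alt_or_inflection_of has three possible result shapes; a hypothetical successful parse might look like:

    result = parse_alt_or_inflection_of(wxr, "genitive singular of talo", set())
    # Possible outcomes (values illustrative):
    #   None                          - gloss not recognized as form-of/alt-of
    #   (tags, None)                  - tags extracted but no base word found
    #   (tags, [{"word": "talo"}])    - tags plus AltOf dicts; an "extra" key is
    #                                   added when clarifying text follows the word
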

3076# These tags are not allowed in alt-or-inflection-of parsing 

3077alt_infl_disallowed: set[str] = set( 

3078 [ 

3079 "error-unknown-tag", 

3080 "place", # Not in inflected forms and causes problems e.g. house/English 

3081 ] 

3082) 

3083 

3084 

3085def parse_alt_or_inflection_of1( 

3086 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str] 

3087) -> Optional[tuple[list[str], Optional[list[AltOf]]]]: 

3088 """Helper function for parse_alt_or_inflection_of. This handles a single 

3089 capitalization.""" 

3090 if not gloss or not gloss.strip(): 3090 ↛ 3091line 3090 didn't jump to line 3091 because the condition on line 3090 was never true

3091 return None 

3092 

3093 # Prevent some common errors where we would parse something we shouldn't 

3094 if re.search(r"(?i)form of address ", gloss): 3094 ↛ 3095line 3094 didn't jump to line 3095 because the condition on line 3094 was never true

3095 return None 

3096 

3097 gloss = re.sub(r"only used in [^,]+, ", "", gloss) 

3098 

3099 # First try all formats ending with "of" (or other known last words that 

3100 # can end a form description) 

3101 matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss)) 

3102 m: Optional[re.Match] 

3103 for m in reversed(matches): 

3104 desc = gloss[: m.end()].strip() 

3105 base = gloss[m.end() :].strip() 

3106 tagsets, topics = decode_tags(desc, no_unknown_starts=True) 

3107 if not topics and any( 

3108 not (alt_infl_disallowed & set(ts)) for ts in tagsets 

3109 ): 

3110 # Successfully parsed, including "of" etc. 

3111 tags: list[str] = [] 

3112 # If you have ("Western-Armenian", ..., "form-of") as your 

3113 # tag set, it's most probable that it's something like 

3114 # "Western Armenian form of խոսել (xosel)", which should 

3115 # get "alt-of" instead of "form-of" (inflection). 

3116 # խօսիլ/Armenian 

3117 for ts_t in tagsets: 

3118 if "form-of" in ts_t and any( 

3119 valid_tags.get(tk) == "dialect" for tk in ts_t 

3120 ): 

3121 ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"} 

3122 else: 

3123 ts_s = set(ts_t) 

3124 if not (alt_infl_disallowed & ts_s): 3124 ↛ 3117line 3124 didn't jump to line 3117 because the condition on line 3124 was always true

3125 tags.extend(ts_s) 

3126 if ( 

3127 "alt-of" in tags 

3128 or "form-of" in tags 

3129 or "synonym-of" in tags 

3130 or "compound-of" in tags 

3131 ): 

3132 break 

3133 if m.group(1) == "of": 

3134 # Try parsing without the final "of". This is commonly used in 

3135 # various form-of expressions. 

3136 desc = gloss[: m.start()] 

3137 base = gloss[m.end() :] 

3138 tagsets, topics = decode_tags(desc, no_unknown_starts=True) 

3139 # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}" 

3140 # .format(desc, base, tagsets, topics)) 

3141 if not topics and any( 

3142 not (alt_infl_disallowed & set(t)) for t in tagsets 

3143 ): 

3144 tags = [] 

3145 for t in tagsets: 

3146 if not (alt_infl_disallowed & set(t)): 3146 ↛ 3145line 3146 didn't jump to line 3145 because the condition on line 3146 was always true

3147 tags.extend(t) 

3148 # It must have at least one tag from form_of_tags 

3149 if set(tags) & form_of_tags: 

3150 # Accept this as form-of 

3151 tags.append("form-of") 

3152 break 

3153 if set(tags) & alt_of_tags: 

3154 # Accept this as alt-of 

3155 tags.append("alt-of") 

3156 break 

3157 

3158 else: 

3159 # Did not find a form description based on last word; see if the 

3160 # whole description is tags 

3161 tagsets, topics = decode_tags(gloss, no_unknown_starts=True) 

3162 if not topics and any( 

3163 not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts) 

3164 for ts in tagsets 

3165 ): 

3166 tags = [] 

3167 for ts in tagsets: 

3168 if not (alt_infl_disallowed & set(ts)) and form_of_tags & set( 3168 ↛ 3167line 3168 didn't jump to line 3167 because the condition on line 3168 was always true

3169 ts 

3170 ): 

3171 tags.extend(ts) 

3172 base = "" 

3173 else: 

3174 return None 

3175 

3176 # kludge for Spanish (again): 'x of [word] combined with [clitic]' 

3177 m = re.search(r"combined with \w+$", base) 

3178 if m: 3178 ↛ 3179line 3178 didn't jump to line 3179 because the condition on line 3178 was never true

3179 tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True) 

3180 if not topics: 

3181 for ts in tagsets: 

3182 tags.extend(ts) 

3183 base = base[: m.start()] 

3184 

3185 # It is fairly common for form_of glosses to end with something like 

3186 # "ablative case" or "in instructive case". Parse that ending. 

3187 base = base.strip() 

3188 lst = base.split() 

3189 # print("parse_alt_or_inflection_of: lst={}".format(lst)) 

3190 if len(lst) >= 3 and lst[-1] in ("case", "case."): 3190 ↛ 3191line 3190 didn't jump to line 3191 because the condition on line 3190 was never true

3191 node = valid_sequences.children.get(lst[-2]) 

3192 if node and node.end: 

3193 for s in node.tags: 

3194 tags.extend(s.split(" ")) 

3195 lst = lst[:-2] 

3196 if lst[-1] == "in" and len(lst) > 1: 

3197 lst = lst[:-1] 

3198 

3199 # Eliminate empty and duplicate tags 

3200 tags = sorted(set(t for t in tags if t)) 

3201 

3202 # Clean up some extra stuff from the linked word, separating the text 

3203 # into ``base`` (the linked word) and ``extra`` (additional information, 

3204 # such as English translation or clarifying word sense information). 

3205 orig_base = base 

3206 base = re.sub(alt_of_form_of_clean_re, "", orig_base) 

3207 base = re.sub(r" [(⟨][^()]*[)⟩]", "", base) # Remove all (...) groups 

3208 extra = orig_base[len(base) :] 

3209 extra = re.sub(r"^[- :;.,,—]+", "", extra) 

3210 if extra.endswith(".") and extra.count(".") == 1: 

3211 extra = extra[:-1].strip() 

3212 m = re.match(r"^\(([^()]*)\)$", extra) 

3213 if m: 3213 ↛ 3214line 3213 didn't jump to line 3214 because the condition on line 3213 was never true

3214 extra = m.group(1) 

3215 else: 

3216 # These weird brackets are used in "slash mark"

3217 m = re.match(r"^⟨([^()]*)⟩$", extra) 

3218 if m: 3218 ↛ 3219line 3218 didn't jump to line 3219 because the condition on line 3218 was never true

3219 extra = m.group(1) 

3220 m = re.match(r'^[“"]([^"“”]*)["”]$', extra) 

3221 if m: 3221 ↛ 3222line 3221 didn't jump to line 3222 because the condition on line 3221 was never true

3222 extra = m.group(1) 

3223 # Note: base might still contain comma-separated values and values 

3224 # separated by "and" 

3225 base = base.strip() 

3226 if base.endswith(",") and len(base) > 2: 3226 ↛ 3227line 3226 didn't jump to line 3227 because the condition on line 3226 was never true

3227 base = base[:-1].strip() 

3228 while ( 

3229 base.endswith(".") 

3230 and not wxr.wtp.page_exists(base) 

3231 and base not in gloss_template_args 

3232 ): 

3233 base = base[:-1].strip() 

3234 if base.endswith('(\u201cconjecture")'): 3234 ↛ 3235 (line 3234 didn't jump to line 3235 because the condition on line 3234 was never true)

3235 base = base[:-14].strip() 

3236 tags.append("conjecture") 

3237 while ( 3237 ↛ 3242 (line 3237 didn't jump to line 3242 because the condition on line 3237 was never true)

3238 base.endswith(".") 

3239 and not wxr.wtp.page_exists(base) 

3240 and base not in gloss_template_args 

3241 ): 

3242 base = base[:-1].strip() 

3243 if ( 3243 ↛ 3248 (line 3243 didn't jump to line 3248 because the condition on line 3243 was never true)

3244 base.endswith(".") 

3245 and base not in gloss_template_args 

3246 and base[:-1] in gloss_template_args 

3247 ): 

3248 base = base[:-1] 

3249 base = base.strip() 

3250 if not base: 

3251 return tags, None 

3252 

3253 # Kludge: Spanish verb forms seem to have a dot added at the end. 

3254 # Remove it; we know of no Spanish verbs ending with a dot. 

3255 language = wxr.wtp.section 

3256 pos = wxr.wtp.subsection 

3257 # print("language={} pos={} base={}".format(language, pos, base)) 

3258 if ( 3258 ↛ 3264 (line 3258 didn't jump to line 3264 because the condition on line 3258 was never true)

3259 base.endswith(".") 

3260 and len(base) > 1 

3261 and base[-2].isalpha() 

3262 and (language == "Spanish" and pos == "Verb") 

3263 ): 

3264 base = base[:-1] 

3265 

3266 # Split base into alternatives when multiple alternatives are provided

3267 parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "]) 

3268 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "") 

3269 if ( 

3270 len(parts) <= 1 

3271 or base.startswith("/") 

3272 or base.endswith("/") 

3273 or "/" in titleword 

3274 ): 

3275 parts = [base] 

3276 # Split base into alternatives when it is of the form "a or b" and "a" and "b" are

3277 # similar (generally spelling variants of the same word or similar words) 

3278 if len(parts) == 1: 

3279 pp = base.split() 

3280 if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4: 

3281 parts = [pp[0], pp[2]] 
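# Example: for base == "color or colour" the two variants are close in
# edit distance (assuming distw() returns a small normalized distance
# for spelling variants), so parts becomes ["color", "colour"];
# "cat or dog" would stay as a single part.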

3282 

3283 # Create form-of/alt-of entries based on the extracted data 

3284 dt_lst: list[AltOf] = [] 

3285 for p in parts: 

3286 # Check for some suspicious base forms 

3287 m = re.search(r"[.,] |[{}()]", p) 

3288 if m and not wxr.wtp.page_exists(p): 3288 ↛ 3289 (line 3288 didn't jump to line 3289 because the condition on line 3288 was never true)

3289 wxr.wtp.debug( 

3290 "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p), 

3291 sortid="form_descriptions/2278", 

3292 ) 

3293 if p.startswith("*") and len(p) >= 3 and p[1].isalpha(): 3293 ↛ 3294 (line 3293 didn't jump to line 3294 because the condition on line 3293 was never true)

3294 p = p[1:] 

3295 dt: AltOf = {"word": p} 

3296 if extra: 

3297 dt["extra"] = extra 

3298 dt_lst.append(dt) 

3299 # print("alt_or_infl_of returning tags={} lst={} base={!r}" 

3300 # .format(tags, lst, base)) 

3301 return tags, dt_lst 
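# The "a or b" split above relies on distw(), defined earlier in this
# module. A minimal stand-in sketch, assuming distw() behaves roughly
# like a normalized Levenshtein distance (the real implementation may
# weight or tokenize its input differently):

import Levenshtein


def distw_sketch(titles: list[str], word: str) -> float:
    # Hypothetical stand-in: smallest normalized edit distance between
    # ``word`` and any of ``titles`` (0.0 identical, 1.0 unrelated).
    return min(
        Levenshtein.distance(t, word) / max(len(t), len(word), 1)
        for t in titles
    )


# distw_sketch(["color"], "colour")  # ~0.17, below the 0.4 cutoff
# distw_sketch(["cat"], "dog")       # 1.0, so no split would happen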

3302 

3303 

3304@functools.lru_cache(maxsize=65536) 

3305def classify_desc( 

3306 desc: str, 

3307 allow_unknown_tags=False, 

3308 no_unknown_starts=False, 

3309 accepted: Union[tuple[str, ...], frozenset[str]] = tuple(), 

3310) -> str: 

3311 """Determines whether the given description is most likely tags, english, 

3312 a romanization, or something else. Returns one of: "tags", "english", 

3313 "romanization", or "other". If ``allow_unknown_tags`` is True, then 

3314 allow "tags" classification even when the only tags are those starting 

3315 with a word in allowed_unknown_starts.""" 

3316 assert isinstance(desc, str) 

3317 # Empty and whitespace-only strings are treated as "other" 

3318 desc = desc.strip() 

3319 if not desc: 

3320 return "other" 

3321 

3322 normalized_desc = unicodedata.normalize("NFKD", desc) 

3323 

3324 # If it can be fully decoded as tags without errors, treat as tags 

3325 tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts) 

3326 for tagset in tagsets: 

3327 assert isinstance(tagset, (list, tuple, set)) 

3328 if "error-unknown-tag" not in tagset and ( 

3329 topics or allow_unknown_tags or any(" " not in x for x in tagset) 

3330 ): 

3331 return "tags" 

3332 

3333 # Check if it looks like the taxonomic name of a species 

3334 if desc in known_species: 

3335 return "taxonomic" 

3336 desc1 = re.sub(r"^×([A-Z])", r"\1", desc) 

3337 desc1 = re.sub(r"\s*×.*", "", desc1) 

3338 lst = desc1.split() 

3339 if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts: 

3340 have_non_english = 1 if lst[0].lower() not in english_words else 0 

3341 for x in lst[1:]: 

3342 if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"): 

3343 continue 

3344 if x[0].isupper(): 

3345 break 

3346 if x not in english_words: 

3347 have_non_english += 1 

3348 else: 

3349 # Starts with known taxonomic term, does not contain uppercase 

3350 # words (except allowed letters) and at least one word is not 

3351 # English 

3352 if have_non_english >= len(lst) - 1 and have_non_english > 0: 3352 ↛ 3358 (line 3352 didn't jump to line 3358 because the condition on line 3352 was always true)

3353 return "taxonomic" 

3354 

3355 # If all words are in our English dictionary, interpret as English. 

3356 # [ -~] is regex black magic, "ALL CHARACTERS from space to tilde" 

3357 # in ASCII. Took me a while to figure out. 

3358 if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1: 

3359 if desc in english_words and desc[0].isalpha(): 

3360 return "english" # Handles ones containing whitespace 

3361 desc1 = re.sub( 

3362 tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc 

3363 ) 

3364 tokens = tokenizer.tokenize(desc1) 

3365 if not tokens: 3365 ↛ 3366 (line 3365 didn't jump to line 3366 because the condition on line 3365 was never true)

3366 return "other" 

3367 lst_bool = list( 

3368 x not in not_english_words 

3369 and 

3370 # not x.isdigit() and 

3371 ( 

3372 x in english_words 

3373 or x.lower() in english_words 

3374 or x in known_firsts 

3375 or x[0].isdigit() 

3376 or x in accepted 

3377 or 

3378 # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or 

3379 ( 

3380 x.endswith("s") and len(x) >= 4 and x[:-1] in english_words 

3381 ) # Plural 

3382 or ( 

3383 x.endswith("ies") 

3384 and len(x) >= 5 

3385 and x[:-3] + "y" in english_words 

3386 ) # E.g. lily - lilies 

3387 or ( 

3388 x.endswith("ing") 

3389 and len(x) >= 5 

3390 and x[:-3] in english_words 

3391 ) # E.g. bring - bringing 

3392 or ( 

3393 x.endswith("ing") 

3394 and len(x) >= 5 

3395 and x[:-3] + "e" in english_words 

3396 ) # E.g., tone - toning 

3397 or ( 

3398 x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words 

3399 ) # E.g. hang - hanged 

3400 or ( 

3401 x.endswith("ed") 

3402 and len(x) >= 5 

3403 and x[:-2] + "e" in english_words 

3404 ) # E.g. atone - atoned 

3405 or (x.endswith("'s") and x[:-2] in english_words) 

3406 or (x.endswith("s'") and x[:-2] in english_words) 

3407 or ( 

3408 x.endswith("ise") 

3409 and len(x) >= 5 

3410 and x[:-3] + "ize" in english_words 

3411 ) 

3412 or ( 

3413 x.endswith("ised") 

3414 and len(x) >= 6 

3415 and x[:-4] + "ized" in english_words 

3416 ) 

3417 or ( 

3418 x.endswith("ising") 

3419 and len(x) >= 7 

3420 and x[:-5] + "izing" in english_words 

3421 ) 

3422 or ( 

3423 re.search(r"[-/]", x) 

3424 and all( 

3425 ((y in english_words and len(y) > 2) or not y) 

3426 for y in re.split(r"[-/]", x) 

3427 ) 

3428 ) 

3429 ) 

3430 for x in tokens 

3431 ) 

3432 cnt = lst_bool.count(True) 

3433 rejected_words = tuple( 

3434 x for i, x in enumerate(tokens) if not lst_bool[i] 

3435 ) 

3436 if ( 

3437 any( 

3438 lst_bool[i] and x[0].isalpha() and len(x) > 1 

3439 for i, x in enumerate(tokens) 

3440 ) 

3441 and not desc.startswith("-") 

3442 and not desc.endswith("-") 

3443 and re.search(r"\w+", desc) 

3444 and ( 

3445 cnt == len(lst_bool) 

3446 or ( 

3447 any( 

3448 lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens) 

3449 ) 

3450 and cnt >= len(lst_bool) - 1 

3451 ) 

3452 or cnt / len(lst_bool) >= 0.8 

3453 or ( 

3454 all(x in potentially_english_words for x in rejected_words) 

3455 and cnt / len(lst_bool) >= 0.50 

3456 ) 

3457 ) 

3458 ): 

3459 return "english" 

3460 # Some translations have apparent pronunciation descriptions in /.../ 

3461 # which we'll put in the romanization field (even though they probably are 

3462 # not exactly romanizations). 

3463 if desc.startswith("/") and desc.endswith("/"): 

3464 return "romanization" 

3465 # If all characters are in classes that could occur in romanizations, 

3466 # treat as romanization 

3467 classes = list( 

3468 unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK" 

3469 for x in normalized_desc 

3470 ) 

3471 classes1 = [] 

3472 num_latin = 0 

3473 num_greek = 0 

3474 # part = "" 

3475 # for ch, cl in zip(normalized_desc, classes): 

3476 # part += f"{ch}({cl})" 

3477 # print(part) 

3478 for ch, cl in zip(normalized_desc, classes): 

3479 if ch in ( 

3480 "'", # ' in Arabic, / in IPA-like parenthesized forms 

3481 ".", # e.g., "..." in translations 

3482 ";", 

3483 ":", 

3484 "!", 

3485 "‘", 

3486 "’", 

3487 '"', 

3488 "“", 

3489 "”", 

3490 "/", 

3491 "?", 

3492 "…", # alternative to "..." 

3493 "⁉", # 見る/Japanese automatic transcriptions... 

3494 "?", 

3495 "!", 

3496 "⁻", # superscript -, used in some Cantonese roman, e.g. "we" 

3497 "ʔ", 

3498 "ʼ", 

3499 "ʾ", 

3500 "ʹ", 

3501 ): # ʹ e.g. in understand/English/verb Russian transl 

3502 classes1.append("OK") 

3503 continue 

3504 if cl not in ("Ll", "Lu"): 

3505 classes1.append(cl) 

3506 continue 

3507 try: 

3508 name = unicodedata.name(ch) 

3509 first = name.split()[0] 

3510 if first == "LATIN": 

3511 num_latin += 1 

3512 elif first == "GREEK": 

3513 num_greek += 1 

3514 elif first == "COMBINING": # Combining diacritic 3514 ↛ 3515line 3514 didn't jump to line 3515 because the condition on line 3514 was never true

3515 cl = "OK" 

3516 elif re.match(non_latin_scripts_re, name): 3516 ↛ 3520 (line 3516 didn't jump to line 3520 because the condition on line 3516 was always true)

3517 cl = "NO" # Not acceptable in romanizations 

3518 except ValueError: 

3519 cl = "NO" # Not acceptable in romanizations 

3520 classes1.append(cl) 

3521 # print("classify_desc: {!r} classes1: {}".format(desc, classes1)) 

3522 # print(set(classes1) ) 

3523 if all( 

3524 x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK") 

3525 for x in classes1 

3526 ): 

3527 if ( 

3528 (num_latin >= num_greek + 2 or num_greek == 0) 

3529 and classes1.count("OK") < len(classes1) 

3530 and classes1.count("Nd") < len(classes1) 

3531 ): 

3532 return "romanization" 

3533 # Otherwise it is something else, such as a hanji version of the word

3534 return "other" 

3535 

3536 

3537def remove_text_in_parentheses(text: str) -> str: 

3538 parentheses = 0 

3539 new_text = "" 

3540 for c in text: 

3541 if c == "(": 

3542 parentheses += 1 

3543 elif c == ")": 

3544 parentheses -= 1 

3545 elif parentheses == 0: 

3546 new_text += c 

3547 return new_text
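# Usage example for the helper above: the counter tracks nesting, and
# parenthesized text (but not the surrounding spaces) is dropped:
#
#     remove_text_in_parentheses("to run (of a person) fast")
#     # -> "to run  fast"
#     remove_text_in_parentheses("word ((nested) group)")
#     # -> "word "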