Coverage for src / wiktextract / extractor / en / form_descriptions.py: 79%

1344 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-06 11:18 +0000

1# Code for parsing linguistic form descriptions and tags for word senses 

2# (both the word entry head - initial part and parenthesized parts - 

3# and tags at the beginning of word senses) 

4# 

5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

6 

7import functools 

8import re 

9import unicodedata 

10from typing import ( 

11 Any, 

12 Literal, 

13 Optional, 

14 Sequence, 

15 Union, 

16) 

17 

18import Levenshtein 

19from nltk import TweetTokenizer # type:ignore[import-untyped] 

20 

21from ...datautils import data_append, data_extend, split_at_comma_semi 

22from ...tags import ( 

23 alt_of_tags, 

24 form_of_tags, 

25 head_final_bantu_langs, 

26 head_final_bantu_map, 

27 head_final_numeric_langs, 

28 head_final_other_langs, 

29 head_final_other_map, 

30 head_final_semitic_langs, 

31 head_final_semitic_map, 

32 uppercase_tags, 

33 valid_tags, 

34 xlat_descs_map, 

35 xlat_head_map, 

36 xlat_tags_map, 

37) 

38from ...topics import topic_generalize_map, valid_topics 

39from ...wxr_context import WiktextractContext 

40from .english_words import ( 

41 english_words, 

42 not_english_words, 

43 potentially_english_words, 

44) 

45from .form_descriptions_known_firsts import known_firsts 

46from .taxondata import known_species 

47from .type_utils import ( 

48 AltOf, 

49 FormData, 

50 LinkageData, 

51 SenseData, 

52 SoundData, 

53 TranslationData, 

54 WordData, 

55) 

56 

57# Tokenizer for classify_desc() 

58tokenizer = TweetTokenizer() 

59 

60# These are ignored as the value of a related form in form head. 

61IGNORED_RELATED: set[str] = set( 

62 [ 

63 "-", 

64 "־", 

65 "᠆", 

66 "‐", 

67 "‑", 

68 "‒", 

69 "–", 

70 "—", 

71 "―", 

72 "−", 

73 "⸺", 

74 "⸻", 

75 "﹘", 

76 "﹣", 

77 "-", 

78 "?", 

79 "(none)", 

80 ] 

81) 

82 

83 

84# First words of unicodedata.name() that indicate scripts that cannot be 

85# accepted in romanizations or english (i.e., should be considered "other" 

86# in classify_desc()). 

87non_latin_scripts: list[str] = [ 

88 "ADLAM", 

89 "ARABIC", 

90 "ARABIC-INDIC", 

91 "ARMENIAN", 

92 "BALINESE", 

93 "BENGALI", 

94 "BRAHMI", 

95 "BRAILLE", 

96 "CANADIAN", 

97 "CHAKMA", 

98 "CHAM", 

99 "CHEROKEE", 

100 "CJK", 

101 "COPTIC", 

102 "COUNTING ROD", 

103 "CUNEIFORM", 

104 "CYRILLIC", 

105 "DOUBLE-STRUCK", 

106 "EGYPTIAN", 

107 "ETHIOPIC", 

108 "EXTENDED ARABIC-INDIC", 

109 "GEORGIAN", 

110 "GLAGOLITIC", 

111 "GOTHIC", 

112 "GREEK", 

113 "GUJARATI", 

114 "GURMUKHI", 

115 "HANGUL", 

116 "HANIFI ROHINGYA", 

117 "HEBREW", 

118 "HIRAGANA", 

119 "JAVANESE", 

120 "KANNADA", 

121 "KATAKANA", 

122 "KAYAH LI", 

123 "KHMER", 

124 "KHUDAWADI", 

125 "LAO", 

126 "LEPCHA", 

127 "LIMBU", 

128 "MALAYALAM", 

129 "MEETEI", 

130 "MYANMAR", 

131 "NEW TAI LUE", 

132 "NKO", 

133 "OL CHIKI", 

134 "OLD PERSIAN", 

135 "OLD SOUTH ARABIAN", 

136 "ORIYA", 

137 "OSMANYA", 

138 "PHOENICIAN", 

139 "SAURASHTRA", 

140 "SHARADA", 

141 "SINHALA", 

142 "SUNDANESE", 

143 "SYLOTI", 

144 "TAI THAM", 

145 "TAKRI", 

146 "TAMIL", 

147 "TELUGU", 

148 "THAANA", 

149 "THAI", 

150 "TIBETAN", 

151 "TIFINAGH", 

152 "TIRHUTA", 

153 "UGARITIC", 

154 "WARANG CITI", 

155 "YI", 

156] 

157non_latin_scripts_re = re.compile( 

158 r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b" 

159) 

160 

161# Sanity check xlat_head_map values 

162for k, v in xlat_head_map.items(): 

163 if v.startswith("?"): 

164 v = v[1:] 

165 for tag in v.split(): 

166 if tag not in valid_tags: 166 ↛ 167line 166 didn't jump to line 167 because the condition on line 166 was never true

167 print( 

168 "WARNING: xlat_head_map[{}] contains unrecognized tag {}".format( 

169 k, tag 

170 ) 

171 ) 

172 

173# Regexp for finding nested translations from translation items (these are 

174# used in, e.g., year/English/Translations/Arabic). This is actually used 

175# in page.py. 

176nested_translations_re = re.compile( 

177 r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format( 

178 "|".join( 

179 re.escape(x.removeprefix("?")) 

180 for x in sorted(xlat_head_map.values(), key=len, reverse=True) 

181 if x and not x.startswith("class-") 

182 ) 

183 ) 

184) 

185 

186# Regexp that matches head tag specifiers. Used to match tags from end of 

187# translations and linkages 

188head_final_re_text = r"( -)?( ({}))+".format( 

189 "|".join( 

190 re.escape(x) 

191 for x in 

192 # The sort is to put longer ones first, preferring them in 

193 # the regexp match 

194 sorted(xlat_head_map.keys(), key=len, reverse=True) 

195 ) 

196) 

197head_final_re = re.compile(head_final_re_text + r"$") 

198 

199# Regexp used to match head tag specifiers at end of a form for certain 

200# Bantu languages (particularly Swahili and similar languages). 

201head_final_bantu_re_text = r" ({})".format( 

202 "|".join(re.escape(x) for x in head_final_bantu_map.keys()) 

203) 

204head_final_bantu_re = re.compile(head_final_bantu_re_text + "$") 

205 

206# Regexp used to match head tag specifiers at end of a form for certain 

207# Semitic languages (particularly Arabic and similar languages). 

208head_final_semitic_re_text = r" ({})".format( 

209 "|".join(re.escape(x) for x in head_final_semitic_map.keys()) 

210) 

211head_final_semitic_re = re.compile(head_final_semitic_re_text + "$") 

212 

213# Regexp used to match head tag specifiers at end of a form for certain 

214# other languages (e.g., Lithuanian, Finnish, French). 

215head_final_other_re_text = r" ({})".format( 

216 "|".join(re.escape(x) for x in head_final_other_map.keys()) 

217) 

218head_final_other_re = re.compile(head_final_other_re_text + "$") 

219 

220# Regexp for splitting heads. See parse_word_head(). 

221head_split_re_text_part_1 = ( 

222 "(" 

223 + head_final_re_text 

224 + "|" 

225 + head_final_bantu_re_text 

226 + "|" 

227 + head_final_semitic_re_text 

228 + "|" 

229 + head_final_other_re_text 

230) 

231 

232head_split_re_text = head_split_re_text_part_1 + ")?( or |[,;]+| *$)" 

233 

234head_split_re_text_no_semicolon = head_split_re_text_part_1 + ")?( or |,+| *$)" 

235 

236head_split_re = re.compile(head_split_re_text) 

237head_split_no_semicolon_re = re.compile(head_split_re_text_no_semicolon) 

238 

239head_split_re_parens = 0 

240for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text): 

241 head_split_re_parens += m.group(0).count("(") 

242 

243# Parenthesized parts that are ignored in translations 

244tr_ignored_parens: set[str] = set( 

245 [ 

246 "please verify", 

247 "(please verify)", 

248 "transliteration needed", 

249 "(transliteration needed)", 

250 "in words with back vowel harmony", 

251 "(in words with back vowel harmony)", 

252 "in words with front vowel harmony", 

253 "(in words with front vowel harmony)", 

254 "see below", 

255 "see usage notes below", 

256 ] 

257) 

258tr_ignored_parens_re = re.compile( 

259 r"^(" 

260 + "|".join(re.escape(x) for x in tr_ignored_parens) 

261 + ")$" 

262 + r"|^(Can we clean up|Can we verify|for other meanings see " 

263 r"lit\. )" 

264) 

265 

266# Translations that are ignored 

267ignored_translations: set[str] = set( 

268 [ 

269 "[script needed]", 

270 "please add this translation if you can", 

271 ] 

272) 

273 

274# Put english text into the "note" field in a translation if it contains one 

275# of these words 

276tr_note_re = re.compile( 

277 r"(\b(article|definite|indefinite|superlative|comparative|pattern|" 

278 r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|" 

279 r"postposition|postp|action|actions|articles|" 

280 r"adverb|adverbs|noun|nouns|verb|verbs|before|" 

281 r"after|placed|prefix|suffix|used with|translated|" 

282 r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|" 

283 r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|" 

284 r"conjugation|declension|class|category|plural|singular|positive|" 

285 r"seldom used|formal|informal|familiar|unspoken|spoken|written|" 

286 r"indicative|progressive|conditional|potential|" 

287 r"accusative|adessive|inessive|superessive|elative|allative|" 

288 r"dialect|dialects|object|subject|predicate|movies|recommended|language|" 

289 r"locative|continuous|simple|continuousness|gerund|subjunctive|" 

290 r"periphrastically|no equivalent|not used|not always used|" 

291 r"used only with|not applicable|use the|signifying|wordplay|pronounced|" 

292 r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|" 

293 r"may be replaced|stricter sense|for nonhumans|" 

294 r"sense:|used:|in full:|informally used|followed by|" 

295 r"not restricted to|pertaining to|or optionally with|are optional|" 

296 r"in conjunction with|in compounds|depending on the relationship|" 

297 r"person addressed|one person|multiple persons|may be replaced with|" 

298 r"optionally completed with|in the phrase|in response to|" 

299 r"before a|before an|preceded by|verbs ending|very common|after a verb|" 

300 r"with verb|with uncountable|with the objects|with stative|" 

301 r"can be replaced by|often after|used before|used after|" 

302 r"used in|clipping of|spoken|somewhat|capitalized|" 

303 r"short form|shortening of|shortened form|initialism of|" 

304 r"said to|rare:|rarer also|is rarer|negatively connoted|" 

305 r"previously mentioned|uncountable noun|countable noun|" 

306 r"countable nouns|uncountable nouns|" 

307 r"with predicative|with -|with imperfect|with a negated|" 

308 r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|" 

309 r'"|' 

310 r"general term|after a vowel|before a vowel|" 

311 r"form|regular|irregular|alternative)" 

312 r")($|[) ])|^(" 

313 # Following are only matched at the beginning of the string 

314 r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:|e\.g\.,|cf\.|compare|such as|" 

315 r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|" 

316 r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|" 

317 r"mainly|from|for|also|also:|acronym|" 

318 r"\+|with) " 

319) 

320# \b does not work at the end??? 

321 

322# Related forms matching this regexp will be considered suspicious if the 

323# page title does not also match one of these. 

324suspicious_related_re = re.compile( 

325 r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)" 

326 r"|[][:=<>&#*|]" 

327 r"| \d+$" 

328) 

329 

330# Word forms (head forms, translations, etc) that will be considered ok and 

331# silently accepted even if they would otherwise trigger a suspicious 

332# form warning. 

333ok_suspicious_forms: set[str] = set( 

334 [ 

335 "but en or", # "golden goal"/English/Tr/French 

336 "cœur en or", # "heart of gold"/Eng/Tr/French 

337 "en or", # golden/Eng/Tr/French 

338 "men du", # jet/Etym2/Noun/Tr/Cornish 

339 "parachute en or", # "golden parachute"/Eng/Tr/French 

340 "vieil or", # "old gold"/Eng/Tr/French 

341 # "all that glitters is not gold"/Eng/Tr/French 

342 "tout ce qui brille n’est pas or", 

343 "μη αποκλειστικό or", # inclusive or/Eng/Tr/Greek 

344 "period or full stop", 

345 ] 

346) 

347 

348 

349# Replacements to be done in classify_desc before tokenizing. This is a 

350# workaround for shortcomings in TweetTokenizer. 

351tokenizer_fixup_map = { 

352 r"a.m.": "AM", 

353 r"p.m.": "PM", 

354} 

355tokenizer_fixup_re = re.compile( 

356 r"\b(" 

357 + "|".join( 

358 re.escape(x) 

359 for x in sorted( 

360 tokenizer_fixup_map.keys(), key=lambda x: len(x), reverse=True 

361 ) 

362 ) 

363 + r")" 

364) 

365 

366# Unknown tags starting with these words will be silently ignored. 

367ignored_unknown_starts: set[str] = set( 

368 [ 

369 "originally", 

370 "e.g.", 

371 "c.f.", 

372 "supplanted by", 

373 "supplied by", 

374 ] 

375) 

376 

377ignored_unknown_starts_re = re.compile( 

378 r"^(" 

379 + "|".join( 

380 re.escape(x) 

381 for x in sorted(ignored_unknown_starts, key=lambda x: -len(x)) 

382 ) 

383 + ") " 

384) 

385 

386# If an unknown sequence starts with one of these, it will continue as an 

387# unknown sequence until the end, unless it turns out to have a replacement. 

388allowed_unknown_starts: set[str] = set( 

389 [ 

390 "Relating", 

391 "accompanied", 

392 "added", 

393 "after", 

394 "answering", 

395 "as", 

396 "based", 

397 "before", 

398 "conjugated", 

399 "conjunction", 

400 "construed", 

401 "especially", 

402 "expression:", 

403 "figurative:", 

404 "followed", 

405 "for", 

406 "forms", 

407 "from", 

408 "governs", 

409 "in", 

410 "indicating", 

411 "modifying", 

412 "normally", 

413 "not", 

414 "of", 

415 "preceding", 

416 "prefixed", 

417 "referring", 

418 "relating", 

419 "revived", 

420 "said", 

421 "since", 

422 "takes", 

423 "used", 

424 "with", 

425 "With", 

426 "without", 

427 ] 

428) 

429# Allow the ignored unknown starts without complaining 

430allowed_unknown_starts.update(ignored_unknown_starts) 

431 

432# Full unknown tags that will be ignored in decode_tags() 

433# XXX this is unused, ask Tatu where the contents is now 

434ignored_unknown_tags: set[str] = set([]) 

435 

436# Head endings that are mapped to tags 

437head_end_map = { 

438 " 1st conj.": "conjugation-1", 

439 " 2nd conj.": "conjugation-2", 

440 " 3rd conj.": "conjugation-3", 

441 " 4th conj.": "conjugation-4", 

442 " 5th conj.": "conjugation-5", 

443 " 6th conj.": "conjugation-6", 

444 " 7th conj.": "conjugation-7", 

445} 

446head_end_re = re.compile( 

447 r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$" 

448) 

449 

450 

451# Dictionary of language-specific parenthesized head part starts that 

452# either introduce new tags or modify previous tags. The value for each 

453# language is a dictionary that maps the first word of the head part to 

454# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous 

455# tags or a space-separated string of tags to remove, and ``add_tags`` should 

456# be a string of tags to add. 

457lang_specific_head_map: dict[ 

458 str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]] 

459] = { 

460 "Danish": { 

461 # prefix: (rem_tags space separate string/True, add_tags s-sep str) 

462 "c": ("neuter", "common-gender"), 

463 "n": ("common-gender", "neuter"), 

464 "pl": ("singular neuter common-gender", "plural"), 

465 "sg": ("plural neuter common-gender", "singular"), 

466 }, 

467} 

468 

469 

470# Regular expression used to strip additional stuff from the end of alt_of and 

471# form_of. 

472alt_of_form_of_clean_re = re.compile( 

473 r"(?s)(" 

474 + "|".join( 

475 [ 

476 r":", 

477 r'[“"]', 

478 r";", 

479 r" \(", 

480 r" - ", 

481 r" ־ ", 

482 r" ᠆ ", 

483 r" ‐ ", 

484 r" ‑ ", 

485 r" ‒ ", 

486 r" – ", 

487 r" — ", 

488 r" ― ", 

489 r" − ", 

490 r" ⸺ ", 

491 r" ⸻ ", 

492 r" ﹘ ", 

493 r" ﹣ ", 

494 r" - ", 

495 r" \+ ", 

496 r" \(with ", 

497 r" with -ra/-re", 

498 r"\. Used ", 

499 r"\. Also ", 

500 r"\. Since ", 

501 r"\. A ", 

502 r"\.\. A ", 

503 r"\. An ", 

504 r"\.\. An ", 

505 r"\. an ", 

506 r"\. The ", 

507 r"\. Spanish ", 

508 r"\. Language ", 

509 r"\. former name of ", 

510 r"\. AIM", 

511 r"\. OT", 

512 r"\. Not ", 

513 r"\. Now ", 

514 r"\. Nowadays ", 

515 r"\. Early ", 

516 r"\. ASEAN", 

517 r"\. UN", 

518 r"\. IMF", 

519 r"\. WHO", 

520 r"\. WIPO", 

521 r"\. AC", 

522 r"\. DC", 

523 r"\. DNA", 

524 r"\. RNA", 

525 r"\. SOB", 

526 r"\. IMO", 

527 r"\. Behavior", 

528 r"\. Income ", 

529 r"\. More ", 

530 r"\. Most ", 

531 r"\. Only ", 

532 r"\. Also ", 

533 r"\. From ", 

534 r"\. Of ", 

535 r"\.\. Of ", 

536 r"\. To ", 

537 r"\. For ", 

538 r"\. If ", 

539 r"\. Praenominal ", 

540 r"\. This ", 

541 r"\. Replaced ", 

542 r"\. CHCS is the ", 

543 r"\. Equivalent ", 

544 r"\. Initialism ", 

545 r"\. Note ", 

546 r"\. Alternative ", 

547 r"\. Compare ", 

548 r"\. Cf\. ", 

549 r"\. Comparable ", 

550 r"\. Involves ", 

551 r"\. Sometimes ", 

552 r"\. Commonly ", 

553 r"\. Often ", 

554 r"\. Typically ", 

555 r"\. Possibly ", 

556 r"\. Although ", 

557 r"\. Rare ", 

558 r"\. Instead ", 

559 r"\. Integrated ", 

560 r"\. Distinguished ", 

561 r"\. Given ", 

562 r"\. Found ", 

563 r"\. Was ", 

564 r"\. In ", 

565 r"\. It ", 

566 r"\.\. It ", 

567 r"\. One ", 

568 r"\. Any ", 

569 r"\. They ", 

570 r"\. Members ", 

571 r"\. Each ", 

572 r"\. Original ", 

573 r"\. Especially ", 

574 r"\. Usually ", 

575 r"\. Known ", 

576 r"\.\. Known ", 

577 r"\. See ", 

578 r"\. see ", 

579 r"\. target was not ", 

580 r"\. Popular ", 

581 r"\. Pedantic ", 

582 r"\. Positive ", 

583 r"\. Society ", 

584 r"\. Plan ", 

585 r"\. Environmentally ", 

586 r"\. Affording ", 

587 r"\. Encompasses ", 

588 r"\. Expresses ", 

589 r"\. Indicates ", 

590 r"\. Text ", 

591 r"\. Large ", 

592 r"\. Sub-sorting ", 

593 r"\. Sax", 

594 r"\. First-person ", 

595 r"\. Second-person ", 

596 r"\. Third-person ", 

597 r"\. 1st ", 

598 r"\. 2nd ", 

599 r"\. 3rd ", 

600 r"\. Term ", 

601 r"\. Northeastern ", 

602 r"\. Northwestern ", 

603 r"\. Southeast ", 

604 r"\. Egyptian ", 

605 r"\. English ", 

606 r"\. Cape Province was split into ", 

607 r"\. Pañcat", 

608 r"\. of the ", 

609 r"\. is ", 

610 r"\. after ", 

611 r"\. or ", 

612 r"\. chromed", 

613 r"\. percussion", 

614 r"\. with his ", 

615 r"\. a\.k\.a\. ", 

616 r"\. comparative form ", 

617 r"\. singular ", 

618 r"\. plural ", 

619 r"\. present ", 

620 r"\. his ", 

621 r"\. her ", 

622 r"\. equivalent ", 

623 r"\. measuring ", 

624 r"\. used in ", 

625 r"\. cutely ", 

626 r"\. Protects", 

627 r'\. "', 

628 r"\.^", 

629 r"\. \+ ", 

630 r"\., ", 

631 r". — ", 

632 r", a ", 

633 r", an ", 

634 r", the ", 

635 r", obsolete ", 

636 r", possessed", # 'd/English 

637 r", imitating", # 1/English 

638 r", derived from", 

639 r", called ", 

640 r", especially ", 

641 r", slang for ", 

642 r", used to", # c/o /English 

643 r", commonly", # b/w /English 

644 r" corresponding to ", 

645 r" equivalent to ", 

646 r" popularized by ", 

647 r" denoting ", 

648 r" in its various senses\.", 

649 r" used by ", 

650 r" but not for ", 

651 r" since ", 

652 r" i\.e\. ", 

653 r" i\. e\. ", 

654 r" e\.g\. ", 

655 r" eg\. ", 

656 r" etc\. ", 

657 r"\[http", 

658 r" — used as ", 

659 r" by K\. Forsyth ", 

660 r" by J\. R\. Allen ", 

661 r" by S\. Ferguson ", 

662 r" by G\. Donaldson ", 

663 r" May refer to ", 

664 r" An area or region ", 

665 ] 

666 ) 

667 + r").*$" 

668) 

669 

670 

class ValidNode:
    """A node in the ``valid_sequences`` search tree.

    Sequences built from the keys of key->tags maps (``xlat_tags_map``
    etc.) are stored as chains of nodes; a node is reached through its
    key (its "word") in the parent's ``children`` dict (or the root
    dict).  ``end`` marks the terminus of a complete sequence — the
    chain may still continue past it when a longer sequence shares the
    same prefix, e.g. "nominative" and "nominative plural".  ``tags``
    and ``topics`` hold the tag/topic strings that apply when a
    sequence terminates at this node (i.e. when ``end`` is True).
    """

    __slots__ = ("end", "tags", "topics", "children")

    def __init__(
        self,
        end=False,
        tags: Optional[list[str]] = None,
        topics: Optional[list[str]] = None,
        children: Optional[dict[str, "ValidNode"]] = None,
    ) -> None:
        # Falsy arguments (None or empty containers) are replaced by
        # fresh containers so instances never share mutable defaults.
        self.end = end
        self.tags: list[str] = tags if tags else []
        self.topics: list[str] = topics if topics else []
        self.children: dict[str, "ValidNode"] = children if children else {}

700 

701 

def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None:
    """Insert the space-separated key sequence ``desc`` into ``tree``.

    Helper for building trees of valid tags/sequences during module
    initialization.  ``v`` is the space-separated tag/topic string that
    the sequence maps to, or None for a bare sequence with no mapping.
    """
    assert isinstance(tree, ValidNode)
    assert isinstance(desc, str)
    assert v is None or isinstance(v, str)

    # Descend the tree word by word, creating missing children on the way.
    node = tree
    for word in desc.split(" "):
        child = node.children.get(word)
        if child is None:
            child = ValidNode()
            node.children[word] = child
        node = child
    # This node now terminates a complete sequence.
    node.end = True

    if not v:
        return None  # Terminate early because there are no tags

    # Partition the mapped values into tags and topics; anything that is
    # neither gets a warning printed.
    tagslist: list[str] = []
    topicslist: list[str] = []
    for vv in v.split():
        if vv in valid_tags:
            tagslist.append(vv)
        elif vv in valid_topics:
            topicslist.append(vv)
        else:
            print(
                "WARNING: tag/topic {!r} maps to unknown {!r}".format(desc, vv)
            )
    # Each whole mapping is stored as a single space-joined element, so a
    # node can carry several alternative tag groups.
    topics = " ".join(topicslist)
    tags = " ".join(tagslist)
    if topics:
        node.topics.append(topics)
    if tags:
        node.tags.append(tags)

742 

743 

def add_to_valid_tree1(
    tree: ValidNode,
    k: str,
    v: Union[list[str], tuple[str, ...], str],
    valid_values: Union[set[str], dict[str, Any]],
) -> list[str]:
    """Add key ``k`` with each value alternative in ``v`` to ``tree``.

    Returns the individual whitespace-separated tags contained in all of
    the added values (used by callers for recursive expansion).

    ``valid_values`` is currently unused here (validation happens inside
    ``add_to_valid_tree``); it is kept for interface compatibility.
    """
    assert isinstance(tree, ValidNode)
    assert isinstance(k, str)
    assert v is None or isinstance(v, (list, tuple, str))
    assert isinstance(valid_values, (set, dict))
    # BUG FIX: previously these calls ignored the ``tree`` parameter and
    # always operated on the module-global ``valid_sequences``.  All
    # existing callers pass ``valid_sequences``, so behavior is unchanged
    # in practice, but the function now honors its own parameter.
    if not v:
        add_to_valid_tree(tree, k, None)
        return []
    elif isinstance(v, str):
        v = [v]
    q: list[str] = []
    for vv in v:
        assert isinstance(vv, str)
        add_to_valid_tree(tree, k, vv)
        # Collect each individual tag for the caller.
        q.extend(vv.split())
    return q

768 

769 

def add_to_valid_tree_mapping(
    tree: ValidNode,
    mapping: Union[dict[str, Union[list[str], str]], dict[str, str]],
    valid_values: Union[set[str], dict[str, Any]],
    recurse: bool,
) -> None:
    """Add every key of ``mapping`` to ``tree``.

    When ``recurse`` is true, values that are themselves keys of
    ``mapping`` are followed transitively (guarded by a visited set to
    stop cycles), so a key also acquires the tags that its mapped values
    ultimately expand to.
    """
    assert isinstance(tree, ValidNode)
    assert isinstance(mapping, dict)
    assert isinstance(valid_values, (set, dict))
    assert recurse in (True, False)
    for key, value in mapping.items():
        assert isinstance(key, str)
        assert isinstance(value, (list, str))
        # Normalize a bare string value to a one-element list.
        alternatives = [value] if isinstance(value, str) else value
        pending = add_to_valid_tree1(tree, key, alternatives, valid_values)
        if recurse:
            seen: set[str] = set()
            while pending:
                candidate = pending.pop()
                if candidate in seen:
                    continue
                seen.add(candidate)
                if candidate not in mapping:
                    continue
                # Expand the candidate's own mapping under the same key.
                pending.extend(
                    add_to_valid_tree1(
                        tree, key, mapping[candidate], valid_values
                    )
                )

799 

800 

801# Tree of sequences considered to be tags (includes sequences that are 

802# mapped to something that becomes one or more valid tags) 

803valid_sequences = ValidNode() 

804sequences_with_slashes: set[str] = set() 

805for tag in valid_tags: 

806 # The basic tags used in our tag system; some are a bit weird, but easier 

807 # to implement this with 'false' positives than filter out stuff no one else 

808 # uses. 

809 if "/" in tag: 

810 sequences_with_slashes.add(tag) 

811 add_to_valid_tree(valid_sequences, tag, tag) 

812for tag in uppercase_tags: 

813 hyphenated = re.sub(r"\s+", "-", tag) 

814 if "/" in tag: 

815 sequences_with_slashes.add(tag) 

816 add_to_valid_tree(valid_sequences, tag, hyphenated) 

817 

818# xlat_tags_map! 

819add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False) 

820for k in xlat_tags_map: 

821 if "/" in k: 

822 sequences_with_slashes.add(k) 

823# Add topics to the same table, with all generalized topics also added 

824for topic in valid_topics: 

825 assert " " not in topic 

826 if "/" in topic: 826 ↛ 827line 826 didn't jump to line 827 because the condition on line 826 was never true

827 sequences_with_slashes.add(topic) 

828 add_to_valid_tree(valid_sequences, topic, topic) 

829# Let each original topic value stand alone. These are not generally on 

830# valid_topics. We add the original topics with spaces replaced by hyphens. 

831for topic in topic_generalize_map.keys(): 

832 hyphenated = re.sub(r"\s+", "-", topic) 

833 if "/" in topic: 833 ↛ 834line 833 didn't jump to line 834 because the condition on line 833 was never true

834 sequences_with_slashes.add(topic) 

835 add_to_valid_tree(valid_sequences, topic, hyphenated) 

836# Add canonicalized/generalized topic values 

837add_to_valid_tree_mapping( 

838 valid_sequences, topic_generalize_map, valid_topics, True 

839) 

840 

841# Regex used to divide a decode candidate into parts that shouldn't 

842# have their slashes turned into spaces 

843slashes_re = re.compile( 

844 r"(" + "|".join((re.escape(s) for s in sequences_with_slashes)) + r")" 

845) 

846 

847# Regexp used to find "words" from word heads and linguistic descriptions 

848word_pattern = ( 

849 r"[^ ,;()\u200e]+|" 

850 r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|" 

851 r"[\u2800-\u28ff]|" # Braille characters 

852 r"\(([^()]|\([^()]*\))*\)" 

853) 

854 

855word_re_global = re.compile(word_pattern) 

856 

857 

def distw(titleparts: Sequence[str], word: str) -> float:
    """Return how distinct ``word`` is from the closest word in
    ``titleparts``: 1 when completely distinct, 0 when identical to
    some part, otherwise something in between.

    NOTE(review): raises ValueError on an empty ``titleparts`` (empty
    min()) — presumably callers guarantee it is non-empty; confirm.
    """
    assert isinstance(titleparts, (list, tuple))
    assert isinstance(word, str)
    # Normalized Levenshtein distance against each candidate; keep the
    # smallest (i.e. most similar) value.
    return min(
        Levenshtein.distance(word, part) / max(len(part), len(word))
        for part in titleparts
    )

869 

870 

871def map_with( 

872 ht: dict[str, str | list[str]] | dict[str, str], 

873 lst: Sequence[str], 

874) -> list[str]: 

875 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or 

876 more alternatives each, and returns a combined list of alternatives.""" 

877 assert isinstance(ht, dict) 

878 assert isinstance(lst, (list, tuple)) 

879 ret = [] 

880 for x in lst: 

881 assert isinstance(x, str) 

882 x = x.strip() 

883 x = ht.get(x, x) 

884 if isinstance(x, str): 884 ↛ 887line 884 didn't jump to line 887 because the condition on line 884 was always true

885 if x: 885 ↛ 880line 885 didn't jump to line 880 because the condition on line 885 was always true

886 ret.append(x) 

887 elif isinstance(x, (list, tuple)): 

888 ret.extend(x) 

889 else: 

890 raise RuntimeError("map_with unexpected value: {!r}".format(x)) 

891 return ret 

892 

893 

894TagList = list[str] 

895PosPathStep = tuple[int, TagList, TagList] 

896 

897 

def check_unknown(
    from_i: int,
    to_i: int,
    i: int,
    wordlst: Sequence[str],
    allow_any: bool,
    no_unknown_starts: bool,
) -> list[PosPathStep]:
    """Classify the span ``wordlst[from_i:to_i]``, already presupposed to
    be UNKNOWN, and return the path steps describing what *kind* of
    unknown it is (silently ignored, an error, or a tolerated free tag).
    """
    assert isinstance(to_i, int)
    assert isinstance(from_i, int)
    assert isinstance(i, int)
    # print("check_unknown to_i={} from_i={} i={}".format(to_i, from_i, i))
    if from_i >= to_i:
        return []  # empty span: nothing to classify
    span = wordlst[from_i:to_i]
    joined = " ".join(span)
    assert joined
    if re.match(ignored_unknown_starts_re, joined):
        # Tags with this start are to be ignored (marked UNKNOWN, no tag).
        return [(from_i, ["UNKNOWN"], [])]
    if joined in ignored_unknown_tags:
        return []  # One of the tags listed as to be ignored
    if joined in ("and", "or"):
        return []  # bare connectives are dropped
    # The span is tolerated (kept verbatim as a tag) when anything is
    # allowed, or it is explicitly marked with "~", or it is a multi-word
    # sequence whose first word may start an unknown run.
    tolerated = allow_any or span[0].startswith("~")
    continuable = (
        not no_unknown_starts
        and span[0] in allowed_unknown_starts
        and len(span) > 1
    )
    if tolerated or continuable:
        return [(from_i, ["UNKNOWN"], [joined])]
    return [
        (from_i, ["UNKNOWN"], ["error-unknown-tag"])
    ]  # Add ``joined`` here to include

944 

945 

def add_new1(
    node: ValidNode,
    i: int,
    start_i: int,
    last_i: int,
    new_paths: list[list[PosPathStep]],
    new_nodes: list[tuple[ValidNode, int, int]],
    pos_paths: list[list[list[PosPathStep]]],
    wordlst: list[str],
    allow_any: bool,
    no_unknown_starts: bool,
    max_last_i: int,
) -> int:
    """Record ``node`` as a live search state and, when it terminates a
    valid sequence, extend the candidate paths ending at word ``i``.

    Mutates ``new_nodes`` and ``new_paths`` in place and returns the
    (possibly updated) ``max_last_i``.
    """
    assert isinstance(new_paths, list)
    # print("add_new: start_i={} last_i={}".format(start_i, last_i))
    if last_i > max_last_i:
        max_last_i = last_i  # last_i has grown
    state = (node, start_i, last_i)
    if state not in new_nodes:
        new_nodes.append(state)
    if node.end:
        # Terminal point in the search tree: classify the preceding
        # uncovered gap, then prepend this step (plus the gap's steps)
        # to every previously known path ending at last_i.
        gap_steps = check_unknown(
            last_i, start_i, i, wordlst, allow_any, no_unknown_starts
        )
        head = [(last_i, node.tags, node.topics)] + gap_steps
        for prior in pos_paths[last_i]:
            new_paths.append(head + prior)
        max_last_i = i + 1
    return max_last_i

985 

986 

@functools.lru_cache(maxsize=65536)
def decode_tags(
    src: str,
    allow_any=False,
    no_unknown_starts=False,
) -> tuple[list[tuple[str, ...]], list[str]]:
    """Decode tags from ``src``, retrying with a massaged source string
    when the first pass produced error tags.

    On retry, slashes that are not part of known slash-containing keys
    are turned into spaces; failing that, " and "/" or " connectives are
    stripped.  The retry result replaces the original only when it does
    not contain more errors.
    """
    tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts)
    # print(f"decode_tags: {src=}, {tagsets=}")

    first_pass_failed = any(
        s.startswith("error-") for tagset in tagsets for s in tagset
    ) or any(s.startswith("error-") for s in topics)

    if first_pass_failed:
        retry_tagsets: list[tuple[str, ...]] = []
        retry_topics: list[str] = []

        if "/" in src:
            # slashes_re captures valid keys that themselves contain
            # slashes ("masculine/feminine"-style); re.split puts those
            # captured keys at odd indices, so only even-indexed
            # fragments get their slashes replaced with spaces.
            fragments = re.split(slashes_re, src)
            if len(fragments) > 1:
                retry_src = "".join(
                    frag if idx % 2 else frag.replace("/", " ")
                    for idx, frag in enumerate(fragments)
                )
            else:
                retry_src = src
            retry_tagsets, retry_topics = decode_tags1(
                retry_src, allow_any, no_unknown_starts
            )
        elif " or " in src or " and " in src:
            # Annoying kludge: drop the connectives and reparse.
            retry_src = src.replace(" and ", " ").replace(" or ", " ")
            retry_tagsets, retry_topics = decode_tags1(
                retry_src, allow_any, no_unknown_starts
            )
        # print(f"{retry_tagsets=}")

        if retry_tagsets or retry_topics:
            # Prefer the retry only if it did not add errors.  Note: the
            # original counts use the "error" prefix (no hyphen).
            def _error_count(tsets, tops) -> int:
                n = sum(
                    1 for tset in tsets for s in tset if s.startswith("error")
                )
                return n + sum(1 for s in tops if s.startswith("error"))

            if _error_count(retry_tagsets, retry_topics) <= _error_count(
                tagsets, topics
            ):
                return retry_tagsets, retry_topics

    return tagsets, topics

1050 

1051 

def decode_tags1(
    src: str,
    allow_any=False,
    no_unknown_starts=False,
) -> tuple[list[tuple[str, ...]], list[str]]:
    """Decodes tags, doing some canonicalizations. This returns a list of
    lists of tags and a list of topics.

    Implementation: a best-path search.  ``src`` is split into words; each
    word advances a set of cursors through the ``valid_sequences`` trie of
    known (possibly multi-word) tag phrases.  ``pos_paths[i]`` accumulates
    candidate step-paths covering the first i words; stretches that match
    nothing become "UNKNOWN" steps, which carry a heavy weight penalty.
    At the end the lowest-weight path is converted into tagsets/topics.
    """
    assert isinstance(src, str)

    # print("decode_tags: src={!r}".format(src))

    # pos_paths[i] holds candidate paths (lists of PosPathStep) covering
    # the first i words; index 0 is seeded with a single empty path.
    pos_paths: list[list[list[PosPathStep]]] = [[[]]]
    wordlst: list[str] = []
    max_last_i = 0  # pre-initialized here so that it can be used as a ref

    add_new = functools.partial(
        add_new1,  # pre-set parameters and references for function
        pos_paths=pos_paths,
        wordlst=wordlst,
        allow_any=allow_any,
        no_unknown_starts=no_unknown_starts,
        max_last_i=max_last_i,
    )
    # First split the tags at commas and semicolons. Their significance is that
    # a multi-word sequence cannot continue across them.
    parts = split_at_comma_semi(src, extra=[";", ":"])

    # NOTE(review): if every part is blank (so the inner loop body never
    # runs), cur_nodes is never bound and the "if cur_nodes:" below would
    # raise NameError — presumably split_at_comma_semi never yields such
    # input for non-empty src; confirm.
    for part in parts:
        max_last_i = len(wordlst)  # "how far have we gone?"
        lst1 = part.split()
        if not lst1:
            continue
        wordlst.extend(lst1)
        cur_nodes: list[tuple[ValidNode, int, int]] = []  # Currently seen
        for w in lst1:
            i = len(pos_paths) - 1
            new_nodes: list[tuple[ValidNode, int, int]] = []
            # replacement nodes for next loop
            new_paths: list[list[PosPathStep]] = []
            # print("ITER i={} w={} max_last_i={} wordlst={}"
            #       .format(i, w, max_last_i, wordlst))
            node: ValidNode
            start_i: int
            last_i: int
            for node, start_i, last_i in cur_nodes:
                # ValidNodes are part of a search tree that checks if a
                # phrase is found in xlat_tags_map and other text->tags dicts.
                if w in node.children:
                    # the phrase continues down the tree
                    # print("INC", w)
                    max_last_i = add_new(
                        node.children[w],
                        i,
                        start_i,
                        last_i,
                        new_paths,
                        new_nodes,
                    )
                if node.end:
                    # we've hit an end point, the tags and topics have already
                    # been gathered at some point, don't do anything with the
                    # old stuff
                    if w in valid_sequences.children:
                        # This starts a *new* possible section
                        max_last_i = add_new(
                            valid_sequences.children[w],  # root->
                            i,
                            i,
                            i,
                            new_paths,
                            new_nodes,
                        )
                if w not in node.children and not node.end:
                    # print("w not in node and $: i={} last_i={} wordlst={}"
                    #       .format(i, last_i, wordlst))
                    # If i == last_i == 0, for example (beginning)
                    if (
                        i == last_i
                        or no_unknown_starts
                        or wordlst[last_i] not in allowed_unknown_starts
                    ):
                        # print("NEW", w)
                        if w in valid_sequences.children:
                            # Start new sequences here
                            max_last_i = add_new(
                                valid_sequences.children[w],
                                i,
                                i,
                                last_i,
                                new_paths,
                                new_nodes,
                            )
            if not new_nodes:
                # This is run at the start when i == max_last_i == 0,
                # which is what populates the first node in new_nodes.
                # Some initial words cause the rest to be interpreted as unknown
                # print("not new nodes: i={} last_i={} wordlst={}"
                #       .format(i, max_last_i, wordlst))
                if (
                    i == max_last_i
                    or no_unknown_starts
                    or wordlst[max_last_i] not in allowed_unknown_starts
                ):
                    # print("RECOVER w={} i={} max_last_i={} wordlst={}"
                    #       .format(w, i, max_last_i, wordlst))
                    if w in valid_sequences.children:
                        max_last_i = add_new(
                            # new sequence from root
                            valid_sequences.children[w],
                            i,
                            i,
                            max_last_i,
                            new_paths,
                            new_nodes,
                        )
            cur_nodes = new_nodes  # Completely replace nodes!
            # 2023-08-18, fix to improve performance
            # Decode tags does a big search of the best-shortest matching
            # sequences of tags, but the original algorithm didn't have
            # any culling happen during operation, so in a case with
            # a lot of tags (for example, big blocks of text inserted
            # somewhere by mistake that is processed by decode_tags),
            # it would lead to exponential growth of new_paths contents.
            # This culling, using the same weighting algorithm code as
            # in the original is just applied to new_paths before it is
            # added to pos_paths. Basically it's "take the 10 best paths".
            # This *can* cause bugs if it gets stuck in a local minimum
            # or something, but this whole process is one-dimensional
            # and not that complex, so hopefully it works out...
            pw = []
            path: list[PosPathStep]
            for path in new_paths:
                weight = len(path)
                if any(x[1] == ["UNKNOWN"] for x in path):
                    weight += 100  # Penalize unknown paths
                pw.append((weight, path))
            new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]]
            pos_paths.append(new_paths)

    # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}"
    #       .format(max_last_i, len(wordlst), len(pos_paths)))

    # Finalize: any cursors still alive at the end either terminate a
    # valid phrase (node.end) or leave a trailing unknown stretch.
    if cur_nodes:
        # print("END HAVE_NODES")
        for node, start_i, last_i in cur_nodes:
            if node.end:
                # print("$ END start_i={} last_i={}"
                #       .format(start_i, last_i))
                for path in pos_paths[start_i]:
                    pos_paths[-1].append(
                        [(last_i, node.tags, node.topics)] + path
                    )
            else:
                # print("UNK END start_i={} last_i={} wordlst={}"
                #       .format(start_i, last_i, wordlst))
                u = check_unknown(
                    last_i,
                    len(wordlst),
                    len(wordlst),
                    wordlst,
                    allow_any,
                    no_unknown_starts,
                )
                if pos_paths[start_i]:
                    for path in pos_paths[start_i]:
                        pos_paths[-1].append(u + path)
                else:
                    pos_paths[-1].append(u)
    else:
        # Check for a final unknown tag
        # print("NO END NODES max_last_i={}".format(max_last_i))
        paths = pos_paths[max_last_i] or [[]]
        u = check_unknown(
            max_last_i,
            len(wordlst),
            len(wordlst),
            wordlst,
            allow_any,
            no_unknown_starts,
        )
        if u:
            # print("end max_last_i={}".format(max_last_i))
            for path in list(paths):  # Copy in case it is the last pos
                pos_paths[-1].append(u + path)

    # import json
    # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True))

    if not pos_paths[-1]:
        # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src))
        return [], []

    # Find the best path
    pw = []
    for path in pos_paths[-1]:
        weight = len(path)
        if any(x[1] == ["UNKNOWN"] for x in path):
            weight += 100  # Penalize unknown paths
        pw.append((weight, path))
    path = min(pw)[1]

    # Convert the best path to tagsets and topics
    tagsets: list[list[str]] = [[]]
    topics: list[str] = []
    for i, tagspec, topicspec in path:
        if len(tagsets or "") > 16:
            # ctx.error("Too many tagsets! This is probably exponential",
            #           sortid="form_descriptions/20230818")
            return [("error-unknown-tag", "error-exponential-tagsets")], []
        if tagspec == ["UNKNOWN"]:
            # For an UNKNOWN step the topic slot appears to carry the raw
            # unparsed words, which are appended to every tagset as-is —
            # presumably populated by check_unknown; confirm there.
            new_tagsets = []
            for x in tagsets:
                new_tagsets.append(x + topicspec)
            tagsets = new_tagsets
            continue
        if tagspec:
            # Each alternative t in tagspec forks every existing tagset;
            # individual tags within t are space-separated and deduplicated.
            new_tagsets = []
            for x in tagsets:
                for t in tagspec:
                    if t:
                        new_tags = list(x)
                        for tag in t.split():
                            if tag not in new_tags:
                                new_tags.append(tag)
                        new_tagsets.append(new_tags)
                    else:
                        new_tagsets.append(x)
            tagsets = new_tagsets
        if topicspec:
            for t in topicspec:
                for topic in t.split():
                    if topic not in topics:
                        topics.append(topic)

    # print("unsorted tagsets:", tagsets)
    ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets))
    # topics = list(sorted(set(topics))) XXX tests expect not sorted
    # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics))
    # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST
    # of tags. Turning topics into a tuple breaks tests, turning the tuples
    # inside tagsets into lists breaks tests, I'm leaving them mismatched
    # for now. XXX
    return ret_tagsets, topics

1295 

1296 

def parse_head_final_tags(
    wxr: WiktextractContext, lang: str, form: str
) -> tuple[str, list[str]]:
    """Parses tags that are allowed at the end of a form head from the end
    of the form. This can also be used for parsing the final gender etc tags
    from translations and linkages.

    Returns the form with any recognized head-final tag text stripped,
    together with the list of tags decoded from that text.

    Fix: removed a leftover debug print (``print(f"{m=}, ...")``) that
    fired on every head-final regex match and polluted stdout.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(lang, str)  # Should be language that "form" is for
    assert isinstance(form, str)

    # print("parse_head_final_tags: lang={} form={!r}".format(lang, form))

    # Make sure there are no double spaces in the form as this code does not
    # handle them otherwise.
    form = re.sub(r"\s+", " ", form.strip())
    if not form:
        return form, []

    origform = form

    tags = []

    # If parsing for certain Bantu languages (e.g., Swahili), handle
    # some extra head-final tags first
    if lang in head_final_bantu_langs:
        m = re.search(head_final_bantu_re, form)
        if m is not None:
            tagkeys = m.group(1)
            # Don't strip the suffix if the page title itself ends with it
            # (then it is presumably part of the word).
            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
                form = form[: m.start()]
                v = head_final_bantu_map[tagkeys]
                if v.startswith("?"):
                    # "?"-prefixed map values mark suspicious suffixes:
                    # strip the marker and log for review.
                    v = v[1:]
                    wxr.wtp.debug(
                        "suspicious suffix {!r} in language {}: {}".format(
                            tagkeys, lang, origform
                        ),
                        sortid="form_descriptions/1028",
                    )
                tags.extend(v.split())

    # If parsing for certain Semitic languages (e.g., Arabic), handle
    # some extra head-final tags first
    if lang in head_final_semitic_langs:
        m = re.search(head_final_semitic_re, form)
        if m is not None:
            tagkeys = m.group(1)
            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
                form = form[: m.start()]
                v = head_final_semitic_map[tagkeys]
                if v.startswith("?"):
                    v = v[1:]
                    wxr.wtp.debug(
                        "suspicious suffix {!r} in language {}: {}".format(
                            tagkeys, lang, origform
                        ),
                        sortid="form_descriptions/1043",
                    )
                tags.extend(v.split())

    # If parsing for certain other languages (e.g., Lithuanian,
    # French, Finnish), handle some extra head-final tags first
    if lang in head_final_other_langs:
        m = re.search(head_final_other_re, form)
        if m is not None:
            tagkeys = m.group(1)
            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
                form = form[: m.start()]
                tags.extend(head_final_other_map[tagkeys].split(" "))

    # Handle normal head-final tags
    # Loop this until nothing is found
    while True:
        prev_form = form
        m = re.search(head_final_re, form)
        if m is not None:
            tagkeys = m.group(3)
            # Only replace tags ending with numbers in languages that have
            # head-final numeric tags (e.g., Bantu classes); also, don't replace
            # tags if the main title ends with them (then presume they are part
            # of the word)
            # print("head_final_tags form={!r} tagkeys={!r} lang={}"
            #       .format(form, tagkeys, lang))
            tagkeys_contains_digit = re.search(r"\d", tagkeys)
            if (
                (not tagkeys_contains_digit or lang in head_final_numeric_langs)
                and not wxr.wtp.title.endswith(" " + tagkeys)  # type:ignore[union-attr]
                and
                # XXX the above test does not capture when the whole word is a
                # xlat_head_map key, so I added the below test to complement
                # it; does this break anything?
                not wxr.wtp.title == tagkeys
            ):  # defunct/English,
                # "more defunct" -> "more" ["archaic"]
                if (
                    not tagkeys_contains_digit
                    or lang in head_final_numeric_langs
                ):
                    # m.start(3) gets the start of what is in m.group(3), handy
                    form = form[: m.start(3)].strip()
                    v = xlat_head_map[tagkeys]
                    if v.startswith("?"):
                        v = v[1:]
                        wxr.wtp.debug(
                            "suspicious suffix {!r} in language {}: {}".format(
                                tagkeys, lang, origform
                            ),
                            sortid="form_descriptions/1077",
                        )
                    tags.extend(v.split())
                else:
                    break
        if prev_form == form:
            break

    # Generate warnings about words ending in " or" after processing
    if (
        (form.endswith(" or") and not origform.endswith(" or"))
        or re.search(
            r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|"
            r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)"
            r"($|/| (f|m|sg|pl|anim|inan))",
            form,
        )
        or form.endswith(" du")
    ):
        if form not in ok_suspicious_forms:
            wxr.wtp.debug(
                "suspicious unhandled suffix in {}: {!r}, originally {!r}".format(
                    lang, form, origform
                ),
                sortid="form_descriptions/1089",
            )

    # print("parse_head_final_tags: form={!r} tags={}".format(form, tags))
    return form, tags

1434 

1435 

def quote_kept_parens(s: str) -> str:
    """Protect certain parenthesized expressions from being stripped as
    parentheses.  Some parenthesized parts are genuinely part of the word,
    e.g. "rear admiral (upper half)"; those are rewritten using
    __lpar__/__rpar__ markers (unquote_kept_parens is the inverse)."""
    kept_pattern = (
        r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|"
        r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)"
    )
    return re.sub(kept_pattern, r"__lpar__\1__rpar__", s)

1446 

1447 

def quote_kept_ruby(
    wxr: WiktextractContext,
    ruby_tuples: list[
        tuple[
            str,
            str,
        ]
    ],
    s: str,
) -> str:
    """Protect known "base(ruby)" pairs in s from parenthesis stripping
    by rewriting them as base__lrub__ruby__rrub__.  ruby_tuples holds the
    (base, ruby) pairs to look for; empty input is logged and s returned
    unchanged."""
    if len(ruby_tuples) < 1:
        wxr.wtp.debug(
            "quote_kept_ruby called with no ruby",
            sortid="form_description/1114/20230517",
        )
        return s
    escaped_bases = [re.escape(base) for base, _ in ruby_tuples]
    escaped_rubies = [re.escape(rb) for _, rb in ruby_tuples]
    if not (escaped_bases and escaped_rubies):
        wxr.wtp.debug(
            f"empty column in ruby_tuples: {ruby_tuples}",
            sortid="form_description/1124/20230606",
        )
        return s
    # Matches a known base immediately followed by its parenthesized ruby.
    pair_re = re.compile(
        r"({})\s*\(\s*({})\s*\)".format(
            "|".join(escaped_bases), "|".join(escaped_rubies)
        )
    )
    # Coarse matcher for candidate regions: each base with optional
    # surrounding parentheses and its ruby; pair_re then does the precise
    # quoting inside each hit.
    candidate_re = re.compile(
        r"({})".format(
            r"|".join(
                r"{}\(*{}\)*".format(
                    re.escape(base),
                    re.escape(rb),
                )
                for base, rb in ruby_tuples
            )
        )
    )

    def quote_pair(m: re.Match) -> str:
        return pair_re.sub(r"\1__lrub__\2__rrub__", m.group(0))

    return candidate_re.sub(quote_pair, s)

1494 

1495 

def unquote_kept_parens(s: str) -> str:
    """Converts the quoted __lpar__/__rpar__ markers back to normal
    parentheses (inverse of quote_kept_parens)."""

    def restore(m: re.Match) -> str:
        return "(" + m.group(1) + ")"

    return re.sub(r"__lpar__(.*?)__rpar__", restore, s)

1499 

1500 

def add_romanization(
    wxr: WiktextractContext,
    data: WordData,
    roman: str,
    text: str,
    is_reconstruction: bool,
    head_group: Optional[int],
    ruby: Sequence[tuple[str, str]],
) -> None:
    """Record a romanized form in data via add_related.

    Also intercepts broken "Style: text" romanizations (e.g.
    "Yale: hēnpyeng"): most romanization style names already exist as
    tags, so the prefix is run through decode_tags and any recognized
    tags are attached while the prefix is stripped from the value."""
    collected_tags = ["romanization"]
    prefix_match = re.match(r"([^:]+):(.+)", roman)
    if prefix_match is not None:
        tagsets, _topics = decode_tags(prefix_match.group(1))
        if tagsets:
            for tagset in tagsets:
                collected_tags.extend(tagset)
            roman = prefix_match.group(2)
    add_related(
        wxr,
        data,
        collected_tags,
        [roman],
        text,
        True,
        is_reconstruction,
        head_group,
        ruby,
    )

1533 

1534 

def add_related(
    wxr: WiktextractContext,
    data: WordData,
    tags_lst: Union[list[str], tuple[str, ...]],
    related_list: list[str],
    origtext: str,
    add_all_canonicals: bool,
    is_reconstruction: bool,
    head_group: Optional[int],
    ruby_data: Optional[Sequence[tuple[str, str]]] = None,
) -> Optional[list[tuple[str, ...]]]:
    """Internal helper function for some post-processing entries for related
    forms (e.g., in word head). This returns a list of list of tags to be
    added to following related forms or None (cf. walrus/English word head,
    parenthesized part starting with "both").

    Fix: the inner tags2 loop previously re-asserted isinstance(tags1, ...)
    (copy-paste); it now checks tags2 as clearly intended.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(tags_lst, (list, tuple))
    for x in tags_lst:
        assert isinstance(x, str)
    assert isinstance(related_list, (list, tuple))
    assert isinstance(origtext, str)
    assert add_all_canonicals in (True, False)
    assert isinstance(ruby_data, (list, tuple)) or ruby_data is None
    if ruby_data is None:
        ruby_data = []
    related = " ".join(related_list)
    # print("add_related: tags_lst={} related={}".format(tags_lst, related))
    if related == "[please provide]":
        return None
    if related in IGNORED_RELATED:
        return None
    if is_reconstruction and related.startswith("*") and len(related) > 1:
        related = related[1:]

    # Get title word, with any reconstruction prefix removed
    titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title)  # type:ignore[arg-type]

    def check_related(related: str) -> None:
        # Warn about some suspicious related forms, skipping a number of
        # known-benign cases (input methods, class tags, Korean/Burmese
        # transliteration conventions).
        m = re.search(suspicious_related_re, related)
        if (m and m.group(0) not in titleword) or (
            related in ("f", "m", "n", "c") and len(titleword) >= 3
        ):
            if "eumhun" in tags_lst:
                return
            if "cangjie-input" in tags_lst:
                return
            if "class" in tags_lst:
                return
            if wxr.wtp.section == "Korean" and re.search(
                r"^\s*\w*>\w*\s*$", related
            ):
                # ignore Korean "i>ni" / "라>나" values
                return
            if (
                wxr.wtp.section == "Burmese"
                and "romanization" in tags_lst
                and re.search(r":", related)
            ):
                # ignore Burmese with ":", that is used in Burmese
                # translitteration of "း", the high-tone visarga.
                return
            wxr.wtp.debug(
                "suspicious related form tags {}: {!r} in {!r}".format(
                    tags_lst, related, origtext
                ),
                sortid="form_descriptions/1147",
            )

    following_tagsets = None  # Tagsets to add to following related forms
    roman = None  # Romanization extracted from a trailing parenthesis
    tagsets1: list[tuple[str, ...]] = [tuple()]
    topics1: list[str] = []

    # A leading parenthesized part holds tags for this form; "all"/"both"
    # prefixes mean the tags also apply to following forms.
    m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related)
    if m:
        paren = m.group(1)
        related = related[m.end() :]
        m = re.match(r"^(all|both) (.*)", paren)
        if m:
            tagsets1, topics1 = decode_tags(m.group(2))
            following_tagsets = tagsets1
        else:
            tagsets1, topics1 = decode_tags(paren)
    else:
        # A trailing parenthesized part is either a codepoint reference
        # ("U+..."), a romanization of a non-Latin form, or more tags.
        m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related)
        if m:
            paren = m.group(1)
            if paren.startswith("U+"):
                related = related[: m.start()]
            else:
                cls = classify_desc(paren)
                if (
                    cls in ("romanization", "english")
                    and classify_desc(related[: m.start()]) == "other"
                ):
                    roman = paren
                    related = related[: m.start()]
                else:
                    related = related[: m.start()]
                    tagsets1, topics1 = decode_tags(paren)
    if related and related.startswith("{{"):
        wxr.wtp.debug(
            "{{ in word head form - possible Wiktionary error: {!r}".format(
                related
            ),
            sortid="form_descriptions/1177",
        )
        return None  # Likely Wiktionary coding error
    related = unquote_kept_parens(related)
    # Split related by "/" (e.g., grande/Spanish) superlative in head
    # Do not split if / in word title, see π//Japanese
    if len(related) > 5 and "/" not in wxr.wtp.title:  # type:ignore[operator]
        alts = split_at_comma_semi(related, separators=["/"])
    else:
        alts = [related]
    if ruby_data:
        # prepare some regex stuff in advance
        ks, rs = [], []
        for k, r in ruby_data:
            ks.append(re.escape(k))
            rs.append(re.escape(r))
        splitter = r"((?:{})__lrub__(?:{})__rrub__)".format(
            "|".join(ks), "|".join(rs)
        )
    for related in alts:
        ruby: list[tuple[str, str]] = []
        if ruby_data:
            # Pull __lrub__/__rrub__-quoted ruby pairs out of the form,
            # keeping only the base text in `related`.
            new_related = []
            rub_split = re.split(splitter, related)
            for s in rub_split:
                m = re.match(r"(.+)__lrub__(.+)__rrub__", s)
                if m:
                    # add ruby with (\1, \2)
                    ruby.append((m.group(1), m.group(2)))
                    new_related.append(m.group(1))
                else:
                    new_related.append(s)
            related = "".join(new_related)
        tagsets2, topics2 = decode_tags(" ".join(tags_lst))
        for tags1 in tagsets1:
            assert isinstance(tags1, (list, tuple))
            for tags2 in tagsets2:
                assert isinstance(tags2, (list, tuple))
                dt: LinkageData = {"word": related}
                if roman:
                    dt["roman"] = roman
                if ruby:
                    dt["ruby"] = ruby
                if "alt-of" in tags2:
                    check_related(related)
                    data_extend(data, "tags", tags1)
                    data_extend(data, "tags", tags2)
                    data_extend(data, "topics", topics1)
                    data_extend(data, "topics", topics2)
                    data_append(data, "alt_of", dt)
                elif "form-of" in tags2:
                    check_related(related)
                    data_extend(data, "tags", tags1)
                    data_extend(data, "tags", tags2)
                    data_extend(data, "topics", topics1)
                    data_extend(data, "topics", topics2)
                    data_append(data, "form_of", dt)
                elif "compound-of" in tags2:
                    check_related(related)
                    data_extend(data, "tags", tags1)
                    data_extend(data, "tags", tags2)
                    data_extend(data, "topics", topics1)
                    data_extend(data, "topics", topics2)
                    data_append(data, "compound", related)
                else:
                    lang = wxr.wtp.section or "LANG_MISSING"
                    related, final_tags = parse_head_final_tags(
                        wxr, lang, related
                    )
                    # print("add_related: related={!r} tags1={!r} tags2={!r} "
                    #       "final_tags={!r}"
                    #       .format(related, tags1, tags2, final_tags))
                    tags = list(tags1) + list(tags2) + list(final_tags)
                    check_related(related)
                    form: FormData = {"form": related}
                    if head_group:
                        form["head_nr"] = head_group
                    if roman:
                        form["roman"] = roman
                    if ruby:
                        form["ruby"] = ruby
                    data_extend(form, "topics", topics1)
                    data_extend(form, "topics", topics2)
                    if topics1 or topics2:
                        wxr.wtp.debug(
                            "word head form has topics: {}".format(form),
                            sortid="form_descriptions/1233",
                        )
                    # Add tags from canonical form into the main entry
                    if "canonical" in tags:
                        if related in ("m", "f") and len(titleword) > 1:
                            wxr.wtp.debug(
                                "probably incorrect canonical form "
                                "{!r} ignored (probably tag combination "
                                "missing from xlat_head_map)".format(related),
                                sortid="form_descriptions/1241",
                            )
                            continue
                        if (
                            related != titleword
                            or add_all_canonicals
                            or topics1
                            or topics2
                            or ruby
                        ):
                            data_extend(form, "tags", sorted(set(tags)))
                        else:
                            # We won't add canonical form here
                            filtered_tags = list(
                                x for x in tags if x != "canonical"
                            )
                            data_extend(data, "tags", filtered_tags)
                            continue
                    else:
                        data_extend(form, "tags", sorted(set(tags)))
                    # Only insert if the form is not already there
                    for old in data.get("forms", ()):
                        if form == old:
                            break
                    else:
                        data_append(data, "forms", form)

    # If this form had pre-tags that started with "both" or "all", add those
    # tags also to following related forms that don't have their own tags
    # specified.
    return following_tagsets

1767 

1768 

# Issue #967: in English word forms, forms are sometimes skipped because
# they are taggable words and their distw() is too big, like "clipping"
# from "clip".  The tables below special-case such words.

# Title word -> form words that look like tags but must still be accepted
# as forms for that title.
WORDS_WITH_FALSE_POSITIVE_TAGS: dict[str, list[str]] = {
    # XXX remember to change me back to clipping after tests.
    "clip": ["clipping"],
    "English": ["English", "Englishes"],
    "common": ["common", "commoner"],
}

# Title word -> words that look like forms but are actually tags and must
# NOT be taken as forms for that title.
WORDS_WITH_FALSE_POSITIVE_FORMS: dict[str, list[str]] = {
    "unaccountability": ["countable", "uncountable"],
    "uncountability": ["countable", "uncountable"],
}

# Placeholder for title words whose expected forms are known to be missing;
# currently empty.
FALSE_POSITIVE_MISSING_FORMS: dict[str, list[str]] = {}

# Tag words that typically accompany an inflected form in a word head
# (e.g. "past participle", "third-person singular").
FORM_ASSOCIATED_TAG_WORDS: set[str] = {
    "participle",
    "past",
    "present",
    "singular",
    "plural",
    "first-person",
    "second-person",
    "third-person",
    "gerund",
}

1796 

1797 

1798def parse_word_head( 

1799 wxr: WiktextractContext, 

1800 pos: str, 

1801 text: str, 

1802 data: WordData, 

1803 is_reconstruction: bool, 

1804 head_group: Optional[int], 

1805 ruby=None, 

1806 links=None, 

1807) -> None: 

1808 """Parses the head line for a word for in a particular language and 

1809 part-of-speech, extracting tags and related forms.""" 

1810 assert isinstance(wxr, WiktextractContext) 

1811 assert isinstance(pos, str) 

1812 assert isinstance(text, str) 

1813 assert isinstance(data, dict) 

1814 assert isinstance(ruby, (list, tuple)) or ruby is None 

1815 if ruby is None: 

1816 ruby = [] 

1817 assert is_reconstruction in (True, False) 

1818 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text)) 

1819 # print(f"PARSE_WORD_HEAD: {data=}") 

1820 if links is None: 

1821 links = [] 

1822 

1823 if len(links) > 0: 

1824 # if we have link data (that is, links with stuff like commas and 

1825 # spaces, replace word_re with a modified local scope pattern 

1826 # print(f"links {list((c, ord(c)) for link in links for c in link)=}") 

1827 word_re = re.compile( 

1828 r"\b" # In case we have forms that are longer and contain links 

1829 + 

1830 # or words as a substring... 

1831 r"\b|\b".join( 

1832 sorted((re.escape(s) for s in links), key=lambda x: -len(x)) 

1833 ) 

1834 + r"\b|" 

1835 + word_pattern 

1836 ) 

1837 else: 

1838 word_re = word_re_global 

1839 

1840 if "Lua execution error" in text or "Lua timeout error" in text: 1840 ↛ 1841line 1840 didn't jump to line 1841 because the condition on line 1840 was never true

1841 return 

1842 

1843 # Fix words with "superlative:" or "comparative:" at end of head 

1844 # e.g. grande/Spanish/Adj 

1845 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text) 

1846 

1847 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb 

1848 m = re.search(r", non-past ([^)]+ \([^)]+\))", text) 

1849 if m: 

1850 add_related( 

1851 wxr, 

1852 data, 

1853 ["non-past"], 

1854 [m.group(1)], 

1855 text, 

1856 True, 

1857 is_reconstruction, 

1858 head_group, 

1859 ruby, 

1860 ) 

1861 text = text[: m.start()] + text[m.end() :] 

1862 

1863 language = wxr.wtp.section 

1864 titleword = re.sub( 

1865 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE" 

1866 ) 

1867 titleparts = list( 

1868 m.group(0) 

1869 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE") 

1870 ) 

1871 if not titleparts: 1871 ↛ 1872line 1871 didn't jump to line 1872 because the condition on line 1871 was never true

1872 return 

1873 

1874 # Remove " or" from the end to prevent weird canonical forms 

1875 if text.endswith(" or"): 

1876 for tp in titleparts: 

1877 if text.endswith(tp): 1877 ↛ 1878line 1877 didn't jump to line 1878 because the condition on line 1877 was never true

1878 break 

1879 else: 

1880 text = text.removesuffix(" or").rstrip() 

1881 

1882 # Handle the part of the head that is not in parentheses. However, certain 

1883 # parenthesized parts are part of word, and those must be handled 

1884 # specially here. 

1885 if ruby: 

1886 text = quote_kept_ruby(wxr, ruby, text) 

1887 base = text 

1888 base = quote_kept_parens(base) 

1889 base = remove_text_in_parentheses(base) 

1890 base = base.replace("?", "") # Removes uncertain articles etc 

1891 base = re.sub(r"\s+", " ", base) 

1892 base = re.sub(r" ([,;])", r"\1", base) 

1893 base = re.sub(r" • ", r" ", base) 

1894 # Many languages use • as a punctuation mark separating the base 

1895 # from the rest of the head. στάδιος/Ancient Greek, issue #176 

1896 base = base.strip() 

1897 # print(f"{base=}") 

1898 

1899 # Check for certain endings in head (mostly for compatibility with weird 

1900 # heads, e.g. rata/Romanian "1st conj." at end) 

1901 m = re.search(head_end_re, base) 

1902 tags: Union[tuple[str, ...], list[str]] = [] 

1903 if m: 1903 ↛ 1904line 1903 didn't jump to line 1904 because the condition on line 1903 was never true

1904 tags = head_end_map[m.group(1).lower()].split() 

1905 data_extend(data, "tags", tags) 

1906 base = base[: m.start()] 

1907 

1908 # Special case: handle Hán Nôm readings for Vietnamese characters 

1909 m = re.match( 

1910 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base 

1911 ) 

1912 if m: 1912 ↛ 1913line 1912 didn't jump to line 1913 because the condition on line 1912 was never true

1913 tag, readings = m.groups() 

1914 tag = re.sub(r"\s+", "-", tag) 

1915 for reading in split_at_comma_semi(readings, skipped=links): 

1916 add_related( 

1917 wxr, 

1918 data, 

1919 [tag], 

1920 [reading], 

1921 text, 

1922 True, 

1923 is_reconstruction, 

1924 head_group, 

1925 ruby, 

1926 ) 

1927 return 

1928 

1929 # Special case: Hebrew " [pattern: nnn]" ending 

1930 m = re.search(r"\s+\[pattern: ([^]]+)\]", base) 

1931 if m: 1931 ↛ 1932line 1931 didn't jump to line 1932 because the condition on line 1931 was never true

1932 add_related( 

1933 wxr, 

1934 data, 

1935 ["class"], 

1936 [m.group(1)], 

1937 text, 

1938 True, 

1939 is_reconstruction, 

1940 head_group, 

1941 ruby, 

1942 ) 

1943 base = base[: m.start()] + base[m.end() :] 

1944 

1945 # Clean away some messy "Upload an image" template text used in 

1946 # American Sign Language: 

1947 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp 

1948 m = re.search(r"Upload .+ gif image.", base) 

1949 if m: 1949 ↛ 1950line 1949 didn't jump to line 1950 because the condition on line 1949 was never true

1950 base = base[: m.start()] + base[m.end() :] 

1951 

1952 semicolon_present = False 

1953 # Split the head into alternatives. This is a complicated task, as 

1954 # we do not want to split on "or" or "," when immediately followed by more 

1955 # head-final tags, but otherwise do want to split by them. 

1956 # 20230907 added "or" to this to handle 'true or false', titles with 'or' 

1957 if wxr.wtp.title and ( 

1958 "," in wxr.wtp.title or ";" in wxr.wtp.title or " or " in wxr.wtp.title 

1959 ): 

1960 # If the title has ";", we don't want to split on that and can remove 

1961 # the ; from the splitting regex pretty easily because it's uncommon. 

1962 # However, commas are so common that not splitting on them is just 

1963 # not feasible, and we have to just deal with that if there are 

1964 # alternative forms or variations with stray commas that shouldn't 

1965 # be split. 

1966 if ";" in wxr.wtp.title: 

1967 semicolon_present = True 

1968 base = base.replace(";", "<SEMICOLON>") 

1969 default_splitter = head_split_no_semicolon_re 

1970 else: 

1971 default_splitter = head_split_re 

1972 # A kludge to handle article titles/phrases with commas. 

1973 # Preprocess splits to first capture the title, then handle 

1974 # all the others as usual. 

1975 presplits = re.split(r"({})".format(wxr.wtp.title), base) 

1976 splits = [] 

1977 for psplit in presplits: 

1978 if psplit == wxr.wtp.title: 

1979 splits.append(psplit) 

1980 else: 

1981 splits.extend(re.split(default_splitter, psplit)) 

1982 else: 

1983 # Do the normal split; previous only-behavior. 

1984 splits = re.split(head_split_re, base) 

1985 # print("BASE: ", repr(base)) 

1986 # print("SPLITS:", splits) 

1987 alts: list[str] = [] 

1988 # print("parse_word_head: splits:", splits, 

1989 # "head_split_re_parens:", head_split_re_parens) 

1990 for i in range( 

1991 0, len(splits) - head_split_re_parens, head_split_re_parens + 1 

1992 ): 

1993 v = splits[i] 

1994 ending = splits[i + 1] or "" # XXX is this correct??? 

1995 # print("parse_word_head alts v={!r} ending={!r} alts={}" 

1996 # .format(v, ending, alts)) 

1997 if alts and (v == "" and ending): 

1998 assert ending[0] == " " 

1999 alts[-1] += " or" + ending # endings starts with space 

2000 elif v or ending: 

2001 alts.append((v or "") + (ending or "")) 

2002 last = splits[-1].strip() 

2003 conn = "" if len(splits) < 3 else splits[-2] 

2004 # print("parse_word_head alts last={!r} conn={!r} alts={}" 

2005 # .format(last, conn, alts)) 

2006 if ( 2006 ↛ 2017line 2006 didn't jump to line 2017 because the condition on line 2006 was never true

2007 alts 

2008 and last 

2009 and ( 

2010 last.split()[0] in xlat_head_map 

2011 or ( 

2012 conn == " or " 

2013 and (alts[-1] + " or " + last).strip() in xlat_head_map 

2014 ) 

2015 ) 

2016 ): 

2017 alts[-1] += " or " + last 

2018 elif last: 2018 ↛ 2019line 2018 didn't jump to line 2019 because the condition on line 2018 was never true

2019 alts.append(last) 

2020 

2021 # print("parse_word_head alts: {}".format(alts)) 

2022 # print(f"{base=}") 

2023 

2024 # Process the head alternatives 

2025 canonicals: list[tuple[list[str], list[str]]] = [] 

2026 mode: Optional[str] = None 

2027 for alt_i, alt in enumerate(alts): 

2028 alt = alt.strip() 

2029 if alt.startswith("compound form:"): 2029 ↛ 2030line 2029 didn't jump to line 2030 because the condition on line 2029 was never true

2030 mode = "compound-form" 

2031 alt = alt[14:].strip() 

2032 if ((dash_i := alt.find(" -")) > 0) and ( 

2033 dash_i > (wxr.wtp.title or "").find(" -") 

2034 ): 

2035 # test_en_head / test_suffixes_at_end_of_form1 

2036 # Some heads have suffixes that end up attached to the form 

2037 # like in https://en.wiktionary.org/wiki/%E6%A5%BD%E3%81%97%E3%81%84 

2038 alt = alt[:dash_i] 

2039 if mode == "compound-form": 2039 ↛ 2040line 2039 didn't jump to line 2040 because the condition on line 2039 was never true

2040 add_related( 

2041 wxr, 

2042 data, 

2043 ["in-compounds"], 

2044 [alt], 

2045 text, 

2046 True, 

2047 is_reconstruction, 

2048 head_group, 

2049 ruby, 

2050 ) 

2051 continue 

2052 # For non-first parts, see if it can be treated as tags-only 

2053 if alt_i == 0: 

2054 expanded_alts = [alt] 

2055 else: 

2056 expanded_alts = map_with(xlat_descs_map, [alt]) 

2057 # print("EXPANDED_ALTS:", expanded_alts) 

2058 tagsets: Optional[list[tuple[str, ...]]] 

2059 for alt in expanded_alts: 

2060 baseparts = list(m.group(0) for m in word_re.finditer(alt)) 

2061 if alt_i > 0: 

2062 tagsets, topics = decode_tags(" ".join(baseparts)) 

2063 if not any("error-unknown-tag" in x for x in tagsets): 

2064 data_extend(data, "topics", topics) 

2065 for tags1 in tagsets: 

2066 data_extend(data, "tags", tags1) 

2067 continue 

2068 

2069 alt, tags = parse_head_final_tags( 

2070 wxr, language or "MISSING_LANG", alt 

2071 ) 

2072 tags = list(tags) # Make sure we don't modify anything cached 

2073 tags.append("canonical") 

2074 if alt_i == 0 and "," in wxr.wtp.title or ";" in wxr.wtp.title: # type:ignore[operator] 

2075 # Kludge to handle article titles/phrases with commas. 

2076 # basepart's regex strips commas, which leads to a 

2077 # canonical form that is the title phrase without a comma. 

2078 # basepart in add_related is almost immediately joined with 

2079 # spaces anyhow. XXX not exactly sure why it's 

2080 # canonicals.append((tags, baseparts)) and not (tags, [alt]) 

2081 baseparts = [alt] 

2082 canonicals.append((tags, baseparts)) 

2083 

2084 # If more of this kind of replace-and-return-original kind of stuff is 

2085 # needed, make semicolon_present into a flag enum, something like `modified` 

2086 if semicolon_present: 

2087 new_cans = [] 

2088 for tags, baseparts in canonicals: 

2089 new_cans.append( 

2090 (tags, [s.replace("<SEMICOLON>", ";") for s in baseparts]) 

2091 ) 

2092 canonicals = new_cans 

2093 for tags, baseparts in canonicals: 

2094 add_related( 

2095 wxr, 

2096 data, 

2097 tags, 

2098 baseparts, 

2099 text, 

2100 len(canonicals) > 1, 

2101 is_reconstruction, 

2102 head_group, 

2103 ruby, 

2104 ) 

2105 

2106 # Handle parenthesized descriptors for the word form and links to 

2107 # related words 

2108 text = quote_kept_parens(text) 

2109 parens = list( 

2110 m.group(2) 

2111 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text) 

2112 ) 

2113 parens.extend( 

2114 m.group(1) 

2115 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text) 

2116 ) 

2117 have_romanization = False 

2118 have_ruby = False 

2119 hiragana = "" 

2120 katakana = "" 

2121 for paren in parens: 

2122 paren = paren.strip() 

2123 if not paren: 2123 ↛ 2124line 2123 didn't jump to line 2124 because the condition on line 2123 was never true

2124 continue 

2125 if paren.startswith("see "): 

2126 continue 

2127 if paren.startswith("U+"): 2127 ↛ 2128line 2127 didn't jump to line 2128 because the condition on line 2127 was never true

2128 continue 

2129 # In some rare cases, strip word that inflects from the form 

2130 # description, e.g. "look through rose-tinted glasses"/English. 

2131 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren) 

2132 

2133 # If it starts with hiragana or katakana, treat as such form. Note 

2134 # that each hiragana/katakana character is in separate parentheses, 

2135 # so we must concatenate them. 

2136 try: 

2137 un = unicodedata.name(paren[0]).split()[0] 

2138 except ValueError: 

2139 un = "INVALID" 

2140 if un == "KATAKANA": 2140 ↛ 2141line 2140 didn't jump to line 2141 because the condition on line 2140 was never true

2141 katakana += paren 

2142 have_ruby = True 

2143 continue 

2144 if un == "HIRAGANA": 2144 ↛ 2145line 2144 didn't jump to line 2145 because the condition on line 2144 was never true

2145 hiragana += paren 

2146 have_ruby = True 

2147 continue 

2148 

2149 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes," 

2150 # in the middle of the parenthesized expression, e.g. 薄 

2151 def strokes_repl(m: re.Match) -> str: 

2152 strokes1, tags1, strokes2, tags2 = m.groups() 

2153 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]: 

2154 tags = tags.split(", ") 

2155 tags = list( 

2156 "Mainland China" if t == "Mainland" else t for t in tags 

2157 ) 

2158 tags.append("strokes") 

2159 add_related( 

2160 wxr, 

2161 data, 

2162 tags, 

2163 [strokes], 

2164 text, 

2165 True, 

2166 is_reconstruction, 

2167 head_group, 

2168 ruby, 

2169 ) 

2170 return ", " 

2171 

2172 paren = re.sub( 

2173 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ", 

2174 strokes_repl, 

2175 paren, 

2176 ) 

2177 

2178 descriptors = map_with(xlat_descs_map, [paren]) 

2179 new_desc = [] 

2180 for desc in descriptors: 

2181 new_desc.extend( 

2182 map_with( 

2183 xlat_tags_map, 

2184 split_at_comma_semi(desc, extra=[", or "], skipped=links), 

2185 ) 

2186 ) 

2187 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None 

2188 following_tags = None # Added to prev_tags from previous parenthesized 

2189 # part, e.g. walrus/English 

2190 # "(both nonstandard, proscribed, uncommon)" 

2191 for desc_i, desc in enumerate(new_desc): 

2192 # print("HEAD DESC: {!r}".format(desc)) 

2193 

2194 # Abort on certain descriptors (assume remaining values are 

2195 # examples or uninteresting, cf. gaan/Navajo, horior/Latin) 

2196 if re.match(r"^(per |e\.g\.$)", desc): 2196 ↛ 2197line 2196 didn't jump to line 2197 because the condition on line 2196 was never true

2197 break 

2198 

2199 # If it all consists of CJK characters, add it with the 

2200 # CJK tag. This is used at least for some Vietnamese 

2201 # words (e.g., ba/Vietnamese) 

2202 try: 

2203 if all(unicodedata.name(x).startswith("CJK ") for x in desc): 2203 ↛ 2204line 2203 didn't jump to line 2204 because the condition on line 2203 was never true

2204 add_related( 

2205 wxr, 

2206 data, 

2207 ["CJK"], 

2208 [desc], 

2209 text, 

2210 True, 

2211 is_reconstruction, 

2212 head_group, 

2213 ruby, 

2214 ) 

2215 continue 

2216 except ValueError: 

2217 pass 

2218 

2219 # Handle some special cases 

2220 splitdesc = desc.split() 

2221 if ( 2221 ↛ 2230line 2221 didn't jump to line 2230 because the condition on line 2221 was never true

2222 len(splitdesc) >= 3 

2223 and splitdesc[1] == "superlative" 

2224 and classify_desc(splitdesc[0]) != "tags" 

2225 and prev_tags 

2226 ): 

2227 # Handle the special case of second comparative after comma, 

2228 # followed by superlative without comma. E.g. 

2229 # mal/Portuguese/Adv 

2230 for ts in prev_tags: 

2231 add_related( 

2232 wxr, 

2233 data, 

2234 ts, 

2235 [splitdesc[0]], 

2236 text, 

2237 True, 

2238 is_reconstruction, 

2239 head_group, 

2240 ruby, 

2241 ) 

2242 desc = " ".join(splitdesc[1:]) 

2243 elif ( 2243 ↛ 2251line 2243 didn't jump to line 2251 because the condition on line 2243 was never true

2244 len(splitdesc) == 2 

2245 and splitdesc[0] in ("also", "and") 

2246 and prev_tags 

2247 and classify_desc(splitdesc[1]) != "tags" 

2248 ): 

2249 # Sometimes alternative forms are prefixed with "also" or 

2250 # "and" 

2251 for ts in prev_tags: 

2252 add_related( 

2253 wxr, 

2254 data, 

2255 ts, 

2256 [splitdesc[1]], 

2257 text, 

2258 True, 

2259 is_reconstruction, 

2260 head_group, 

2261 ruby, 

2262 ) 

2263 continue 

2264 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2264 ↛ 2265line 2264 didn't jump to line 2265 because the condition on line 2264 was never true

2265 continue 

2266 

2267 # If only one word, assume it is comma-separated alternative 

2268 # to the previous one 

2269 if " " not in desc: 

2270 cls = classify_desc(desc) 

2271 if cls != "tags": 

2272 if prev_tags: 2272 ↛ 2274line 2272 didn't jump to line 2274 because the condition on line 2272 was never true

2273 # Assume comma-separated alternative to previous one 

2274 for ts in prev_tags: 

2275 add_related( 

2276 wxr, 

2277 data, 

2278 ts, 

2279 [desc], 

2280 text, 

2281 True, 

2282 is_reconstruction, 

2283 head_group, 

2284 ruby, 

2285 ) 

2286 continue 

2287 elif distw(titleparts, desc) <= 0.5: 2287 ↛ 2290line 2287 didn't jump to line 2290 because the condition on line 2287 was never true

2288 # Similar to head word, assume a dialectal variation to 

2289 # the base form. Cf. go/Alemannic German/Verb 

2290 add_related( 

2291 wxr, 

2292 data, 

2293 ["alternative"], 

2294 [desc], 

2295 text, 

2296 True, 

2297 is_reconstruction, 

2298 head_group, 

2299 ruby, 

2300 ) 

2301 continue 

2302 elif ( 

2303 cls in ("romanization", "english") 

2304 and not have_romanization 

2305 and classify_desc(titleword) == "other" 

2306 and not ( 

2307 "categories" in data and desc in data["categories"] 

2308 ) 

2309 ): 

2310 # Assume it to be a romanization 

2311 add_romanization( 

2312 wxr, 

2313 data, 

2314 desc, 

2315 text, 

2316 is_reconstruction, 

2317 head_group, 

2318 ruby, 

2319 ) 

2320 have_romanization = True 

2321 continue 

2322 

2323 m = re.match(r"^(\d+) strokes?$", desc) 

2324 if m: 

2325 # Special case, used to give #strokes for Han characters 

2326 add_related( 

2327 wxr, 

2328 data, 

2329 ["strokes"], 

2330 [m.group(1)], 

2331 text, 

2332 True, 

2333 is_reconstruction, 

2334 head_group, 

2335 ruby, 

2336 ) 

2337 continue 

2338 

2339 # See if it is radical+strokes 

2340 m = re.match( 

2341 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF" 

2342 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)" 

2343 r"( in (Japanese|Chinese|traditional Chinese|" 

2344 r"simplified Chinese))?$", 

2345 desc, 

2346 ) 

2347 if m: 2347 ↛ 2350line 2347 didn't jump to line 2350 because the condition on line 2347 was never true

2348 # Special case, used to give radical + strokes for Han 

2349 # characters 

2350 radical_strokes = m.group(1) 

2351 lang = m.group(3) 

2352 t = ["radical+strokes"] 

2353 if lang: 

2354 t.extend(lang.split()) 

2355 add_related( 

2356 wxr, 

2357 data, 

2358 t, 

2359 [radical_strokes], 

2360 text, 

2361 True, 

2362 is_reconstruction, 

2363 head_group, 

2364 ruby, 

2365 ) 

2366 prev_tags = None 

2367 following_tags = None 

2368 continue 

2369 

2370 # See if it indicates historical Katakana orthography (←) or 

2371 # just otherwise katakana/hiragana form 

2372 m = re.match(r"←\s*|kana\s+", desc) 

2373 if m: 2373 ↛ 2374line 2373 didn't jump to line 2374 because the condition on line 2373 was never true

2374 if desc.startswith("←"): 

2375 t1 = "historical " 

2376 else: 

2377 t1 = "" 

2378 x = desc[m.end() :] 

2379 if x.endswith("?"): 

2380 x = x[:-1] 

2381 # XXX should we add a tag indicating uncertainty? 

2382 if x: 

2383 name = unicodedata.name(x[0]) 

2384 if name.startswith("HIRAGANA "): 

2385 desc = t1 + "hiragana " + x 

2386 elif name.startswith("KATAKANA "): 

2387 desc = t1 + "katakana " + x 

2388 

2389 # See if it is "n strokes in Chinese" or similar 

2390 m = re.match( 

2391 r"(\d+) strokes in (Chinese|Japanese|" 

2392 r"traditional Chinese|simplified Chinese)$", 

2393 desc, 

2394 ) 

2395 if m: 2395 ↛ 2397line 2395 didn't jump to line 2397 because the condition on line 2395 was never true

2396 # Special case, used to give just strokes for some Han chars 

2397 strokes = m.group(1) 

2398 lang = m.group(2) 

2399 t = ["strokes"] 

2400 t.extend(lang.split()) 

2401 add_related( 

2402 wxr, 

2403 data, 

2404 t, 

2405 [strokes], 

2406 text, 

2407 True, 

2408 is_reconstruction, 

2409 head_group, 

2410 ruby, 

2411 ) 

2412 prev_tags = None 

2413 following_tags = None 

2414 continue 

2415 

2416 # American Sign Language has images (or requests for image) 

2417 # as heads, + this ASL gloss after. 

2418 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text) 

2419 if m2: 2419 ↛ 2420line 2419 didn't jump to line 2420 because the condition on line 2419 was never true

2420 add_related( 

2421 wxr, 

2422 data, 

2423 ["ASL-gloss"], 

2424 [m2.group(1)], 

2425 text, 

2426 True, 

2427 is_reconstruction, 

2428 head_group, 

2429 ruby, 

2430 ) 

2431 continue 

2432 

2433 parts = list(m.group(0) for m in re.finditer(word_re, desc)) 

2434 if not parts: 2434 ↛ 2435line 2434 didn't jump to line 2435 because the condition on line 2434 was never true

2435 prev_tags = None 

2436 following_tags = None 

2437 continue 

2438 

2439 # Check for certain language-specific header part starts that 

2440 # modify 

2441 if len(parts) == 2 and language in lang_specific_head_map: 2441 ↛ 2442line 2441 didn't jump to line 2442 because the condition on line 2441 was never true

2442 ht = lang_specific_head_map[language] 

2443 if parts[0] in ht: 

2444 rem_tags, add_tags = ht[parts[0]] 

2445 new_prev_tags1: list[list[str]] = [] 

2446 tags2: Union[tuple[str, ...], list[str]] 

2447 for tags2 in prev_tags or [()]: 

2448 if rem_tags is True: # Remove all old tags 

2449 tsets = set() 

2450 else: 

2451 tsets = set(tags2) - set(rem_tags.split()) 

2452 tsets = tsets | set(add_tags.split()) 

2453 tags = list(sorted(tsets)) 

2454 add_related( 

2455 wxr, 

2456 data, 

2457 tags, 

2458 [parts[1]], 

2459 text, 

2460 True, 

2461 is_reconstruction, 

2462 head_group, 

2463 ruby, 

2464 ) 

2465 new_prev_tags1.append(tags) 

2466 prev_tags = new_prev_tags1 

2467 following_tags = None 

2468 continue 

2469 

2470 # Handle the special case of descriptors that are parenthesized, 

2471 # e.g., (archaic or Scotland) 

2472 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc) 

2473 if m is not None and classify_desc(m.group(1)) == "tags": 2473 ↛ 2474line 2473 didn't jump to line 2474 because the condition on line 2473 was never true

2474 tagpart = m.group(1) 

2475 related = [m.group(2)] 

2476 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True) 

2477 if topics: 

2478 wxr.wtp.debug( 

2479 "parenthized head part {!r} contains topics: {}".format( 

2480 tagpart, topics 

2481 ), 

2482 sortid="form_descriptions/1647", 

2483 ) 

2484 elif m is not None and re.match(r"in the sense ", m.group(1)): 2484 ↛ 2487line 2484 didn't jump to line 2487 because the condition on line 2484 was never true

2485 # Handle certain ignored cases 

2486 # e.g. bord/Danish: in the sense "plank" 

2487 related = [m.group(2)] 

2488 tagsets = [()] 

2489 else: 

2490 # Normal parsing of the descriptor 

2491 alt_related = None 

2492 alt_tagsets = None 

2493 tagsets = None 

2494 for i in range(len(parts), 0, -1): 

2495 related = parts[i:] 

2496 tagparts = parts[:i] 

2497 # print(" i={} related={} tagparts={}" 

2498 # .format(i, related, tagparts)) 

2499 tagsets, topics = decode_tags( 

2500 " ".join(tagparts), no_unknown_starts=True 

2501 ) 

2502 # print("tagparts={!r} tagsets={} topics={} related={} " 

2503 # "alt_related={} distw={:.2f}" 

2504 # .format(tagparts, tagsets, topics, related, 

2505 # alt_related, 

2506 # distw(titleparts, parts[i - 1]))) 

2507 if ( 

2508 topics 

2509 or not tagsets 

2510 or any("error-unknown-tag" in x for x in tagsets) 

2511 ): 

2512 if alt_related is not None: 2512 ↛ 2514line 2512 didn't jump to line 2514 because the condition on line 2512 was never true

2513 # We already had a good division, so let's stop. 

2514 break 

2515 # Bad division, try deeper 

2516 continue 

2517 # print(f"{parts[i-1]=}, {parts=}") 

2518 if ( 

2519 i > 1 

2520 and len(parts[i - 1]) >= 4 

2521 and ( 

2522 distw(titleparts, parts[i - 1]) <= 0.4 

2523 or ( 

2524 wxr.wtp.section == "English" 

2525 and wxr.wtp.title 

2526 in WORDS_WITH_FALSE_POSITIVE_TAGS 

2527 and parts[i - 1] 

2528 in WORDS_WITH_FALSE_POSITIVE_TAGS[wxr.wtp.title] 

2529 ) 

2530 ) 

2531 # Fixes 'unaccountability' wiktext #1196 

2532 and not ( 

2533 wxr.wtp.section == "English" 

2534 and wxr.wtp.title in WORDS_WITH_FALSE_POSITIVE_FORMS 

2535 and parts[i - 1] 

2536 in WORDS_WITH_FALSE_POSITIVE_FORMS[wxr.wtp.title] 

2537 ) 

2538 # Fixes wiktextract #983, where "participle" 

2539 # was too close to "Martinize" and so this accepted 

2540 # ["participle", "Martinize"] as matching; this 

2541 # kludge prevents this from happening if titleparts 

2542 # is shorter than what would be 'related'. 

2543 # This breaks if we want to detect stuff that 

2544 # actually gets an extra space-separated word when 

2545 # 'inflected'. 

2546 and ( 

2547 len(titleparts) >= len(parts[i - 1 :]) 

2548 or "or" in parts[i - 1 :] 

2549 ) 

2550 ): 

2551 # print(f"Reached; {parts=}, {parts[i-1]=}") 

2552 alt_related = related 

2553 alt_tagsets = tagsets 

2554 continue 

2555 alt_related = None 

2556 alt_tagsets = None 

2557 break 

2558 else: 

2559 if alt_related is None: 2559 ↛ 2591line 2559 didn't jump to line 2591 because the condition on line 2559 was always true

2560 # Check if the parenthesized part is likely a 

2561 # romanization 

2562 if ( 2562 ↛ 2570line 2562 didn't jump to line 2570 because the condition on line 2562 was never true

2563 (have_ruby or classify_desc(base) == "other") 

2564 and classify_desc(paren) == "romanization" 

2565 and not ( 

2566 "categories" in data 

2567 and desc in data["categories"] 

2568 ) 

2569 ): 

2570 for r in split_at_comma_semi( 

2571 paren, extra=[" or "], skipped=links 

2572 ): 

2573 add_romanization( 

2574 wxr, 

2575 data, 

2576 r, 

2577 text, 

2578 is_reconstruction, 

2579 head_group, 

2580 ruby, 

2581 ) 

2582 have_romanization = True 

2583 continue 

2584 tagsets = [("error-unrecognized-head-form",)] 

2585 wxr.wtp.debug( 

2586 "unrecognized head form: {}".format(desc), 

2587 sortid="form_descriptions/1698", 

2588 ) 

2589 continue 

2590 

2591 if alt_related is not None: 2591 ↛ 2592line 2591 didn't jump to line 2592 because the condition on line 2591 was never true

2592 related = alt_related 

2593 tagsets = alt_tagsets 

2594 

2595 # print("FORM END: tagsets={} related={}".format(tagsets, related)) 

2596 # print("==================") 

2597 

2598 if ( 2598 ↛ 2619line 2598 didn't jump to line 2619 because the condition on line 2598 was never true

2599 len(related) <= 0 

2600 and wxr.wtp.section == "English" 

2601 and tagsets is not None 

2602 and len(tagsets) > 0 

2603 and not any( 

2604 s.startswith("error-") for tagset in tagsets for s in tagset 

2605 ) 

2606 and any( 

2607 s in FORM_ASSOCIATED_TAG_WORDS 

2608 for tagset in tagsets 

2609 for s in tagset 

2610 ) 

2611 and ( 

2612 wxr.wtp.title not in FALSE_POSITIVE_MISSING_FORMS 

2613 and not any( 

2614 rel in FALSE_POSITIVE_MISSING_FORMS[wxr.wtp.title or ""] 

2615 for rel in related 

2616 ) 

2617 ) 

2618 ): 

2619 wxr.wtp.debug( 

2620 f"Form tags without form: {desc=}, {tagsets=}", 

2621 sortid="form_description/20250107", 

2622 ) 

2623 if not tagsets: 2623 ↛ 2624line 2623 didn't jump to line 2624 because the condition on line 2623 was never true

2624 continue 

2625 

2626 # print(f"{alts=}, {related=}") 

2627 

2628 assert isinstance(related, (list, tuple)) 

2629 related_str = " ".join(related) 

2630 if "or" in titleparts: 

2631 alts = [related_str] 

2632 else: 

2633 alts = split_at_comma_semi( 

2634 related_str, separators=[r"\bor\b"], skipped=links 

2635 ) 

2636 # print(f"{related_str=}, {alts=}") 

2637 if not alts: 

2638 alts = [""] 

2639 for related_str in alts: 

2640 if related_str: 

2641 if prev_tags and ( 

2642 all( 

2643 all( 

2644 t in ["nonstandard", "dialectal"] 

2645 or valid_tags[t] == "dialect" 

2646 for t in tags 

2647 ) 

2648 for ts in tagsets 

2649 ) 

2650 or ( 

2651 any("participle" in ts for ts in prev_tags) 

2652 and all( 

2653 "attributive" in ts 

2654 or any(valid_tags[t] == "gender" for t in ts) 

2655 for ts in tagsets 

2656 ) 

2657 ) 

2658 ): 

2659 # Merged with previous tags. Don't update previous 

2660 # tags here; cf. burn/English/Verb 

2661 for tags_l in tagsets: 

2662 for ts in prev_tags: 

2663 tags_l1 = sorted(set(tags_l) | set(ts)) 

2664 add_related( 

2665 wxr, 

2666 data, 

2667 tags_l1, 

2668 [related_str], 

2669 text, 

2670 True, 

2671 is_reconstruction, 

2672 head_group, 

2673 ruby, 

2674 ) 

2675 else: 

2676 # Not merged with previous tags 

2677 for tags_l in tagsets: 

2678 if following_tags is not None: 2678 ↛ 2679line 2678 didn't jump to line 2679 because the condition on line 2678 was never true

2679 for ts in following_tags: 

2680 tags_l1 = list( 

2681 sorted(set(tags_l) | set(ts)) 

2682 ) 

2683 add_related( 

2684 wxr, 

2685 data, 

2686 tags_l1, 

2687 [related_str], 

2688 text, 

2689 True, 

2690 is_reconstruction, 

2691 head_group, 

2692 ruby, 

2693 ) 

2694 else: 

2695 ret = add_related( 

2696 wxr, 

2697 data, 

2698 tags_l, 

2699 [related_str], 

2700 text, 

2701 True, 

2702 is_reconstruction, 

2703 head_group, 

2704 ruby, 

2705 ) 

2706 if ret is not None: 2706 ↛ 2707line 2706 didn't jump to line 2707 because the condition on line 2706 was never true

2707 following_tags = ret 

2708 prev_tags = tagsets 

2709 else: 

2710 if desc_i < len(new_desc) - 1 and all( 2710 ↛ 2717line 2710 didn't jump to line 2717 because the condition on line 2710 was never true

2711 "participle" in ts or "infinitive" in ts 

2712 for ts in tagsets 

2713 ): 

2714 # Interpret it as a standalone form description 

2715 # in the middle, probably followed by forms or 

2716 # language-specific descriptors. cf. drikke/Danish 

2717 new_prev_tags2 = [] 

2718 for ts1 in prev_tags or [()]: 

2719 for ts2 in tagsets: 

2720 ts = tuple(sorted(set(ts1) | set(ts2))) 

2721 new_prev_tags2.append(ts) 

2722 prev_tags = new_prev_tags2 

2723 continue 

2724 for tags in tagsets: 

2725 data_extend(data, "tags", tags) 

2726 prev_tags = tagsets 

2727 following_tags = None 

2728 

2729 # Finally, if we collected hiragana/katakana, add them now 

2730 if hiragana: 2730 ↛ 2731line 2730 didn't jump to line 2731 because the condition on line 2730 was never true

2731 add_related( 

2732 wxr, 

2733 data, 

2734 ["hiragana"], 

2735 [hiragana], 

2736 text, 

2737 True, 

2738 is_reconstruction, 

2739 head_group, 

2740 ruby, 

2741 ) 

2742 if katakana: 2742 ↛ 2743line 2742 didn't jump to line 2743 because the condition on line 2742 was never true

2743 add_related( 

2744 wxr, 

2745 data, 

2746 ["katakana"], 

2747 [katakana], 

2748 text, 

2749 True, 

2750 is_reconstruction, 

2751 head_group, 

2752 ruby, 

2753 ) 

2754 

2755 # XXX check if this is actually relevant, tags in word root data 

2756 # is extremely rare (not sure where they slip through). 

2757 tags = data.get("tags", []) # type:ignore 

2758 if len(tags) > 0: 

2759 # wxr.wtp.debug( 

2760 # f"Tags appear in word root data: {data['tags']=}", # type:ignore 

2761 # sortid="form_descriptions/2620/20240606", 

2762 # ) # Messes up tests. 

2763 data["tags"] = sorted(set(tags)) # type:ignore 

2764 

2765 

def parse_sense_qualifier(
    wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData]
) -> None:
    """Parse a sense/linkage qualifier string into tags, topics, taxonomic
    names, and free-form qualifiers, storing the results in ``data``."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(text, str)
    assert isinstance(data, dict)
    # print("parse_sense_qualifier:", text)
    # Strip one level of surrounding parentheses or double quotes, if present
    if re.match(r"\([^()]+\)$", text):
        text = text[1:-1]
    if re.match(r'"[^"]+"$', text):
        text = text[1:-1]
    collected_tags: list[str] = []
    for piece in map_with(xlat_descs_map, [text]):
        for chunk in split_at_comma_semi(piece):
            if not chunk:
                continue
            raw_chunk = chunk
            # Only the part preceding a colon is classified
            colon_pos = chunk.find(":")
            if colon_pos >= 0:
                chunk = chunk[:colon_pos]
            category = classify_desc(chunk, allow_unknown_tags=True)
            # print("parse_sense_qualifier: classify_desc: {} -> {}"
            #       .format(chunk, category))
            if category == "tags":
                tagsets, topics = decode_tags(chunk)
                data_extend(data, "topics", topics)
                # XXX should think how to handle distinct options better,
                # e.g., "singular and plural genitive"; that can't really be
                # done with changing the calling convention of this function.
                # Should split sense if more than one category of tags differs.
                for ts in tagsets:
                    collected_tags.extend(ts)
            elif category == "taxonomic":
                if re.match(r"×[A-Z]", chunk):
                    # A leading × marks an extinct taxon
                    collected_tags.append("extinct")
                    chunk = chunk[1:]
                data["taxonomic"] = chunk
            elif category == "english":
                # Free-form English text becomes (or is appended to) the
                # qualifier, using the un-truncated chunk
                if "qualifier" in data and data["qualifier"] != raw_chunk:
                    data["qualifier"] += "; " + raw_chunk
                else:
                    data["qualifier"] = raw_chunk
            else:
                wxr.wtp.debug(
                    "unrecognized sense qualifier: {}".format(piece),
                    sortid="form_descriptions/1831",
                )
    data_extend(data, "tags", sorted(set(collected_tags)))

2818 

2819 

def parse_pronunciation_tags(
    wxr: WiktextractContext, text: str, data: SoundData
) -> None:
    """Parse pronunciation qualifier text into tags/topics on ``data``;
    anything not recognized as tags is collected into ``data["note"]``."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(text, str)
    assert isinstance(data, dict)
    stripped = text.strip()
    if not stripped:
        return
    leftover_notes: list[str] = []
    if classify_desc(stripped) == "tags":
        tagsets, topics = decode_tags(stripped)
        data_extend(data, "topics", topics)
        for ts in tagsets:
            for tag in ts:
                # Multi-word "tags" are really free-form notes
                if " " in tag:
                    leftover_notes.append(tag)
                else:
                    data_append(data, "tags", tag)
    else:
        leftover_notes.append(stripped)
    if leftover_notes:
        data["note"] = "; ".join(leftover_notes)

2845 

def parse_translation_desc(
    wxr: WiktextractContext, lang: str, text: str, tr: TranslationData
) -> None:
    """Parse one translation item string ``text`` (in language ``lang``),
    filling fields of ``tr`` (a TranslationData dict) in place.

    Parenthesized expressions are peeled off the end, start, or middle of
    ``text`` one per loop iteration and classified with classify_desc();
    depending on the result they become tags/topics, a "note", the
    "translation"/"english" gloss, "roman", "taxonomic", or "alt".  What
    remains of ``text`` afterwards becomes ``tr["word"]``.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(lang, str)  # The language of ``text``
    assert isinstance(text, str)
    assert isinstance(tr, dict)
    # print("parse_translation_desc:", text)

    # Process all parenthesized parts from the translation item
    note = None
    restore_beginning = ""  # Re-attached before the word at the end
    restore_end = ""  # Re-attached after the word at the end
    while True:
        beginning = False
        # See if we can find a parenthesized expression at the end
        m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text)
        if m:
            par = m.group(1)
            text = text[: m.start()]
            if par.startswith(("literally ", "lit.")):
                continue  # Not useful for disambiguation in many idioms
        else:
            # See if we can find a parenthesized expression at the start
            m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text)
            if m:
                par = m.group(1)
                text = text[m.end() :]
                beginning = True
                if re.match(r"^(\d|\s|,| or | and )+$", par):
                    # Looks like this beginning parenthesized expression only
                    # contains digits or their combinations.  We assume such
                    # to be sense descriptions if no sense has been selected,
                    # or otherwise just ignore them.
                    if not tr.get("sense"):
                        tr["sense"] = par
                    continue
            else:
                # See if we can find a parenthesized expression in the middle.
                # Romanizations are sometimes between word and gender marker,
                # e.g. wife/English/Tr/Yiddish.
                m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text)
                if m:
                    par = m.group(1)
                    text = text[: m.start()] + text[m.end() :]
                else:
                    # No more parenthesized expressions - break out of the loop
                    break

        # Some cleanup of artifacts that may result from skipping some
        # templates in earlier stages
        if par.startswith(": "):
            par = par[2:]
        if par.endswith(","):
            par = par[:-1]
        if re.match(r'^[“"]([^“”"]*)[“”"]$', par):
            par = par[1:-1]
        par = par.strip()

        # Check for special script pronunciation followed by romanization,
        # used in many Asian languages.
        lst = par.split(", ")
        if len(lst) == 2:
            a, r = lst
            if classify_desc(a) == "other":
                cls = classify_desc(r)
                # print("parse_translation_desc: r={} cls={}".format(r, cls))
                if cls == "romanization" or (
                    cls == "english" and len(r.split()) == 1 and r[0].islower()
                ):
                    if tr.get("alt") and tr.get("alt") != a:
                        wxr.wtp.debug(
                            'more than one value in "alt": {} vs. {}'.format(
                                tr["alt"], a
                            ),
                            sortid="form_descriptions/1930",
                        )
                    tr["alt"] = a
                    if tr.get("roman") and tr.get("roman") != r:
                        wxr.wtp.debug(
                            'more than one value in "roman": {} vs. {}'.format(
                                tr["roman"], r
                            ),
                            sortid="form_descriptions/1936",
                        )
                    tr["roman"] = r
                    continue

        # Check for certain comma-separated tags combined with English text
        # at the beginning or end of a comma-separated parenthesized list
        while len(lst) > 1:
            cls = classify_desc(lst[0])
            if cls == "tags":
                tagsets, topics = decode_tags(lst[0])
                for t in tagsets:
                    data_extend(tr, "tags", t)
                data_extend(tr, "topics", topics)
                lst = lst[1:]
                continue
            cls = classify_desc(lst[-1])
            if cls == "tags":
                tagsets, topics = decode_tags(lst[-1])
                for t in tagsets:
                    data_extend(tr, "tags", t)
                data_extend(tr, "topics", topics)
                lst = lst[:-1]
                continue
            break
        par = ", ".join(lst)

        if not par:
            continue
        if re.search(tr_ignored_parens_re, par):
            continue
        if par.startswith("numeral:"):
            par = par[8:].strip()

        # Classify the part in parenthesis and process accordingly
        cls = classify_desc(par)
        # print("parse_translation_desc classify: {!r} -> {}"
        #       .format(par, cls))
        if par == text:
            pass
        # NOTE(review): the ``pass`` above has no effect because the next
        # test is ``if``, not ``elif`` — when par == text the code still
        # falls through to the checks below.  Possibly an intended elif
        # chain; confirm before changing.
        if par == "f":
            data_append(tr, "tags", "feminine")
        elif par == "m":
            data_append(tr, "tags", "masculine")
        elif cls == "tags":
            tagsets, topics = decode_tags(par)
            for tags in tagsets:
                data_extend(tr, "tags", tags)
            data_extend(tr, "topics", topics)
        elif cls == "english":
            # If the text contains any of certain grammatical words, treat it
            # as a "note" instead of "english"
            if re.search(tr_note_re, par):
                if par.endswith(":"):
                    par = par[:-1]
                if par not in ("see entry for forms",):
                    if note:
                        note = note + ";" + par
                    else:
                        note = par
            else:
                # There can be more than one parenthesized english item, see
                # e.g. Aunt/English/Translations/Tamil
                if "translation" in tr and "english" in tr:
                    tr["english"] += "; " + par  # DEPRECATED for "translation"
                    tr["translation"] += "; " + par
                else:
                    tr["english"] = par  # DEPRECATED for "translation"
                    tr["translation"] = par
        elif cls == "romanization":
            # print("roman text={!r} text cls={}"
            #       .format(text, classify_desc(text)))
            if classify_desc(text) in (
                "english",
                "romanization",
            ) and lang not in ("Egyptian",):
                # The remaining text looks latin itself, so this
                # parenthesized part was probably an optional word rather
                # than a romanization; keep it to be restored later.
                if beginning:
                    restore_beginning += "({}) ".format(par)
                else:
                    restore_end = " ({})".format(par) + restore_end
            else:
                if tr.get("roman"):
                    wxr.wtp.debug(
                        'more than one value in "roman": {} vs. {}'.format(
                            tr["roman"], par
                        ),
                        sortid="form_descriptions/2013",
                    )
                tr["roman"] = par
        elif cls == "taxonomic":
            if tr.get("taxonomic"):
                wxr.wtp.debug(
                    'more than one value in "taxonomic": {} vs. {}'.format(
                        tr["taxonomic"], par
                    ),
                    sortid="form_descriptions/2019",
                )
            if re.match(r"×[A-Z]", par):
                # Leading × marks an extinct taxon
                data_append(tr, "tags", "extinct")
                par = par[1:]
            tr["taxonomic"] = par
        elif cls == "other":
            if tr.get("alt"):
                wxr.wtp.debug(
                    'more than one value in "alt": {} vs. {}'.format(
                        tr["alt"], par
                    ),
                    sortid="form_descriptions/2028",
                )
            tr["alt"] = par
        else:
            wxr.wtp.debug(
                "parse_translation_desc unimplemented cls {}: {}".format(
                    cls, par
                ),
                sortid="form_descriptions/2033",
            )

    # Check for gender indications in suffix
    text, final_tags = parse_head_final_tags(wxr, lang, text)
    data_extend(tr, "tags", final_tags)

    # Restore those parts that we did not want to remove (they are often
    # optional words or words that are always used with the given translation)
    text = restore_beginning + text + restore_end

    if note:
        tr["note"] = note.strip()
    if text and text not in ignored_translations:
        tr["word"] = text.strip()

    # Sometimes gender seems to be at the end of "roman" field, see e.g.
    # fire/English/Noun/Translations/Egyptian (for "oxidation reaction")
    roman = tr.get("roman")
    if roman:
        if roman.endswith(" f"):
            data_append(tr, "tags", "feminine")
            tr["roman"] = roman[:-2].strip()
        elif roman.endswith(" m"):
            data_append(tr, "tags", "masculine")
            tr["roman"] = roman[:-2].strip()

    # If the word now has "translation" field but no "roman" field, and
    # the word would be classified "other" (generally non-latin
    # characters), and the value in "translation" is only one lowercase
    # word, move it to "roman".  This happens semi-frequently when the
    # translation is transliterated the same as some English word.
    roman = tr.get("roman")
    english = tr.get("translation")
    if english and not roman and "word" in tr:
        cls = classify_desc(tr["word"])
        if cls == "other" and " " not in english and english[0].islower():
            del tr["translation"]
            if "english" in tr:  # DEPRECATED for "translation"
                del tr["english"]
            tr["roman"] = english

    # If the entry now has both tr["roman"] and tr["word"] and they have
    # the same value, delete tr["roman"] (e.g., man/English/Translations
    # Evenki)
    if tr.get("word") and tr.get("roman") == tr.get("word"):
        del tr["roman"]

3091 

3092 

def parse_alt_or_inflection_of(
    wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
    """Tries to parse an inflection-of or alt-of description.  If successful,
    this returns (tags, alt-of/inflection-of-dict).  If the description cannot
    be parsed, this returns None.  This may also return (tags, None) when the
    gloss describes a form (or some other tags were extracted from it), but
    there was no alt-of/form-of/synonym-of word."""
    # print("parse_alt_or_inflection_of: {!r}".format(gloss))
    # Occasionally inflection_of/alt_of have "A(n) " etc. at the beginning.

    lowered = gloss.lower()
    title_lower = wxr.wtp.title.lower()  # type:ignore[union-attr]
    # Never interpret a gloss that is (nearly) equal to the page title
    # itself as a tag (e.g., instrumental/Romanian, instrumental/Spanish).
    if lowered == title_lower:
        return None
    if len(gloss) >= 5 and distw([lowered], title_lower) < 0.2:
        return None

    # First attempt: parse the gloss exactly as given.
    result = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
    if result is None and gloss and gloss[0].isupper():
        # Second attempt: lowercase a leading capital and retry.
        result = parse_alt_or_inflection_of1(
            wxr, gloss[0].lower() + gloss[1:], gloss_template_args
        )
    return result

3125 

3126 

# Tags that must never appear in alt-or-inflection-of parsing results
alt_infl_disallowed: set[str] = {
    "error-unknown-tag",
    # Not in inflected forms and causes problems e.g. house/English
    "place",
}

3134 

3135 

def parse_alt_or_inflection_of1(
    wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
    """Helper function for parse_alt_or_inflection_of.  This handles a single
    capitalization.

    Splits ``gloss`` into a tag description part and a ``base`` (the linked
    word), decodes the tags, cleans up the base, and returns
    (tags, list-of-AltOf-dicts) or (tags, None) or None on failure.
    """
    if not gloss or not gloss.strip():
        return None

    # Prevent some common errors where we would parse something we shouldn't
    if re.search(r"(?i)form of address ", gloss):
        return None

    gloss = re.sub(r"only used in [^,]+, ", "", gloss)

    # First try all formats ending with "of" (or other known last words that
    # can end a form description).  We scan the split points right-to-left so
    # the longest tag prefix is tried first.
    matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss))
    m: Optional[re.Match]
    for m in reversed(matches):
        desc = gloss[: m.end()].strip()
        base = gloss[m.end() :].strip()
        tagsets, topics = decode_tags(desc, no_unknown_starts=True)
        if not topics and any(
            not (alt_infl_disallowed & set(ts)) for ts in tagsets
        ):
            # Successfully parsed, including "of" etc.
            tags: list[str] = []
            # If you have ("Western-Armenian", ..., "form-of") as your
            # tag set, it's most probable that it's something like
            # "Western Armenian form of խոսել (xosel)", which should
            # get "alt-of" instead of "form-of" (inflection).
            # խօսիլ/Armenian
            for ts_t in tagsets:
                if "form-of" in ts_t and any(
                    valid_tags.get(tk) == "dialect" for tk in ts_t
                ):
                    ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"}
                else:
                    ts_s = set(ts_t)
                if not (alt_infl_disallowed & ts_s):
                    tags.extend(ts_s)
            if (
                "alt-of" in tags
                or "form-of" in tags
                or "synonym-of" in tags
                or "compound-of" in tags
            ):
                break
        if m.group(1) == "of":
            # Try parsing without the final "of".  This is commonly used in
            # various form-of expressions.
            desc = gloss[: m.start()]
            base = gloss[m.end() :]
            tagsets, topics = decode_tags(desc, no_unknown_starts=True)
            # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}"
            #       .format(desc, base, tagsets, topics))
            if not topics and any(
                not (alt_infl_disallowed & set(t)) for t in tagsets
            ):
                tags = []
                for t in tagsets:
                    if not (alt_infl_disallowed & set(t)):
                        tags.extend(t)
                # It must have at least one tag from form_of_tags
                if set(tags) & form_of_tags:
                    # Accept this as form-of
                    tags.append("form-of")
                    break
                if set(tags) & alt_of_tags:
                    # Accept this as alt-of
                    tags.append("alt-of")
                    break

    else:
        # for-else: no split point matched.  Did not find a form description
        # based on last word; see if the whole description is tags.
        tagsets, topics = decode_tags(gloss, no_unknown_starts=True)
        if not topics and any(
            not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts)
            for ts in tagsets
        ):
            tags = []
            for ts in tagsets:
                if not (alt_infl_disallowed & set(ts)) and form_of_tags & set(
                    ts
                ):
                    tags.extend(ts)
            base = ""
        else:
            return None

    # kludge for Spanish (again): 'x of [word] combined with [clitic]'
    m = re.search(r"combined with \w+$", base)
    if m:
        tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True)
        if not topics:
            for ts in tagsets:
                tags.extend(ts)
            base = base[: m.start()]

    # It is fairly common for form_of glosses to end with something like
    # "ablative case" or "in instructive case".  Parse that ending.
    base = base.strip()
    lst = base.split()
    # print("parse_alt_or_inflection_of: lst={}".format(lst))
    if len(lst) >= 3 and lst[-1] in ("case", "case."):
        node = valid_sequences.children.get(lst[-2])
        if node and node.end:
            for s in node.tags:
                tags.extend(s.split(" "))
            lst = lst[:-2]
            if lst[-1] == "in" and len(lst) > 1:
                lst = lst[:-1]
    # NOTE(review): the trimmed ``lst`` above is never joined back into
    # ``base``, so the "... case" suffix stays in the base word even though
    # its tags were extracted — confirm whether that is intended.

    # Eliminate empty and duplicate tags
    tags = sorted(set(t for t in tags if t))

    # Clean up some extra stuff from the linked word, separating the text
    # into ``base`` (the linked word) and ``extra`` (additional information,
    # such as English translation or clarifying word sense information).
    orig_base = base
    base = re.sub(alt_of_form_of_clean_re, "", orig_base)
    base = re.sub(r" [(⟨][^()]*[)⟩]", "", base)  # Remove all (...) groups
    extra = orig_base[len(base) :]
    extra = re.sub(r"^[- :;.,,—]+", "", extra)
    if extra.endswith(".") and extra.count(".") == 1:
        extra = extra[:-1].strip()
    m = re.match(r"^\(([^()]*)\)$", extra)
    if m:
        extra = m.group(1)
    else:
        # These weird backets used in "slash mark"
        m = re.match(r"^⟨([^()]*)⟩$", extra)
        if m:
            extra = m.group(1)
    m = re.match(r'^[“"]([^"“”]*)["”]$', extra)
    if m:
        extra = m.group(1)
    # Note: base might still contain comma-separated values and values
    # separated by "and"
    base = base.strip()
    if base.endswith(",") and len(base) > 2:
        base = base[:-1].strip()
    while (
        base.endswith(".")
        and not wxr.wtp.page_exists(base)
        and base not in gloss_template_args
    ):
        base = base[:-1].strip()
    if base.endswith('(\u201cconjecture")'):
        base = base[:-14].strip()
        tags.append("conjecture")
    # NOTE(review): the following while loop duplicates the one above; it
    # only has an effect if the conjecture-stripping just re-exposed a
    # trailing dot — otherwise it is redundant.
    while (
        base.endswith(".")
        and not wxr.wtp.page_exists(base)
        and base not in gloss_template_args
    ):
        base = base[:-1].strip()
    if (
        base.endswith(".")
        and base not in gloss_template_args
        and base[:-1] in gloss_template_args
    ):
        base = base[:-1]
    base = base.strip()
    if not base:
        return tags, None

    # Kludge: Spanish verb forms seem to have a dot added at the end.
    # Remove it; we know of no Spanish verbs ending with a dot.
    language = wxr.wtp.section
    pos = wxr.wtp.subsection
    # print("language={} pos={} base={}".format(language, pos, base))
    if (
        base.endswith(".")
        and len(base) > 1
        and base[-2].isalpha()
        and (language == "Spanish" and pos == "Verb")
    ):
        base = base[:-1]

    # Split base to alternatives when multiple alternatives provided
    parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "])
    titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "")
    if (
        len(parts) <= 1
        or base.startswith("/")
        or base.endswith("/")
        or "/" in titleword
    ):
        parts = [base]
    # Split base to alternatives when of form "a or b" and "a" and "b" are
    # similar (generally spelling variants of the same word or similar words)
    if len(parts) == 1:
        pp = base.split()
        if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4:
            parts = [pp[0], pp[2]]

    # Create form-of/alt-of entries based on the extracted data
    dt_lst: list[AltOf] = []
    for p in parts:
        # Check for some suspicious base forms
        m = re.search(r"[.,] |[{}()]", p)
        if m and not wxr.wtp.page_exists(p):
            wxr.wtp.debug(
                "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p),
                sortid="form_descriptions/2278",
            )
        if p.startswith("*") and len(p) >= 3 and p[1].isalpha():
            # Strip reconstruction asterisk
            p = p[1:]
        dt: AltOf = {"word": p}
        if extra:
            dt["extra"] = extra
        dt_lst.append(dt)
    # print("alt_or_infl_of returning tags={} lst={} base={!r}"
    #       .format(tags, lst, base))
    return tags, dt_lst

3353 

3354 

@functools.lru_cache(maxsize=65536)
def classify_desc(
    desc: str,
    allow_unknown_tags=False,
    no_unknown_starts=False,
    accepted: Union[tuple[str, ...], frozenset[str]] = tuple(),
) -> str:
    """Determines whether the given description is most likely tags, english,
    a romanization, or something else.  Returns one of: "tags", "english",
    "romanization", or "other".  If ``allow_unknown_tags`` is True, then
    allow "tags" classification even when the only tags are those starting
    with a word in allowed_unknown_starts.

    Results are memoized via lru_cache, so all arguments must be hashable
    (``accepted`` must be a tuple or frozenset, never a list/set).
    """
    assert isinstance(desc, str)
    # Empty and whitespace-only strings are treated as "other"
    desc = desc.strip()
    if not desc:
        return "other"

    normalized_desc = unicodedata.normalize("NFKD", desc)

    # If it can be fully decoded as tags without errors, treat as tags
    tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts)
    for tagset in tagsets:
        assert isinstance(tagset, (list, tuple, set))
        if "error-unknown-tag" not in tagset and (
            topics or allow_unknown_tags or any(" " not in x for x in tagset)
        ):
            return "tags"

    # Check if it looks like the taxonomic name of a species
    if desc in known_species:
        return "taxonomic"
    desc1 = re.sub(r"^×([A-Z])", r"\1", desc)  # Drop hybrid/extinct ×
    desc1 = re.sub(r"\s*×.*", "", desc1)  # Drop trailing × cross part
    lst = desc1.split()
    if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts:
        have_non_english = 1 if lst[0].lower() not in english_words else 0
        for x in lst[1:]:
            if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"):
                continue
            if x[0].isupper():
                break
            if x not in english_words:
                have_non_english += 1
        else:
            # for-else: no uppercase word interrupted the scan.
            # Starts with known taxonomic term, does not contain uppercase
            # words (except allowed letters) and at least one word is not
            # English
            if have_non_english >= len(lst) - 1 and have_non_english > 0:
                return "taxonomic"

    # If all words are in our English dictionary, interpret as English.
    # [ -~] is regex black magic, "ALL CHARACTERS from space to tilde"
    # in ASCII.  Took me a while to figure out.
    if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1:
        if desc in english_words and desc[0].isalpha():
            return "english"  # Handles ones containing whitespace
        desc1 = re.sub(
            tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc
        )
        tokens = tokenizer.tokenize(desc1)
        if not tokens:
            return "other"
        # One bool per token: True if the token looks like an English word
        # (directly, or via simple plural/-ing/-ed/possessive/British-spelling
        # derivations, or hyphen/slash compounds of English words).
        lst_bool = list(
            x not in not_english_words
            and
            # not x.isdigit() and
            (
                x in english_words
                or x.lower() in english_words
                or x in known_firsts
                or x[0].isdigit()
                or x in accepted
                or
                # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or
                (
                    x.endswith("s") and len(x) >= 4 and x[:-1] in english_words
                )  # Plural
                or (
                    x.endswith("ies")
                    and len(x) >= 5
                    and x[:-3] + "y" in english_words
                )  # E.g. lily - lilies
                or (
                    x.endswith("ing")
                    and len(x) >= 5
                    and x[:-3] in english_words
                )  # E.g. bring - bringing
                or (
                    x.endswith("ing")
                    and len(x) >= 5
                    and x[:-3] + "e" in english_words
                )  # E.g., tone - toning
                or (
                    x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words
                )  # E.g. hang - hanged
                or (
                    x.endswith("ed")
                    and len(x) >= 5
                    and x[:-2] + "e" in english_words
                )  # E.g. atone - atoned
                or (x.endswith("'s") and x[:-2] in english_words)
                or (x.endswith("s'") and x[:-2] in english_words)
                or (
                    x.endswith("ise")
                    and len(x) >= 5
                    and x[:-3] + "ize" in english_words
                )
                or (
                    x.endswith("ised")
                    and len(x) >= 6
                    and x[:-4] + "ized" in english_words
                )
                or (
                    x.endswith("ising")
                    and len(x) >= 7
                    and x[:-5] + "izing" in english_words
                )
                or (
                    re.search(r"[-/]", x)
                    and all(
                        ((y in english_words and len(y) > 2) or not y)
                        for y in re.split(r"[-/]", x)
                    )
                )
            )
            for x in tokens
        )
        cnt = lst_bool.count(True)
        rejected_words = tuple(
            x for i, x in enumerate(tokens) if not lst_bool[i]
        )
        # Accept as English if at least one real word matched and (all tokens
        # matched, or all-but-one with a long word present, or >= 80% matched,
        # or >= 50% matched with all rejects in potentially_english_words).
        if (
            any(
                lst_bool[i] and x[0].isalpha() and len(x) > 1
                for i, x in enumerate(tokens)
            )
            and not desc.startswith("-")
            and not desc.endswith("-")
            and re.search(r"\w+", desc)
            and (
                cnt == len(lst_bool)
                or (
                    any(
                        lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens)
                    )
                    and cnt >= len(lst_bool) - 1
                )
                or cnt / len(lst_bool) >= 0.8
                or (
                    all(x in potentially_english_words for x in rejected_words)
                    and cnt / len(lst_bool) >= 0.50
                )
            )
        ):
            return "english"
    # Some translations have apparent pronunciation descriptions in /.../
    # which we'll put in the romanization field (even though they probably are
    # not exactly romanizations).
    if desc.startswith("/") and desc.endswith("/"):
        return "romanization"
    # If all characters are in classes that could occur in romanizations,
    # treat as romanization
    classes = list(
        unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK"
        for x in normalized_desc
    )
    classes1 = []
    num_latin = 0
    num_greek = 0
    # part = ""
    # for ch, cl in zip(normalized_desc, classes):
    #     part += f"{ch}({cl})"
    # print(part)
    for ch, cl in zip(normalized_desc, classes):
        if ch in (
            "'",  # ' in Arabic, / in IPA-like parenthesized forms
            ".",  # e.g., "..." in translations
            ";",
            ":",
            "!",
            "‘",
            "’",
            '"',
            "“",
            "”",
            "/",
            "?",
            "…",  # alternative to "..."
            "⁉",  # 見る/Japanese automatic transcriptions...
            "?",
            "!",
            "⁻",  # superscript -, used in some Cantonese roman, e.g. "we"
            "ʔ",
            "ʼ",
            "ʾ",
            "ʹ",
        ):  # ʹ e.g. in understand/English/verb Russian transl
            classes1.append("OK")
            continue
        if cl not in ("Ll", "Lu"):
            classes1.append(cl)
            continue
        try:
            name = unicodedata.name(ch)
            first = name.split()[0]
            if first == "LATIN":
                num_latin += 1
            elif first == "GREEK":
                num_greek += 1
            elif first == "COMBINING":  # Combining diacritic
                cl = "OK"
            elif re.match(non_latin_scripts_re, name):
                cl = "NO"  # Not acceptable in romanizations
        except ValueError:
            # unicodedata.name() raises for unnamed code points
            cl = "NO"  # Not acceptable in romanizations
        classes1.append(cl)
    # print("classify_desc: {!r} classes1: {}".format(desc, classes1))
    # print(set(classes1))
    if all(
        x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK")
        for x in classes1
    ):
        # Require some mostly-Latin letters; reject strings that are all
        # punctuation ("OK") or all digits ("Nd").
        if (
            (num_latin >= num_greek + 2 or num_greek == 0)
            and classes1.count("OK") < len(classes1)
            and classes1.count("Nd") < len(classes1)
        ):
            return "romanization"
    # Otherwise it is something else, such as hanji version of the word
    return "other"

3586 

3587 

def remove_text_in_parentheses(text: str) -> str:
    """Return ``text`` with every parenthesized span removed, including the
    parentheses themselves; nesting is tracked with a depth counter.

    Note: a stray ")" drives the depth negative, which also suppresses the
    text that follows it (matching the original behavior).
    """
    depth = 0
    kept: list[str] = []
    for ch in text:
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        elif depth == 0:
            kept.append(ch)
    return "".join(kept)