Coverage for src/wiktextract/extractor/en/form_descriptions.py: 70%

1319 statements  

coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1# Code for parsing linguistic form descriptions and tags for word senses 

2# (both the word entry head, i.e. its initial part and parenthesized parts, 

3# and tags at the beginning of word senses) 

4# 

5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

6 

7import functools 

8import re 

9import unicodedata 

10from typing import ( 

11 Any, 

12 Literal, 

13 Optional, 

14 Sequence, 

15 Union, 

16) 

17 

18import Levenshtein 

19from nltk import TweetTokenizer # type:ignore[import-untyped] 

20 

21from ...datautils import data_append, data_extend, split_at_comma_semi 

22from ...tags import ( 

23 alt_of_tags, 

24 form_of_tags, 

25 head_final_bantu_langs, 

26 head_final_bantu_map, 

27 head_final_numeric_langs, 

28 head_final_other_langs, 

29 head_final_other_map, 

30 head_final_semitic_langs, 

31 head_final_semitic_map, 

32 uppercase_tags, 

33 valid_tags, 

34 xlat_descs_map, 

35 xlat_head_map, 

36 xlat_tags_map, 

37) 

38from ...topics import topic_generalize_map, valid_topics 

39from ...wxr_context import WiktextractContext 

40from .english_words import ( 

41 english_words, 

42 not_english_words, 

43 potentially_english_words, 

44) 

45from .form_descriptions_known_firsts import known_firsts 

46from .taxondata import known_species 

47from .type_utils import ( 

48 AltOf, 

49 FormData, 

50 LinkageData, 

51 SenseData, 

52 SoundData, 

53 TranslationData, 

54 WordData, 

55) 

56 

57# Tokenizer for classify_desc() 

58tokenizer = TweetTokenizer() 

59 

60# These are ignored as the value of a related form in form head. 

61IGNORED_RELATED: set[str] = set( 

62 [ 

63 "-", 

64 "־", 

65 "᠆", 

66 "‐", 

67 "‑", 

68 "‒", 

69 "–", 

70 "—", 

71 "―", 

72 "−", 

73 "⸺", 

74 "⸻", 

75 "﹘", 

76 "﹣", 

77 "-", 

78 "?", 

79 "(none)", 

80 ] 

81) 

82 

83 

84# First words of unicodedata.name() that indicate scripts that cannot be 

85# accepted in romanizations or english (i.e., should be considered "other" 

86# in classify_desc()). 

87non_latin_scripts: list[str] = [ 

88 "ADLAM", 

89 "ARABIC", 

90 "ARABIC-INDIC", 

91 "ARMENIAN", 

92 "BALINESE", 

93 "BENGALI", 

94 "BRAHMI", 

95 "BRAILLE", 

96 "CANADIAN", 

97 "CHAKMA", 

98 "CHAM", 

99 "CHEROKEE", 

100 "CJK", 

101 "COPTIC", 

102 "COUNTING ROD", 

103 "CUNEIFORM", 

104 "CYRILLIC", 

105 "DOUBLE-STRUCK", 

106 "EGYPTIAN", 

107 "ETHIOPIC", 

108 "EXTENDED ARABIC-INDIC", 

109 "GEORGIAN", 

110 "GLAGOLITIC", 

111 "GOTHIC", 

112 "GREEK", 

113 "GUJARATI", 

114 "GURMUKHI", 

115 "HANGUL", 

116 "HANIFI ROHINGYA", 

117 "HEBREW", 

118 "HIRAGANA", 

119 "JAVANESE", 

120 "KANNADA", 

121 "KATAKANA", 

122 "KAYAH LI", 

123 "KHMER", 

124 "KHUDAWADI", 

125 "LAO", 

126 "LEPCHA", 

127 "LIMBU", 

128 "MALAYALAM", 

129 "MEETEI", 

130 "MYANMAR", 

131 "NEW TAI LUE", 

132 "NKO", 

133 "OL CHIKI", 

134 "OLD PERSIAN", 

135 "OLD SOUTH ARABIAN", 

136 "ORIYA", 

137 "OSMANYA", 

138 "PHOENICIAN", 

139 "SAURASHTRA", 

140 "SHARADA", 

141 "SINHALA", 

142 "SUNDANESE", 

143 "SYLOTI", 

144 "TAI THAM", 

145 "TAKRI", 

146 "TAMIL", 

147 "TELUGU", 

148 "THAANA", 

149 "THAI", 

150 "TIBETAN", 

151 "TIFINAGH", 

152 "TIRHUTA", 

153 "UGARITIC", 

154 "WARANG CITI", 

155 "YI", 

156] 

157non_latin_scripts_re = re.compile( 

158 r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b" 

159) 

160 

161# Sanity check xlat_head_map values 

162for k, v in xlat_head_map.items(): 

163 if v.startswith("?"): 

164 v = v[1:] 

165 for tag in v.split(): 

166 if tag not in valid_tags: 166 ↛ 167line 166 didn't jump to line 167 because the condition on line 166 was never true

167 print( 

168 "WARNING: xlat_head_map[{}] contains unrecognized tag {}".format( 

169 k, tag 

170 ) 

171 ) 

172 

173# Regexp for finding nested translations from translation items (these are 

174# used in, e.g., year/English/Translations/Arabic). This is actually used 

175# in page.py. 

176nested_translations_re = re.compile( 

177 r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format( 

178 "|".join( 

179 re.escape(x.removeprefix("?")) 

180 for x in sorted(xlat_head_map.values(), key=len, reverse=True) 

181 if x and not x.startswith("class-") 

182 ) 

183 ) 

184) 

185 

186# Regexp that matches head tag specifiers. Used to match tags from end of 

187# translations and linkages 

188head_final_re_text = r"( -)?( ({}))+".format( 

189 "|".join( 

190 re.escape(x) 

191 for x in 

192 # The sort is to put longer ones first, preferring them in 

193 # the regexp match 

194 sorted(xlat_head_map.keys(), key=len, reverse=True) 

195 ) 

196) 

197head_final_re = re.compile(head_final_re_text + "$") 

198 

199# Regexp used to match head tag specifiers at end of a form for certain 

200# Bantu languages (particularly Swahili and similar languages). 

201head_final_bantu_re_text = r" ({})".format( 

202 "|".join(re.escape(x) for x in head_final_bantu_map.keys()) 

203) 

204head_final_bantu_re = re.compile(head_final_bantu_re_text + "$") 

205 

206# Regexp used to match head tag specifiers at end of a form for certain 

207# Semitic languages (particularly Arabic and similar languages). 

208head_final_semitic_re_text = r" ({})".format( 

209 "|".join(re.escape(x) for x in head_final_semitic_map.keys()) 

210) 

211head_final_semitic_re = re.compile(head_final_semitic_re_text + "$") 

212 

213# Regexp used to match head tag specifiers at end of a form for certain 

214# other languages (e.g., Lithuanian, Finnish, French). 

215head_final_other_re_text = r" ({})".format( 

216 "|".join(re.escape(x) for x in head_final_other_map.keys()) 

217) 

218head_final_other_re = re.compile(head_final_other_re_text + "$") 

219 

220# Regexp for splitting heads. See parse_word_head(). 

221head_split_re_text = ( 

222 "(" 

223 + head_final_re_text 

224 + "|" 

225 + head_final_bantu_re_text 

226 + "|" 

227 + head_final_semitic_re_text 

228 + "|" 

229 + head_final_other_re_text 

230 + ")?( or |[,;]+)" 

231) 

232head_split_re = re.compile(head_split_re_text) 

233head_split_re_parens = 0 

234for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text): 

235 head_split_re_parens += m.group(0).count("(") 

236 

237# Parenthesized parts that are ignored in translations 

238tr_ignored_parens: set[str] = set( 

239 [ 

240 "please verify", 

241 "(please verify)", 

242 "transliteration needed", 

243 "(transliteration needed)", 

244 "in words with back vowel harmony", 

245 "(in words with back vowel harmony)", 

246 "in words with front vowel harmony", 

247 "(in words with front vowel harmony)", 

248 "see below", 

249 "see usage notes below", 

250 ] 

251) 

252tr_ignored_parens_re = re.compile( 

253 r"^(" 

254 + "|".join(re.escape(x) for x in tr_ignored_parens) 

255 + ")$" 

256 + r"|^(Can we clean up|Can we verify|for other meanings see " 

257 r"lit\. )" 

258) 

259 

260# Translations that are ignored 

261ignored_translations: set[str] = set( 

262 [ 

263 "[script needed]", 

264 "please add this translation if you can", 

265 ] 

266) 

267 

268# Put english text into the "note" field in a translation if it contains one 

269# of these words 

270tr_note_re = re.compile( 

271 r"(\b(article|definite|indefinite|superlative|comparative|pattern|" 

272 r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|" 

273 r"postposition|postp|action|actions|articles|" 

274 r"adverb|adverbs|noun|nouns|verb|verbs|before|" 

275 r"after|placed|prefix|suffix|used with|translated|" 

276 r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|" 

277 r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|" 

278 r"conjugation|declension|class|category|plural|singular|positive|" 

279 r"seldom used|formal|informal|familiar|unspoken|spoken|written|" 

280 r"indicative|progressive|conditional|potential|" 

281 r"accusative|adessive|inessive|superessive|elative|allative|" 

282 r"dialect|dialects|object|subject|predicate|movies|recommended|language|" 

283 r"locative|continuous|simple|continuousness|gerund|subjunctive|" 

284 r"periphrastically|no equivalent|not used|not always used|" 

285 r"used only with|not applicable|use the|signifying|wordplay|pronounced|" 

286 r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|" 

287 r"may be replaced|stricter sense|for nonhumans|" 

288 r"sense:|used:|in full:|informally used|followed by|" 

289 r"not restricted to|pertaining to|or optionally with|are optional|" 

290 r"in conjunction with|in compounds|depending on the relationship|" 

291 r"person addressed|one person|multiple persons|may be replaced with|" 

292 r"optionally completed with|in the phrase|in response to|" 

293 r"before a|before an|preceded by|verbs ending|very common|after a verb|" 

294 r"with verb|with uncountable|with the objects|with stative|" 

295 r"can be replaced by|often after|used before|used after|" 

296 r"used in|clipping of|spoken|somewhat|capitalized|" 

297 r"short form|shortening of|shortened form|initialism of|" 

298 r"said to|rare:|rarer also|is rarer|negatively connoted|" 

299 r"previously mentioned|uncountable noun|countable noun|" 

300 r"countable nouns|uncountable nouns|" 

301 r"with predicative|with -|with imperfect|with a negated|" 

302 r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|" 

303 r'"|' 

304 r"general term|after a vowel|before a vowel|" 

305 r"form|regular|irregular|alternative)" 

306 r")($|[) ])|^(" 

307 # Following are only matched at the beginning of the string 

308 r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:|e\.g\.,|cf\.|compare|such as|" 

309 r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|" 

310 r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|" 

311 r"mainly|from|for|also|also:|acronym|" 

312 r"\+|with) " 

313) 

314# \b does not work at the end??? 

315 

316# Related forms matching this regexp will be considered suspicious if the 

317# page title does not also match one of these. 

318suspicious_related_re = re.compile( 

319 r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)" 

320 r"|[][:=<>&#*|]" 

321 r"| \d+$" 

322) 

323 

324# Word forms (head forms, translations, etc) that will be considered ok and 

325# silently accepted even if they would otherwise trigger a suspicious 

326# form warning. 

327ok_suspicious_forms: set[str] = set( 

328 [ 

329 "but en or", # "golden goal"/English/Tr/French 

330 "cœur en or", # "heart of gold"/Eng/Tr/French 

331 "en or", # golden/Eng/Tr/French 

332 "men du", # jet/Etym2/Noun/Tr/Cornish 

333 "parachute en or", # "golden parachute"/Eng/Tr/French 

334 "vieil or", # "old gold"/Eng/Tr/French 

335 # "all that glitters is not gold"/Eng/Tr/French 

336 "tout ce qui brille n’est pas or", 

337 "μη αποκλειστικό or", # inclusive or/Eng/Tr/Greek 

338 "period or full stop", 

339 ] 

340) 

341 

342 

343# Replacements to be done in classify_desc before tokenizing. This is a 

344# workaround for shortcomings in TweetTokenizer. 

345tokenizer_fixup_map = { 

346 r"a.m.": "AM", 

347 r"p.m.": "PM", 

348} 

349tokenizer_fixup_re = re.compile( 

350 r"\b(" 

351 + "|".join( 

352 re.escape(x) 

353 for x in sorted( 

354 tokenizer_fixup_map.keys(), key=lambda x: len(x), reverse=True 

355 ) 

356 ) 

357 + r")" 

358) 

359 

360# Unknown tags starting with these words will be silently ignored. 

361ignored_unknown_starts: set[str] = set( 

362 [ 

363 "originally", 

364 "e.g.", 

365 "c.f.", 

366 "supplanted by", 

367 "supplied by", 

368 ] 

369) 

370 

371ignored_unknown_starts_re = re.compile( 

372 r"^(" 

373 + "|".join( 

374 re.escape(x) 

375 for x in sorted(ignored_unknown_starts, key=lambda x: -len(x)) 

376 ) 

377 + ") " 

378) 

379 

380# If an unknown sequence starts with one of these, it will continue as an 

381# unknown sequence until the end, unless it turns out to have a replacement. 

382allowed_unknown_starts: set[str] = set( 

383 [ 

384 "Relating", 

385 "accompanied", 

386 "added", 

387 "after", 

388 "answering", 

389 "as", 

390 "based", 

391 "before", 

392 "conjugated", 

393 "conjunction", 

394 "construed", 

395 "especially", 

396 "expression:", 

397 "figurative:", 

398 "followed", 

399 "for", 

400 "forms", 

401 "from", 

402 "governs", 

403 "in", 

404 "indicating", 

405 "modifying", 

406 "normally", 

407 "not", 

408 "of", 

409 "preceding", 

410 "prefixed", 

411 "referring", 

412 "relating", 

413 "revived", 

414 "said", 

415 "since", 

416 "takes", 

417 "used", 

418 "with", 

419 "With", 

420 "without", 

421 ] 

422) 

423# Allow the ignored unknown starts without complaining 

424allowed_unknown_starts.update(ignored_unknown_starts) 

425 

426# Full unknown tags that will be ignored in decode_tags() 

427 # XXX this is unused, ask Tatu where the contents are now 

428ignored_unknown_tags: set[str] = set([]) 

429 

430# Head endings that are mapped to tags 

431head_end_map = { 

432 " 1st conj.": "conjugation-1", 

433 " 2nd conj.": "conjugation-2", 

434 " 3rd conj.": "conjugation-3", 

435 " 4th conj.": "conjugation-4", 

436 " 5th conj.": "conjugation-5", 

437 " 6th conj.": "conjugation-6", 

438 " 7th conj.": "conjugation-7", 

439} 

440head_end_re = re.compile( 

441 r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$" 

442) 

443 

444# Words that can be part of form description 

445valid_words: set[str] = set(["or", "and"]) 

446for x in valid_tags: 

447 valid_words.update(x.split(" ")) 

448for x in xlat_tags_map.keys(): 

449 valid_words.update(x.split(" ")) 

450 

451 

452# Dictionary of language-specific parenthesized head part starts that 

453# either introduce new tags or modify previous tags. The value for each 

454# language is a dictionary that maps the first word of the head part to 

455# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous 

456# tags or a space-separated string of tags to remove, and ``add_tags`` should 

457# be a string of tags to add. 

458lang_specific_head_map: dict[ 

459 str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]] 

460] = { 

461 "Danish": { 

462 # prefix: (rem_tags space separate string/True, add_tags s-sep str) 

463 "c": ("neuter", "common-gender"), 

464 "n": ("common-gender", "neuter"), 

465 "pl": ("singular neuter common-gender", "plural"), 

466 "sg": ("plural neuter common-gender", "singular"), 

467 }, 

468} 

469 

470 

471# Regular expression used to strip additional stuff from the end of alt_of and 

472# form_of. 

473alt_of_form_of_clean_re = re.compile( 

474 r"(?s)(" 

475 + "|".join( 

476 [ 

477 r":", 

478 r'[“"]', 

479 r";", 

480 r" \(", 

481 r" - ", 

482 r" ־ ", 

483 r" ᠆ ", 

484 r" ‐ ", 

485 r" ‑ ", 

486 r" ‒ ", 

487 r" – ", 

488 r" — ", 

489 r" ― ", 

490 r" − ", 

491 r" ⸺ ", 

492 r" ⸻ ", 

493 r" ﹘ ", 

494 r" ﹣ ", 

495 r" - ", 

496 r" \+ ", 

497 r" \(with ", 

498 r" with -ra/-re", 

499 r"\. Used ", 

500 r"\. Also ", 

501 r"\. Since ", 

502 r"\. A ", 

503 r"\.\. A ", 

504 r"\. An ", 

505 r"\.\. An ", 

506 r"\. an ", 

507 r"\. The ", 

508 r"\. Spanish ", 

509 r"\. Language ", 

510 r"\. former name of ", 

511 r"\. AIM", 

512 r"\. OT", 

513 r"\. Not ", 

514 r"\. Now ", 

515 r"\. Nowadays ", 

516 r"\. Early ", 

517 r"\. ASEAN", 

518 r"\. UN", 

519 r"\. IMF", 

520 r"\. WHO", 

521 r"\. WIPO", 

522 r"\. AC", 

523 r"\. DC", 

524 r"\. DNA", 

525 r"\. RNA", 

526 r"\. SOB", 

527 r"\. IMO", 

528 r"\. Behavior", 

529 r"\. Income ", 

530 r"\. More ", 

531 r"\. Most ", 

532 r"\. Only ", 

533 r"\. Also ", 

534 r"\. From ", 

535 r"\. Of ", 

536 r"\.\. Of ", 

537 r"\. To ", 

538 r"\. For ", 

539 r"\. If ", 

540 r"\. Praenominal ", 

541 r"\. This ", 

542 r"\. Replaced ", 

543 r"\. CHCS is the ", 

544 r"\. Equivalent ", 

545 r"\. Initialism ", 

546 r"\. Note ", 

547 r"\. Alternative ", 

548 r"\. Compare ", 

549 r"\. Cf\. ", 

550 r"\. Comparable ", 

551 r"\. Involves ", 

552 r"\. Sometimes ", 

553 r"\. Commonly ", 

554 r"\. Often ", 

555 r"\. Typically ", 

556 r"\. Possibly ", 

557 r"\. Although ", 

558 r"\. Rare ", 

559 r"\. Instead ", 

560 r"\. Integrated ", 

561 r"\. Distinguished ", 

562 r"\. Given ", 

563 r"\. Found ", 

564 r"\. Was ", 

565 r"\. In ", 

566 r"\. It ", 

567 r"\.\. It ", 

568 r"\. One ", 

569 r"\. Any ", 

570 r"\. They ", 

571 r"\. Members ", 

572 r"\. Each ", 

573 r"\. Original ", 

574 r"\. Especially ", 

575 r"\. Usually ", 

576 r"\. Known ", 

577 r"\.\. Known ", 

578 r"\. See ", 

579 r"\. see ", 

580 r"\. target was not ", 

581 r"\. Popular ", 

582 r"\. Pedantic ", 

583 r"\. Positive ", 

584 r"\. Society ", 

585 r"\. Plan ", 

586 r"\. Environmentally ", 

587 r"\. Affording ", 

588 r"\. Encompasses ", 

589 r"\. Expresses ", 

590 r"\. Indicates ", 

591 r"\. Text ", 

592 r"\. Large ", 

593 r"\. Sub-sorting ", 

594 r"\. Sax", 

595 r"\. First-person ", 

596 r"\. Second-person ", 

597 r"\. Third-person ", 

598 r"\. 1st ", 

599 r"\. 2nd ", 

600 r"\. 3rd ", 

601 r"\. Term ", 

602 r"\. Northeastern ", 

603 r"\. Northwestern ", 

604 r"\. Southeast ", 

605 r"\. Egyptian ", 

606 r"\. English ", 

607 r"\. Cape Province was split into ", 

608 r"\. Pañcat", 

609 r"\. of the ", 

610 r"\. is ", 

611 r"\. after ", 

612 r"\. or ", 

613 r"\. chromed", 

614 r"\. percussion", 

615 r"\. with his ", 

616 r"\. a\.k\.a\. ", 

617 r"\. comparative form ", 

618 r"\. singular ", 

619 r"\. plural ", 

620 r"\. present ", 

621 r"\. his ", 

622 r"\. her ", 

623 r"\. equivalent ", 

624 r"\. measuring ", 

625 r"\. used in ", 

626 r"\. cutely ", 

627 r"\. Protects", 

628 r'\. "', 

629 r"\.^", 

630 r"\. \+ ", 

631 r"\., ", 

632 r". — ", 

633 r", a ", 

634 r", an ", 

635 r", the ", 

636 r", obsolete ", 

637 r", possessed", # 'd/English 

638 r", imitating", # 1/English 

639 r", derived from", 

640 r", called ", 

641 r", especially ", 

642 r", slang for ", 

643 r" corresponding to ", 

644 r" equivalent to ", 

645 r" popularized by ", 

646 r" denoting ", 

647 r" in its various senses\.", 

648 r" used by ", 

649 r" but not for ", 

650 r" since ", 

651 r" i\.e\. ", 

652 r" i\. e\. ", 

653 r" e\.g\. ", 

654 r" eg\. ", 

655 r" etc\. ", 

656 r"\[http", 

657 r" — used as ", 

658 r" by K\. Forsyth ", 

659 r" by J\. R\. Allen ", 

660 r" by S\. Ferguson ", 

661 r" by G\. Donaldson ", 

662 r" May refer to ", 

663 r" An area or region ", 

664 ] 

665 ) 

666 + r").*$" 

667) 

668 

669 

670class ValidNode: 

671 """Node in the valid_sequences tree. Each node is part of a chain 

672 or chains that form sequences built out of keys in key->tags 

673 maps like xlat_tags, etc. The ValidNode's 'word' is the key 

674 by which it is refered to in the root dict or a `children` dict, 

675 `end` marks that the node is the end-terminus of a sequence (but 

676 it can still continue if the sequence is shared by the start of 

677 other sequences: "nominative$" and "nominative plural$" for example), 

678 `tags` and `topics` are the dicts containing tag and topic strings 

679 for terminal nodes (end==True).""" 

680 

681 __slots__ = ( 

682 "end", 

683 "tags", 

684 "topics", 

685 "children", 

686 ) 

687 

688 def __init__( 

689 self, 

690 end=False, 

691 tags: Optional[list[str]] = None, 

692 topics: Optional[list[str]] = None, 

693 children: Optional[dict[str, "ValidNode"]] = None, 

694 ) -> None: 

695 self.end = end 

696 self.tags: list[str] = tags or [] 

697 self.topics: list[str] = topics or [] 

698 self.children: dict[str, "ValidNode"] = children or {} 

699 

700 

701def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None: 

702 """Helper function for building trees of valid tags/sequences during 

703 initialization.""" 

704 assert isinstance(tree, ValidNode) 

705 assert isinstance(desc, str) 

706 assert v is None or isinstance(v, str) 

707 node = tree 

708 

709 # Build the tree structure: each node has children nodes 

710 # whose names are denoted by their dict key. 

711 for w in desc.split(" "): 

712 if w in node.children: 

713 node = node.children[w] 

714 else: 

715 new_node = ValidNode() 

716 node.children[w] = new_node 

717 node = new_node 

718 if not node.end: 

719 node.end = True 

720 if not v: 

721 return None # Terminate early because there are no tags 

722 

723 tagslist = [] 

724 topicslist = [] 

725 for vv in v.split(): 

726 if vv in valid_tags: 

727 tagslist.append(vv) 

728 elif vv in valid_topics: 728 ↛ 731line 728 didn't jump to line 731 because the condition on line 728 was always true

729 topicslist.append(vv) 

730 else: 

731 print( 

732 "WARNING: tag/topic {!r} maps to unknown {!r}".format(desc, vv) 

733 ) 

734 topics = " ".join(topicslist) 

735 tags = " ".join(tagslist) 

736 # Store the space-joined tag/topic strings on the node's own lists. 

737 if topics: 

738 node.topics.extend([topics]) 

739 if tags: 

740 node.tags.extend([tags]) 

741 
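
To make the ValidNode/add_to_valid_tree structure concrete: multi-word keys are split on spaces, each word becomes a child node, and the node reached by the last word is marked as an end point carrying the tags. The following is a simplified, self-contained sketch of that trie idea, not the module's actual API:

class Node:
    def __init__(self):
        self.end = False
        self.tags = []
        self.children = {}

def insert(root, phrase, tags):
    node = root
    for w in phrase.split(" "):
        node = node.children.setdefault(w, Node())
    node.end = True
    if tags:
        node.tags.append(tags)

root = Node()
insert(root, "nominative", "nominative")
insert(root, "nominative plural", "nominative plural")

# Walking "nominative plural" passes through an intermediate end node
# ("nominative") and finishes at another end node carrying its own tags.
node = root
for w in "nominative plural".split(" "):
    node = node.children[w]
print(node.end, node.tags)  # True ['nominative plural']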

742 

743def add_to_valid_tree1( 

744 tree: ValidNode, 

745 k: str, 

746 v: Union[list[str], tuple[str, ...], str], 

747 valid_values: Union[set[str], dict[str, Any]], 

748) -> list[str]: 

749 assert isinstance(tree, ValidNode) 

750 assert isinstance(k, str) 

751 assert v is None or isinstance(v, (list, tuple, str)) 

752 assert isinstance(valid_values, (set, dict)) 

753 if not v: 753 ↛ 754line 753 didn't jump to line 754 because the condition on line 753 was never true

754 add_to_valid_tree(valid_sequences, k, None) 

755 return [] 

756 elif isinstance(v, str): 

757 v = [v] 

758 q = [] 

759 for vv in v: 

760 assert isinstance(vv, str) 

761 add_to_valid_tree(valid_sequences, k, vv) 

762 vvs = vv.split() 

763 for x in vvs: 

764 q.append(x) 

765 # return each individual tag 

766 return q 

767 

768 

769def add_to_valid_tree_mapping( 

770 tree: ValidNode, 

771 mapping: Union[dict[str, Union[list[str], str]], dict[str, str]], 

772 valid_values: Union[set[str], dict[str, Any]], 

773 recurse: bool, 

774) -> None: 

775 assert isinstance(tree, ValidNode) 

776 assert isinstance(mapping, dict) 

777 assert isinstance(valid_values, (set, dict)) 

778 assert recurse in (True, False) 

779 for k, v in mapping.items(): 

780 assert isinstance(k, str) 

781 assert isinstance(v, (list, str)) 

782 if isinstance(v, str): 

783 q = add_to_valid_tree1(tree, k, [v], valid_values) 

784 else: 

785 q = add_to_valid_tree1(tree, k, v, valid_values) 

786 if recurse: 

787 visited = set() 

788 while q: 

789 v = q.pop() 

790 if v in visited: 

791 continue 

792 visited.add(v) 

793 if v not in mapping: 

794 continue 

795 vv = mapping[v] 

796 qq = add_to_valid_tree1(tree, k, vv, valid_values) 

797 q.extend(qq) 

798 

799 

800# Tree of sequences considered to be tags (includes sequences that are 

801# mapped to something that becomes one or more valid tags) 

802valid_sequences = ValidNode() 

803sequences_with_slashes: set[str] = set() 

804for tag in valid_tags: 

805 # The basic tags used in our tag system; some are a bit weird, but it is 

806 # easier to accept a few false positives here than to filter out strings 

807 # that no one else uses. 

808 if "/" in tag: 

809 sequences_with_slashes.add(tag) 

810 add_to_valid_tree(valid_sequences, tag, tag) 

811for tag in uppercase_tags: 

812 hyphenated = re.sub(r"\s+", "-", tag) 

813 if hyphenated in valid_tags: 813 ↛ 814line 813 didn't jump to line 814 because the condition on line 813 was never true

814 print( 

815 "DUPLICATE TAG: {} (from uppercase tag {!r})".format( 

816 hyphenated, tag 

817 ) 

818 ) 

819 assert hyphenated not in valid_tags 

820 # Might as well, while we're here: Add hyphenated location tag. 

821 valid_tags[hyphenated] = "dialect" 

822 add_to_valid_tree(valid_sequences, hyphenated, hyphenated) 

823for tag in uppercase_tags: 

824 hyphenated = re.sub(r"\s+", "-", tag) 

825 # XXX Move to above loop? Or is this here for readability? 

826 if "/" in tag: 

827 sequences_with_slashes.add(tag) 

828 add_to_valid_tree(valid_sequences, tag, hyphenated) 

829# xlat_tags_map! 

830add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False) 

831for k in xlat_tags_map: 

832 if "/" in k: 

833 sequences_with_slashes.add(k) 

834# Add topics to the same table, with all generalized topics also added 

835for topic in valid_topics: 

836 assert " " not in topic 

837 if "/" in topic: 837 ↛ 838line 837 didn't jump to line 838 because the condition on line 837 was never true

838 sequences_with_slashes.add(topic) 

839 add_to_valid_tree(valid_sequences, topic, topic) 

840# Let each original topic value stand alone. These are not generally on 

841# valid_topics. We add the original topics with spaces replaced by hyphens. 

842for topic in topic_generalize_map.keys(): 

843 hyphenated = topic.replace(" ", "-") 

844 valid_topics.add(hyphenated) 

845 if "/" in topic: 845 ↛ 846line 845 didn't jump to line 846 because the condition on line 845 was never true

846 sequences_with_slashes.add(topic) 

847 add_to_valid_tree(valid_sequences, topic, hyphenated) 

848# Add canonicalized/generalized topic values 

849add_to_valid_tree_mapping( 

850 valid_sequences, topic_generalize_map, valid_topics, True 

851) 

852 

853# Regex used to divide a decode candidate into parts that shouldn't 

854# have their slashes turned into spaces 

855slashes_re = re.compile( 

856 r"(" + "|".join((re.escape(s) for s in sequences_with_slashes)) + r")" 

857) 

858 

859# Regexp used to find "words" from word heads and linguistic descriptions 

860word_pattern = ( 

861 r"[^ ,;()\u200e]+|" 

862 r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|" 

863 r"[\u2800-\u28ff]|" # Braille characters 

864 r"\(([^()]|\([^()]*\))*\)" 

865) 

866 

867word_re_global = re.compile(word_pattern) 

868 

869 

870def distw(titleparts: Sequence[str], word: str) -> float: 

871 """Computes how distinct ``word`` is from the most similar word in 

872 ``titleparts``. Returns 1 if the words are completely distinct, 0 if 

873 identical, or otherwise something in between.""" 

874 assert isinstance(titleparts, (list, tuple)) 

875 assert isinstance(word, str) 

876 w = min( 

877 Levenshtein.distance(word, tw) / max(len(tw), len(word)) 

878 for tw in titleparts 

879 ) 

880 return w 

881 
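
A small usage sketch of the measure computed by distw() above: the Levenshtein distance to each title word is normalized by the longer of the two lengths, and the minimum over the title words is returned. The snippet below mirrors that formula directly, assuming the Levenshtein package imported by this module is installed:

import Levenshtein

def distw_demo(titleparts, word):
    # Same formula as distw(): normalized edit distance to the closest title word.
    return min(
        Levenshtein.distance(word, tw) / max(len(tw), len(word))
        for tw in titleparts
    )

print(distw_demo(["dog", "cat"], "dogs"))  # 0.25 (one edit over length 4)
print(distw_demo(["dog"], "dog"))          # 0.0  (identical)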

882 

883def map_with( 

884 ht: Union[dict[str, Union[str, list[str]]], dict[str, str]], 

885 lst: Sequence[str], 

886) -> list[str]: 

887 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or 

888 more alternatives each, and returns a combined list of alternatives.""" 

889 assert isinstance(ht, dict) 

890 assert isinstance(lst, (list, tuple)) 

891 ret = [] 

892 for x in lst: 

893 assert isinstance(x, str) 

894 x = x.strip() 

895 x = ht.get(x, x) 

896 if isinstance(x, str): 896 ↛ 899line 896 didn't jump to line 899 because the condition on line 896 was always true

897 if x: 897 ↛ 892line 897 didn't jump to line 892 because the condition on line 897 was always true

898 ret.append(x) 

899 elif isinstance(x, (list, tuple)): 

900 ret.extend(x) 

901 else: 

902 raise RuntimeError("map_with unexpected value: {!r}".format(x)) 

903 return ret 

904 
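
For illustration, here is what map_with() does with string and list values. The mapping table below is made up, and the import path assumes the installed package layout mirrors src/wiktextract/extractor/en/form_descriptions.py:

from wiktextract.extractor.en.form_descriptions import map_with

# A made-up mapping: string values replace the item, list values expand it,
# and unmapped items pass through stripped.
ht = {"m": "masculine", "m pl": ["masculine", "plural"]}
print(map_with(ht, [" m ", "m pl", "dual"]))
# ['masculine', 'masculine', 'plural', 'dual']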

905 

906TagList = list[str] 

907PosPathStep = tuple[int, TagList, TagList] 

908 

909 

910def check_unknown( 

911 from_i: int, 

912 to_i: int, 

913 i: int, 

914 wordlst: Sequence[str], 

915 allow_any: bool, 

916 no_unknown_starts: bool, 

917) -> list[PosPathStep]: 

918 """Check if the current section from_i->to_i is actually unknown 

919 or if it needs some special handling. We already presupposed that 

920 this is UNKNOWN; this is just called to see what *kind* of UNKNOWN.""" 

921 assert isinstance(to_i, int) 

922 assert isinstance(from_i, int) 

923 assert isinstance(i, int) 

924 # Adds unknown tag if needed. Returns new last_i 

925 # print("check_unknown to_i={} from_i={} i={}" 

926 # .format(to_i, from_i, i)) 

927 if from_i >= to_i: 

928 return [] 

929 words = wordlst[from_i:to_i] 

930 tag = " ".join(words) 

931 assert tag 

932 if re.match(ignored_unknown_starts_re, tag): 

933 # Tags with this start are to be ignored 

934 return [(from_i, ["UNKNOWN"], [])] 

935 if tag in ignored_unknown_tags: 935 ↛ 936line 935 didn't jump to line 936 because the condition on line 935 was never true

936 return [] # One of the tags listed as to be ignored 

937 if tag in ("and", "or"): 

938 return [] 

939 if ( 

940 not allow_any 

941 and not words[0].startswith("~") 

942 and ( 

943 no_unknown_starts 

944 or words[0] not in allowed_unknown_starts 

945 or len(words) <= 1 

946 ) 

947 ): 

948 # print("ERR allow_any={} words={}" 

949 # .format(allow_any, words)) 

950 return [ 

951 (from_i, ["UNKNOWN"], ["error-unknown-tag"]) 

952 ] # Add ``tag`` here to include 

953 else: 

954 return [(from_i, ["UNKNOWN"], [tag])] 

955 

956 

957def add_new1( 

958 node: ValidNode, 

959 i: int, 

960 start_i: int, 

961 last_i: int, 

962 new_paths: list[list[PosPathStep]], 

963 new_nodes: list[tuple[ValidNode, int, int]], 

964 pos_paths: list[list[list[PosPathStep]]], 

965 wordlst: list[str], 

966 allow_any: bool, 

967 no_unknown_starts: bool, 

968 max_last_i: int, 

969) -> int: 

970 assert isinstance(new_paths, list) 

971 # print("add_new: start_i={} last_i={}".format(start_i, last_i)) 

972 # print("$ {} last_i={} start_i={}" 

973 # .format(w, last_i, start_i)) 

974 max_last_i = max(max_last_i, last_i) # if last_i has grown 

975 if (node, start_i, last_i) not in new_nodes: 

976 new_nodes.append((node, start_i, last_i)) 

977 if node.end: 

978 # We can see a terminal point in the search tree. 

979 u = check_unknown( 

980 last_i, start_i, i, wordlst, allow_any, no_unknown_starts 

981 ) 

982 # Create new path candidates based on different past possible 

983 # paths; pos_paths[last_i] contains possible paths, so add this 

984 # new one at the beginning(?) 

985 # The list comprehension inside the parens generates an iterable 

986 # of lists, so this is .extend( [(last_i...)], [(last_i...)], ... ) 

987 # XXX: this is becoming impossible to annotate, nodes might 

988 # need to become classed objects and not just dicts, or at least 

989 # a TypedDict with a "children" node 

990 new_paths.extend( 

991 [(last_i, node.tags, node.topics)] + u + x 

992 for x in pos_paths[last_i] 

993 ) 

994 max_last_i = i + 1 

995 return max_last_i 

996 

997 

998@functools.lru_cache(maxsize=65536) 

999def decode_tags( 

1000 src: str, 

1001 allow_any=False, 

1002 no_unknown_starts=False, 

1003) -> tuple[list[tuple[str, ...]], list[str]]: 

1004 tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts) 

1005 # print(f"decode_tags: {src=}, {tagsets=}") 

1006 

1007 # Retry with a modified source text if the first pass produced errors 

1008 if ( 

1009 any(s.startswith("error-") for tagset in tagsets for s in tagset) 

1010 # I hate Python's *nested* list comprehension syntax ^ 

1011 or any(s.startswith("error-") for s in topics) 

1012 ): 

1013 # slashes_re matches valid keys that contain slashes; re.split with a 

1014 # capturing group puts those keys at odd indices, so we leave them 

1015 # intact and only replace "/" with a space in the even-indexed parts 

1016 # between them ("masculine/feminine"-style keys stay whole). 

1017 if "/" in src: 

1018 split_parts = re.split(slashes_re, src) 

1019 new_parts: list[str] = [] 

1020 if len(split_parts) > 1: 

1021 for i, s in enumerate(split_parts): 

1022 if i % 2 == 0: 

1023 new_parts.append(s.replace("/", " ")) 

1024 else: 

1025 new_parts.append(s) 

1026 new_src = "".join(new_parts) 

1027 else: 

1028 new_src = src 

1029 new_tagsets, new_topics = decode_tags1( 

1030 new_src, allow_any, no_unknown_starts 

1031 ) 

1032 

1033 old_errors = sum( 

1034 1 for tagset in tagsets for s in tagset if s.startswith("error") 

1035 ) 

1036 old_errors += sum(1 for s in topics if s.startswith("error")) 

1037 new_errors = sum( 

1038 1 

1039 for new_tagset in new_tagsets 

1040 for s in new_tagset 

1041 if s.startswith("error") 

1042 ) 

1043 new_errors += sum(1 for s in new_topics if s.startswith("error")) 

1044 

1045 if new_errors <= old_errors: 1045 ↛ 1048line 1045 didn't jump to line 1048 because the condition on line 1045 was always true

1046 return new_tagsets, new_topics 

1047 

1048 return tagsets, topics 

1049 
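
The retry branch above relies on re.split() with a capturing group: parts that are themselves valid slash-containing keys land at odd indices and are preserved, while slashes in the remaining parts become spaces. A self-contained sketch with a made-up protected key (the real set is sequences_with_slashes):

import re

# Hypothetical protected key for illustration only.
protected = ["nominative/accusative"]
demo_slashes_re = re.compile(
    r"(" + "|".join(re.escape(s) for s in protected) + r")"
)

src = "nominative/accusative singular/plural"
parts = re.split(demo_slashes_re, src)
new_src = "".join(
    p if i % 2 else p.replace("/", " ") for i, p in enumerate(parts)
)
print(parts)    # ['', 'nominative/accusative', ' singular/plural']
print(new_src)  # 'nominative/accusative singular plural'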

1050 

1051def decode_tags1( 

1052 src: str, 

1053 allow_any=False, 

1054 no_unknown_starts=False, 

1055) -> tuple[list[tuple[str, ...]], list[str]]: 

1056 """Decodes tags, doing some canonicalizations. This returns a list of 

1057 lists of tags and a list of topics.""" 

1058 assert isinstance(src, str) 

1059 

1060 # print("decode_tags: src={!r}".format(src)) 

1061 

1062 pos_paths: list[list[list[PosPathStep]]] = [[[]]] 

1063 wordlst: list[str] = [] 

1064 max_last_i = 0 # pre-initialized here so that it can be used as a ref 

1065 

1066 add_new = functools.partial( 

1067 add_new1, # pre-set parameters and references for function 

1068 pos_paths=pos_paths, 

1069 wordlst=wordlst, 

1070 allow_any=allow_any, 

1071 no_unknown_starts=no_unknown_starts, 

1072 max_last_i=max_last_i, 

1073 ) 

1074 # First split the tags at commas and semicolons. Their significance is that 

1075 # a multi-word sequence cannot continue across them. 

1076 parts = split_at_comma_semi(src, extra=[";", ":"]) 

1077 

1078 for part in parts: 

1079 max_last_i = len(wordlst) # "how far have we gone?" 

1080 lst1 = part.split() 

1081 if not lst1: 

1082 continue 

1083 wordlst.extend(lst1) 

1084 cur_nodes: list[tuple[ValidNode, int, int]] = [] # Currently seen 

1085 for w in lst1: 

1086 i = len(pos_paths) - 1 

1087 new_nodes: list[tuple[ValidNode, int, int]] = [] 

1088 # replacement nodes for next loop 

1089 new_paths: list[list[PosPathStep]] = [] 

1090 # print("ITER i={} w={} max_last_i={} wordlst={}" 

1091 # .format(i, w, max_last_i, wordlst)) 

1092 node: ValidNode 

1093 start_i: int 

1094 last_i: int 

1095 for node, start_i, last_i in cur_nodes: 

1096 # ValidNodes are part of a search tree that checks if a 

1097 # phrase is found in xlat_tags_map and other text->tags dicts. 

1098 if w in node.children: 

1099 # the phrase continues down the tree 

1100 # print("INC", w) 

1101 max_last_i = add_new( 

1102 node.children[w], 

1103 i, 

1104 start_i, 

1105 last_i, 

1106 new_paths, 

1107 new_nodes, 

1108 ) 

1109 if node.end: 

1110 # we've hit an end point, the tags and topics have already 

1111 # been gathered at some point, don't do anything with the 

1112 # old stuff 

1113 if w in valid_sequences.children: 

1114 # This starts a *new* possible section 

1115 max_last_i = add_new( 

1116 valid_sequences.children[w], # root-> 

1117 i, 

1118 i, 

1119 i, 

1120 new_paths, 

1121 new_nodes, 

1122 ) 

1123 if w not in node.children and not node.end: 

1124 # print("w not in node and $: i={} last_i={} wordlst={}" 

1125 # .format(i, last_i, wordlst)) 

1126 # If i == last_i == 0, for example (beginning) 

1127 if ( 

1128 i == last_i 

1129 or no_unknown_starts 

1130 or wordlst[last_i] not in allowed_unknown_starts 

1131 ): 

1132 # print("NEW", w) 

1133 if w in valid_sequences.children: 

1134 # Start new sequences here 

1135 max_last_i = add_new( 

1136 valid_sequences.children[w], 

1137 i, 

1138 i, 

1139 last_i, 

1140 new_paths, 

1141 new_nodes, 

1142 ) 

1143 if not new_nodes: 

1144 # This is run at the start when i == max_last_i == 0, 

1145 # which is what populates the first node in new_nodes. 

1146 # Some initial words cause the rest to be interpreted as unknown 

1147 # print("not new nodes: i={} last_i={} wordlst={}" 

1148 # .format(i, max_last_i, wordlst)) 

1149 if ( 

1150 i == max_last_i 

1151 or no_unknown_starts 

1152 or wordlst[max_last_i] not in allowed_unknown_starts 

1153 ): 

1154 # print("RECOVER w={} i={} max_last_i={} wordlst={}" 

1155 # .format(w, i, max_last_i, wordlst)) 

1156 if w in valid_sequences.children: 

1157 max_last_i = add_new( 

1158 # new sequence from root 

1159 valid_sequences.children[w], 

1160 i, 

1161 i, 

1162 max_last_i, 

1163 new_paths, 

1164 new_nodes, 

1165 ) 

1166 cur_nodes = new_nodes # Completely replace nodes! 

1167 # 2023-08-18, fix to improve performance 

1168 # Decode tags does a big search of the best-shortest matching 

1169 # sequences of tags, but the original algorithm didn't have 

1170 # any culling happen during operation, so in a case with 

1171 # a lot of tags (for example, big blocks of text inserted 

1172 # somewhere by mistake that is processed by decode_tags), 

1173 # it would lead to exponential growth of new_paths contents. 

1174 # This culling, using the same weighting algorithm code as 

1175 # in the original is just applied to new_paths before it is 

1176 # added to pos_paths. Basically it's "take the 10 best paths". 

1177 # This *can* cause bugs if it gets stuck in a local minimum 

1178 # or something, but this whole process is one-dimensional 

1179 # and not that complex, so hopefully it works out... 

1180 pw = [] 

1181 path: list[PosPathStep] 

1182 for path in new_paths: 

1183 weight = len(path) 

1184 if any(x[1] == ["UNKNOWN"] for x in path): 

1185 weight += 100 # Penalize unknown paths 

1186 pw.append((weight, path)) 

1187 new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]] 

1188 pos_paths.append(new_paths) 

1189 

1190 # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}" 

1191 # .format(max_last_i, len(wordlst), len(pos_paths))) 

1192 

1193 if cur_nodes: 

1194 # print("END HAVE_NODES") 

1195 for node, start_i, last_i in cur_nodes: 

1196 if node.end: 

1197 # print("$ END start_i={} last_i={}" 

1198 # .format(start_i, last_i)) 

1199 for path in pos_paths[start_i]: 

1200 pos_paths[-1].append( 

1201 [(last_i, node.tags, node.topics)] + path 

1202 ) 

1203 else: 

1204 # print("UNK END start_i={} last_i={} wordlst={}" 

1205 # .format(start_i, last_i, wordlst)) 

1206 u = check_unknown( 

1207 last_i, 

1208 len(wordlst), 

1209 len(wordlst), 

1210 wordlst, 

1211 allow_any, 

1212 no_unknown_starts, 

1213 ) 

1214 if pos_paths[start_i]: 

1215 for path in pos_paths[start_i]: 

1216 pos_paths[-1].append(u + path) 

1217 else: 

1218 pos_paths[-1].append(u) 

1219 else: 

1220 # Check for a final unknown tag 

1221 # print("NO END NODES max_last_i={}".format(max_last_i)) 

1222 paths = pos_paths[max_last_i] or [[]] 

1223 u = check_unknown( 

1224 max_last_i, 

1225 len(wordlst), 

1226 len(wordlst), 

1227 wordlst, 

1228 allow_any, 

1229 no_unknown_starts, 

1230 ) 

1231 if u: 1231 ↛ 1078line 1231 didn't jump to line 1078 because the condition on line 1231 was always true

1232 # print("end max_last_i={}".format(max_last_i)) 

1233 for path in list(paths): # Copy in case it is the last pos 

1234 pos_paths[-1].append(u + path) 

1235 

1236 # import json 

1237 # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True)) 

1238 

1239 if not pos_paths[-1]: 1239 ↛ 1241line 1239 didn't jump to line 1241 because the condition on line 1239 was never true

1240 # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src)) 

1241 return [], [] 

1242 

1243 # Find the best path 

1244 pw = [] 

1245 for path in pos_paths[-1]: 

1246 weight = len(path) 

1247 if any(x[1] == ["UNKNOWN"] for x in path): 

1248 weight += 100 # Penalize unknown paths 

1249 pw.append((weight, path)) 

1250 path = min(pw)[1] 

1251 

1252 # Convert the best path to tagsets and topics 

1253 tagsets: list[list[str]] = [[]] 

1254 topics: list[str] = [] 

1255 for i, tagspec, topicspec in path: 

1256 if len(tagsets or "") > 16: 1256 ↛ 1259line 1256 didn't jump to line 1259 because the condition on line 1256 was never true

1257 # ctx.error("Too many tagsets! This is probably exponential", 

1258 # sortid="form_descriptions/20230818") 

1259 return [("error-unknown-tag", "error-exponential-tagsets")], [] 

1260 if tagspec == ["UNKNOWN"]: 

1261 new_tagsets = [] 

1262 for x in tagsets: 

1263 new_tagsets.append(x + topicspec) 

1264 tagsets = new_tagsets 

1265 continue 

1266 if tagspec: 

1267 new_tagsets = [] 

1268 for x in tagsets: 

1269 for t in tagspec: 

1270 if t: 1270 ↛ 1277line 1270 didn't jump to line 1277 because the condition on line 1270 was always true

1271 new_tags = list(x) 

1272 for tag in t.split(): 

1273 if tag not in new_tags: 

1274 new_tags.append(tag) 

1275 new_tagsets.append(new_tags) 

1276 else: 

1277 new_tagsets.append(x) 

1278 tagsets = new_tagsets 

1279 if topicspec: 

1280 for t in topicspec: 

1281 for topic in t.split(): 

1282 if topic not in topics: 

1283 topics.append(topic) 

1284 

1285 # print("unsorted tagsets:", tagsets) 

1286 ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets)) 

1287 # topics = list(sorted(set(topics))) XXX tests expect not sorted 

1288 # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics)) 

1289 # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST 

1290 # of tags. Turning topics into a tuple breaks tests, turning the tuples 

1291 # inside tagsets into lists breaks tests, I'm leaving them mismatched 

1292 # for now. XXX 

1293 return ret_tagsets, topics 

1294 
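
Both the per-word culling inside the loop and the final selection above use the same weighting: a path's weight is its length plus a penalty of 100 if it contains an UNKNOWN step, and the lowest-weight path wins. A toy illustration of that selection:

# Each path is a list of (last_i, tags, topics) steps, as in PosPathStep.
paths = [
    [(0, ["masculine"], []), (1, ["plural"], [])],  # two known steps
    [(0, ["UNKNOWN"], ["error-unknown-tag"])],      # one unknown step
]

pw = []
for path in paths:
    weight = len(path)
    if any(step[1] == ["UNKNOWN"] for step in path):
        weight += 100  # penalize unknown paths
    pw.append((weight, path))

best = min(pw)[1]
print(best)  # the two-step path of known tags (weight 2 beats 101)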

1295 

1296def parse_head_final_tags( 

1297 wxr: WiktextractContext, lang: str, form: str 

1298) -> tuple[str, list[str]]: 

1299 """Parses tags that are allowed at the end of a form head from the end 

1300 of the form. This can also be used for parsing the final gender etc tags 

1301 from translations and linkages.""" 

1302 assert isinstance(wxr, WiktextractContext) 

1303 assert isinstance(lang, str) # Should be language that "form" is for 

1304 assert isinstance(form, str) 

1305 

1306 # print("parse_head_final_tags: lang={} form={!r}".format(lang, form)) 

1307 

1308 # Make sure there are no double spaces in the form as this code does not 

1309 # handle them otherwise. 

1310 form = re.sub(r"\s+", " ", form.strip()) 

1311 if not form: 

1312 return form, [] 

1313 

1314 origform = form 

1315 

1316 tags = [] 

1317 

1318 # If parsing for certain Bantu languages (e.g., Swahili), handle 

1319 # some extra head-final tags first 

1320 if lang in head_final_bantu_langs: 

1321 m = re.search(head_final_bantu_re, form) 

1322 if m is not None: 

1323 tagkeys = m.group(1) 

1324 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr] 1324 ↛ 1339line 1324 didn't jump to line 1339 because the condition on line 1324 was always true

1325 form = form[: m.start()] 

1326 v = head_final_bantu_map[tagkeys] 

1327 if v.startswith("?"): 1327 ↛ 1328line 1327 didn't jump to line 1328 because the condition on line 1327 was never true

1328 v = v[1:] 

1329 wxr.wtp.debug( 

1330 "suspicious suffix {!r} in language {}: {}".format( 

1331 tagkeys, lang, origform 

1332 ), 

1333 sortid="form_descriptions/1028", 

1334 ) 

1335 tags.extend(v.split()) 

1336 

1337 # If parsing for certain Semitic languages (e.g., Arabic), handle 

1338 # some extra head-final tags first 

1339 if lang in head_final_semitic_langs: 

1340 m = re.search(head_final_semitic_re, form) 

1341 if m is not None: 

1342 tagkeys = m.group(1) 

1343 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr] 1343 ↛ 1358line 1343 didn't jump to line 1358 because the condition on line 1343 was always true

1344 form = form[: m.start()] 

1345 v = head_final_semitic_map[tagkeys] 

1346 if v.startswith("?"): 1346 ↛ 1347line 1346 didn't jump to line 1347 because the condition on line 1346 was never true

1347 v = v[1:] 

1348 wxr.wtp.debug( 

1349 "suspicious suffix {!r} in language {}: {}".format( 

1350 tagkeys, lang, origform 

1351 ), 

1352 sortid="form_descriptions/1043", 

1353 ) 

1354 tags.extend(v.split()) 

1355 

1356 # If parsing for certain other languages (e.g., Lithuanian, 

1357 # French, Finnish), handle some extra head-final tags first 

1358 if lang in head_final_other_langs: 

1359 m = re.search(head_final_other_re, form) 

1360 if m is not None: 

1361 tagkeys = m.group(1) 

1362 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr] 1362 ↛ 1367line 1362 didn't jump to line 1367 because the condition on line 1362 was always true

1363 form = form[: m.start()] 

1364 tags.extend(head_final_other_map[tagkeys].split(" ")) 

1365 

1366 # Handle normal head-final tags 

1367 m = re.search(head_final_re, form) 

1368 if m is not None: 

1369 tagkeys = m.group(3) 

1370 # Only replace tags ending with numbers in languages that have 

1371 # head-final numeric tags (e.g., Bantu classes); also, don't replace 

1372 # tags if the main title ends with them (then presume they are part 

1373 # of the word) 

1374 # print("head_final_tags form={!r} tagkeys={!r} lang={}" 

1375 # .format(form, tagkeys, lang)) 

1376 tagkeys_contains_digit = re.search(r"\d", tagkeys) 

1377 if ( 

1378 (not tagkeys_contains_digit or lang in head_final_numeric_langs) 

1379 and not wxr.wtp.title.endswith(" " + tagkeys) # type:ignore[union-attr] 

1380 and 

1381 # XXX the above test does not capture when the whole word is a 

1382 # xlat_head_map key, so I added the below test to complement 

1383 # it; does this break anything? 

1384 not wxr.wtp.title == tagkeys 

1385 ): # defunct/English, 

1386 # "more defunct" -> "more" ["archaic"] 

1387 if not tagkeys_contains_digit or lang in head_final_numeric_langs: 1387 ↛ 1401line 1387 didn't jump to line 1401 because the condition on line 1387 was always true

1388 form = form[: m.start()] 

1389 v = xlat_head_map[tagkeys] 

1390 if v.startswith("?"): 1390 ↛ 1391line 1390 didn't jump to line 1391 because the condition on line 1390 was never true

1391 v = v[1:] 

1392 wxr.wtp.debug( 

1393 "suspicious suffix {!r} in language {}: {}".format( 

1394 tagkeys, lang, origform 

1395 ), 

1396 sortid="form_descriptions/1077", 

1397 ) 

1398 tags.extend(v.split()) 

1399 

1400 # Generate warnings about words ending in " or" after processing 

1401 if ( 

1402 (form.endswith(" or") and not origform.endswith(" or")) 

1403 or re.search( 

1404 r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|" 

1405 r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)" 

1406 r"($|/| (f|m|sg|pl|anim|inan))", 

1407 form, 

1408 ) 

1409 or form.endswith(" du") 

1410 ): 

1411 if form not in ok_suspicious_forms: 

1412 wxr.wtp.debug( 

1413 "suspicious unhandled suffix in {}: {!r}, originally {!r}".format( 

1414 lang, form, origform 

1415 ), 

1416 sortid="form_descriptions/1089", 

1417 ) 

1418 

1419 # print("parse_head_final_tags: form={!r} tags={}".format(form, tags)) 

1420 return form, tags 

1421 
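
The head-final handling above boils down to: match one of the known tag specifiers at the end of the form with a longest-first alternation, strip it, and map it to tags. The real function additionally checks the page title and per-language conditions; the sketch below shows only the core matching step, with a made-up specifier map (the real data lives in xlat_head_map and the per-language maps):

import re

# Hypothetical specifier map for illustration only.
toy_map = {"m": "masculine", "f": "feminine", "m pl": "masculine plural"}
toy_re = re.compile(
    r"( -)?( ({}))+$".format(
        "|".join(re.escape(k) for k in sorted(toy_map, key=len, reverse=True))
    )
)

def strip_head_final(form: str) -> tuple[str, list[str]]:
    m = re.search(toy_re, form)
    if m is None:
        return form, []
    tagkeys = m.group(3)  # last matched specifier, as in the code above
    return form[: m.start()], toy_map[tagkeys].split()

print(strip_head_final("perro m"))     # ('perro', ['masculine'])
print(strip_head_final("libro m pl"))  # ('libro', ['masculine', 'plural'])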

1422 

1423def quote_kept_parens(s: str) -> str: 

1424 """Changes certain parenthesized expressions so that they won't be 

1425 interpreted as parentheses. This is used for parts that are kept as 

1426 part of the word, such as "rear admiral (upper half)".""" 

1427 return re.sub( 

1428 r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|" 

1429 r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)", 

1430 r"__lpar__\1__rpar__", 

1431 s, 

1432 ) 

1433 

1434 

1435def quote_kept_ruby( 

1436 wxr: WiktextractContext, 

1437 ruby_tuples: list[ 

1438 tuple[ 

1439 str, 

1440 str, 

1441 ] 

1442 ], 

1443 s: str, 

1444) -> str: 

1445 if len(ruby_tuples) < 1: 

1446 wxr.wtp.debug( 

1447 "quote_kept_ruby called with no ruby", 

1448 sortid="form_description/1114/20230517", 

1449 ) 

1450 return s 

1451 ks = [] 

1452 rs = [] 

1453 for k, r in ruby_tuples: 

1454 ks.append(re.escape(k)) 

1455 rs.append(re.escape(r)) 

1456 if not (ks and rs): 

1457 wxr.wtp.debug( 

1458 f"empty column in ruby_tuples: {ruby_tuples}", 

1459 sortid="form_description/1124/20230606", 

1460 ) 

1461 return s 

1462 newm = re.compile( 

1463 r"({})\s*\(\s*({})\s*\)".format("|".join(ks), "|".join(rs)) 

1464 ) 

1465 rub_re = re.compile( 

1466 r"({})".format( 

1467 r"|".join( 

1468 r"{}\(*{}\)*".format( 

1469 re.escape(k), 

1470 re.escape(r), 

1471 ) 

1472 for k, r in ruby_tuples 

1473 ) 

1474 ) 

1475 ) 

1476 

1477 def paren_replace(m: re.Match) -> str: 

1478 return re.sub(newm, r"\1__lrub__\2__rrub__", m.group(0)) 

1479 

1480 return re.sub(rub_re, paren_replace, s) 

1481 

1482 

1483def unquote_kept_parens(s: str) -> str: 

1484 """Conerts the quoted parentheses back to normal parentheses.""" 

1485 return re.sub(r"__lpar__(.*?)__rpar__", r"(\1)", s) 

1486 
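
Taken together, quote_kept_parens() and unquote_kept_parens() protect a short whitelist of parenthesized expressions during parsing and then restore them. A usage sketch, assuming the installed package layout mirrors src/wiktextract/extractor/en/form_descriptions.py:

from wiktextract.extractor.en.form_descriptions import (
    quote_kept_parens,
    unquote_kept_parens,
)

s = "rear admiral (upper half)"
protected = quote_kept_parens(s)
print(protected)                            # rear admiral __lpar__upper half__rpar__
print(unquote_kept_parens(protected) == s)  # True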

1487 

1488def add_romanization( 

1489 wxr: WiktextractContext, 

1490 data: WordData, 

1491 roman: str, 

1492 text: str, 

1493 is_reconstruction: bool, 

1494 head_group: Optional[int], 

1495 ruby: Sequence[tuple[str, str]], 

1496) -> None: 

1497 tags_lst = ["romanization"] 

1498 m = re.match(r"([^:]+):(.+)", roman) 

1499 # This function's purpose is to intercept broken romanizations, 

1500 # like "Yale: hēnpyeng" style tags. Most romanization styles 

1501 # are already present as tags, so we can use decode_tags to find 

1502 # them. 

1503 if m: 1503 ↛ 1504line 1503 didn't jump to line 1504 because the condition on line 1503 was never true

1504 tagsets, topics = decode_tags(m.group(1)) 

1505 if tagsets: 

1506 for tags in tagsets: 

1507 tags_lst.extend(tags) 

1508 roman = m.group(2) 

1509 add_related( 

1510 wxr, 

1511 data, 

1512 tags_lst, 

1513 [roman], 

1514 text, 

1515 True, 

1516 is_reconstruction, 

1517 head_group, 

1518 ruby, 

1519 ) 

1520 
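
The regex in add_romanization() splits a prefixed romanization such as "Yale: hēnpyeng" into a scheme part (later interpreted via decode_tags) and the romanization proper. A minimal demonstration of that split alone:

import re

m = re.match(r"([^:]+):(.+)", "Yale: hēnpyeng")
print(repr(m.group(1)), repr(m.group(2)))  # 'Yale' ' hēnpyeng' (leading space kept)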

1521 

1522def add_related( 

1523 wxr: WiktextractContext, 

1524 data: WordData, 

1525 tags_lst: Union[list[str], tuple[str, ...]], 

1526 related_list: list[str], 

1527 origtext: str, 

1528 add_all_canonicals: bool, 

1529 is_reconstruction: bool, 

1530 head_group: Optional[int], 

1531 ruby_data: Optional[Sequence[tuple[str, str]]] = None, 

1532) -> Optional[list[tuple[str, ...]]]: 

1533 """Internal helper function for some post-processing entries for related 

1534 forms (e.g., in word head). This returns a list of list of tags to be 

1535 added to following related forms or None (cf. walrus/English word head, 

1536 parenthesized part starting with "both").""" 

1537 assert isinstance(wxr, WiktextractContext) 

1538 assert isinstance(tags_lst, (list, tuple)) 

1539 for x in tags_lst: 

1540 assert isinstance(x, str) 

1541 assert isinstance(related_list, (list, tuple)) 

1542 assert isinstance(origtext, str) 

1543 assert add_all_canonicals in (True, False) 

1544 assert isinstance(ruby_data, (list, tuple)) or ruby_data is None 

1545 if ruby_data is None: 1545 ↛ 1546line 1545 didn't jump to line 1546 because the condition on line 1545 was never true

1546 ruby_data = [] 

1547 # print("add_related: tags_lst={} related={}".format(tags_lst, related)) 

1548 related = " ".join(related_list) 

1549 if related == "[please provide]": 1549 ↛ 1550line 1549 didn't jump to line 1550 because the condition on line 1549 was never true

1550 return None 

1551 if related in IGNORED_RELATED: 1551 ↛ 1552line 1551 didn't jump to line 1552 because the condition on line 1551 was never true

1552 return None 

1553 if is_reconstruction and related.startswith("*") and len(related) > 1: 

1554 related = related[1:] 

1555 

1556 # Get title word, with any reconstruction prefix removed 

1557 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title) # type:ignore[arg-type] 

1558 

1559 def check_related(related: str) -> None: 

1560 # Warn about some suspicious related forms 

1561 m = re.search(suspicious_related_re, related) 

1562 if (m and m.group(0) not in titleword) or ( 

1563 related in ("f", "m", "n", "c") and len(titleword) >= 3 

1564 ): 

1565 if "eumhun" in tags_lst: 1565 ↛ 1566line 1565 didn't jump to line 1566 because the condition on line 1565 was never true

1566 return 

1567 if "cangjie-input" in tags_lst: 1567 ↛ 1568line 1567 didn't jump to line 1568 because the condition on line 1567 was never true

1568 return 

1569 if "class" in tags_lst: 1569 ↛ 1570line 1569 didn't jump to line 1570 because the condition on line 1569 was never true

1570 return 

1571 if wxr.wtp.section == "Korean" and re.search( 1571 ↛ 1575line 1571 didn't jump to line 1575 because the condition on line 1571 was never true

1572 r"^\s*\w*>\w*\s*$", related 

1573 ): 

1574 # ignore Korean "i>ni" / "라>나" values 

1575 return 

1576 if ( 1576 ↛ 1583line 1576 didn't jump to line 1583

1577 wxr.wtp.section == "Burmese" 

1578 and "romanization" in tags_lst 

1579 and re.search(r":", related) 

1580 ): 

1581 # ignore Burmese with ":", which is used in the Burmese 

1582 # transliteration of "း", the high-tone visarga. 

1583 return 

1584 wxr.wtp.debug( 

1585 "suspicious related form tags {}: {!r} in {!r}".format( 

1586 tags_lst, related, origtext 

1587 ), 

1588 sortid="form_descriptions/1147", 

1589 ) 

1590 

1591 following_tagsets = None # Tagsets to add to following related forms 

1592 roman = None 

1593 tagsets1: list[tuple[str, ...]] = [tuple()] 

1594 topics1: list[str] = [] 

1595 

1596 m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related) 

1597 if m: 

1598 paren = m.group(1) 

1599 related = related[m.end() :] 

1600 m = re.match(r"^(all|both) (.*)", paren) 

1601 if m: 1601 ↛ 1602line 1601 didn't jump to line 1602 because the condition on line 1601 was never true

1602 tagsets1, topics1 = decode_tags(m.group(2)) 

1603 following_tagsets = tagsets1 

1604 else: 

1605 tagsets1, topics1 = decode_tags(paren) 

1606 else: 

1607 m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related) 

1608 if m: 

1609 paren = m.group(1) 

1610 if paren.startswith("U+"): 1610 ↛ 1611line 1610 didn't jump to line 1611 because the condition on line 1610 was never true

1611 related = related[: m.start()] 

1612 else: 

1613 cls = classify_desc(paren) 

1614 if ( 1614 ↛ 1621line 1614 didn't jump to line 1621

1615 cls in ("romanization", "english") 

1616 and classify_desc(related[: m.start()]) == "other" 

1617 ): 

1618 roman = paren 

1619 related = related[: m.start()] 

1620 else: 

1621 related = related[: m.start()] 

1622 tagsets1, topics1 = decode_tags(paren) 

1623 if related and related.startswith("{{"): 1623 ↛ 1624line 1623 didn't jump to line 1624 because the condition on line 1623 was never true

1624 wxr.wtp.debug( 

1625 "{{ in word head form - possible Wiktionary error: {!r}".format( 

1626 related 

1627 ), 

1628 sortid="form_descriptions/1177", 

1629 ) 

1630 return None # Likely Wiktionary coding error 

1631 related = unquote_kept_parens(related) 

1632 # Split related by "/" (e.g., the superlative in the head of grande/Spanish) 

1633 # Do not split if / in word title, see π//Japanese 

1634 if len(related) > 5 and "/" not in wxr.wtp.title: # type:ignore[operator] 

1635 alts = split_at_comma_semi(related, separators=["/"]) 

1636 else: 

1637 alts = [related] 

1638 if ruby_data: 1638 ↛ 1640line 1638 didn't jump to line 1640 because the condition on line 1638 was never true

1639 # prepare some regex stuff in advance 

1640 ks, rs = [], [] 

1641 for k, r in ruby_data: 

1642 ks.append(re.escape(k)) 

1643 rs.append(re.escape(r)) 

1644 splitter = r"((?:{})__lrub__(?:{})__rrub__)".format( 

1645 "|".join(ks), "|".join(rs) 

1646 ) 

1647 for related in alts: 

1648 ruby: list[tuple[str, str]] = [] 

1649 if ruby_data: 1649 ↛ 1650line 1649 didn't jump to line 1650 because the condition on line 1649 was never true

1650 new_related = [] 

1651 rub_split = re.split(splitter, related) 

1652 for s in rub_split: 

1653 m = re.match(r"(.+)__lrub__(.+)__rrub__", s) 

1654 if m: 

1655 # add ruby with (\1, \2) 

1656 ruby.append((m.group(1), m.group(2))) 

1657 new_related.append(m.group(1)) 

1658 else: 

1659 new_related.append(s) 

1660 related = "".join(new_related) 

1661 tagsets2, topics2 = decode_tags(" ".join(tags_lst)) 

1662 for tags1 in tagsets1: 

1663 assert isinstance(tags1, (list, tuple)) 

1664 for tags2 in tagsets2: 

1665 assert isinstance(tags1, (list, tuple)) 

1666 dt: LinkageData = {"word": related} 

1667 if roman: 

1668 dt["roman"] = roman 

1669 if ruby: 1669 ↛ 1670line 1669 didn't jump to line 1670 because the condition on line 1669 was never true

1670 dt["ruby"] = ruby 

1671 if "alt-of" in tags2: 1671 ↛ 1672line 1671 didn't jump to line 1672 because the condition on line 1671 was never true

1672 check_related(related) 

1673 data_extend(data, "tags", tags1) 

1674 data_extend(data, "tags", tags2) 

1675 data_extend(data, "topics", topics1) 

1676 data_extend(data, "topics", topics2) 

1677 data_append(data, "alt_of", dt) 

1678 elif "form-of" in tags2: 1678 ↛ 1679line 1678 didn't jump to line 1679 because the condition on line 1678 was never true

1679 check_related(related) 

1680 data_extend(data, "tags", tags1) 

1681 data_extend(data, "tags", tags2) 

1682 data_extend(data, "topics", topics1) 

1683 data_extend(data, "topics", topics2) 

1684 data_append(data, "form_of", dt) 

1685 elif "compound-of" in tags2: 1685 ↛ 1686line 1685 didn't jump to line 1686 because the condition on line 1685 was never true

1686 check_related(related) 

1687 data_extend(data, "tags", tags1) 

1688 data_extend(data, "tags", tags2) 

1689 data_extend(data, "topics", topics1) 

1690 data_extend(data, "topics", topics2) 

1691 data_append(data, "compound", related) 

1692 else: 

1693 lang = wxr.wtp.section or "LANG_MISSING" 

1694 related, final_tags = parse_head_final_tags( 

1695 wxr, lang, related 

1696 ) 

1697 # print("add_related: related={!r} tags1={!r} tags2={!r} " 

1698 # "final_tags={!r}" 

1699 # .format(related, tags1, tags2, final_tags)) 

1700 tags = list(tags1) + list(tags2) + list(final_tags) 

1701 check_related(related) 

1702 form: FormData = {"form": related} 

1703 if head_group: 

1704 form["head_nr"] = head_group 

1705 if roman: 

1706 form["roman"] = roman 

1707 if ruby: 1707 ↛ 1708line 1707 didn't jump to line 1708 because the condition on line 1707 was never true

1708 form["ruby"] = ruby 

1709 data_extend(form, "topics", topics1) 

1710 data_extend(form, "topics", topics2) 

1711 if topics1 or topics2: 1711 ↛ 1712line 1711 didn't jump to line 1712 because the condition on line 1711 was never true

1712 wxr.wtp.debug( 

1713 "word head form has topics: {}".format(form), 

1714 sortid="form_descriptions/1233", 

1715 ) 

1716 # Add tags from canonical form into the main entry 

1717 if "canonical" in tags: 

1718 if related in ("m", "f") and len(titleword) > 1: 1718 ↛ 1719line 1718 didn't jump to line 1719 because the condition on line 1718 was never true

1719 wxr.wtp.debug( 

1720 "probably incorrect canonical form " 

1721 "{!r} ignored (probably tag combination " 

1722 "missing from xlat_head_map)".format(related), 

1723 sortid="form_descriptions/1241", 

1724 ) 

1725 continue 

1726 if ( 

1727 related != titleword 

1728 or add_all_canonicals 

1729 or topics1 

1730 or topics2 

1731 or ruby 

1732 ): 

1733 data_extend(form, "tags", list(sorted(set(tags)))) 

1734 else: 

1735 # We won't add canonical form here 

1736 filtered_tags = list( 

1737 x for x in tags if x != "canonical" 

1738 ) 

1739 data_extend(data, "tags", filtered_tags) 

1740 continue 

1741 else: 

1742 data_extend(form, "tags", list(sorted(set(tags)))) 

1743 # Only insert if the form is not already there 

1744 for old in data.get("forms", ()): 

1745 if form == old: 1745 ↛ 1746line 1745 didn't jump to line 1746 because the condition on line 1745 was never true

1746 break 

1747 else: 

1748 data_append(data, "forms", form) 

1749 

1750 # If this form had pre-tags that started with "both" or "all", add those 

1751 # tags also to following related forms that don't have their own tags 

1752 # specified. 

1753 return following_tagsets 

1754 

1755 
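# --- Illustrative sketch (not part of the original source code) ---
# The ruby handling in add_related() above builds a splitter regex from
# (kanji, reading) pairs and then recovers those pairs from the
# "__lrub__"/"__rrub__" markers.  A minimal standalone demonstration of
# the same technique (hypothetical data; `re` is already imported above):
def _sketch_ruby_split() -> tuple[str, list[tuple[str, str]]]:
    ruby_data = [("振", "ふ"), ("仮", "が")]  # hypothetical (kanji, reading) pairs
    ks = [re.escape(k) for k, _ in ruby_data]
    rs = [re.escape(r) for _, r in ruby_data]
    splitter = r"((?:{})__lrub__(?:{})__rrub__)".format("|".join(ks), "|".join(rs))
    related = "振__lrub__ふ__rrub__り仮__lrub__が__rrub__名"  # a "quoted" form
    ruby, plain = [], []
    for s in re.split(splitter, related):
        m = re.match(r"(.+)__lrub__(.+)__rrub__", s)
        if m:
            ruby.append((m.group(1), m.group(2)))  # keep (base, reading)
            plain.append(m.group(1))               # keep only the base text
        else:
            plain.append(s)
    # returns ("振り仮名", [("振", "ふ"), ("仮", "が")])
    return "".join(plain), ruby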

1756def parse_word_head( 

1757 wxr: WiktextractContext, 

1758 pos: str, 

1759 text: str, 

1760 data: WordData, 

1761 is_reconstruction: bool, 

1762 head_group: Optional[int], 

1763 ruby=None, 

1764 links=None, 

1765) -> None: 

1766 """Parses the head line for a word for in a particular language and 

1767 part-of-speech, extracting tags and related forms.""" 

1768 assert isinstance(wxr, WiktextractContext) 

1769 assert isinstance(pos, str) 

1770 assert isinstance(text, str) 

1771 assert isinstance(data, dict) 

1772 assert isinstance(ruby, (list, tuple)) or ruby is None 

1773 if ruby is None: 

1774 ruby = [] 

1775 assert is_reconstruction in (True, False) 

1776 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text)) 

1777 # print(f"PARSE_WORD_HEAD: {data=}") 

1778 if links is None: 

1779 links = [] 

1780 

1781 if len(links) > 0: 

1782 # if we have link data (that is, links with stuff like commas and 

1783 # spaces), replace word_re with a modified local-scope pattern 

1784 word_re = re.compile( 

1785 r"|".join( 

1786 sorted((re.escape(s) for s in links), key=lambda x: -len(x)) 

1787 ) 

1788 + r"|" 

1789 + word_pattern 

1790 ) 

1791 else: 

1792 word_re = word_re_global 

1793 

1794 if "Lua execution error" in text or "Lua timeout error" in text: 1794 ↛ 1795line 1794 didn't jump to line 1795 because the condition on line 1794 was never true

1795 return 

1796 

1797 # In Aug 2021, some words had spurious Template:en at the end of head forms 

1798 # due to a Wiktionary error. 

1799 text = re.sub(r"\s+Template:[-a-zA-Z]+\s*$", "", text) 

1800 

1801 # Fix words with "superlative:" or "comparative:" at end of head 

1802 # e.g. grande/Spanish/Adj 

1803 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text) 

1804 

1805 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb 

1806 m = re.search(r", non-past ([^)]+ \([^)]+\))", text) 

1807 if m: 

1808 add_related( 

1809 wxr, 

1810 data, 

1811 ["non-past"], 

1812 [m.group(1)], 

1813 text, 

1814 True, 

1815 is_reconstruction, 

1816 head_group, 

1817 ruby, 

1818 ) 

1819 text = text[: m.start()] + text[m.end() :] 

1820 

1821 language = wxr.wtp.section 

1822 titleword = re.sub( 

1823 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE" 

1824 ) 

1825 titleparts = list( 

1826 m.group(0) 

1827 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE") 

1828 ) 

1829 if not titleparts: 1829 ↛ 1830line 1829 didn't jump to line 1830 because the condition on line 1829 was never true

1830 return 

1831 

1832 # Remove " or" from the end to prevent weird canonical forms 

1833 if text.endswith(" or"): 

1834 for tp in titleparts: 

1835 if text.endswith(tp): 1835 ↛ 1836line 1835 didn't jump to line 1836 because the condition on line 1835 was never true

1836 break 

1837 else: 

1838 text = text.removesuffix(" or").rstrip() 

1839 

1840 # Handle the part of the head that is not in parentheses. However, certain 

1841 # parenthesized parts are part of the word, and those must be handled 

1842 # specially here. 

1843 if ruby: 1843 ↛ 1844line 1843 didn't jump to line 1844 because the condition on line 1843 was never true

1844 text = quote_kept_ruby(wxr, ruby, text) 

1845 base = text 

1846 base = quote_kept_parens(base) 

1847 base = remove_text_in_parentheses(base) 

1848 base = base.replace("?", "") # Removes uncertain articles etc 

1849 base = re.sub(r"\s+", " ", base) 

1850 base = re.sub(r" ([,;])", r"\1", base) 

1851 base = re.sub(r"(.*) •.*", r"\1", base) 

1852 # Many languages use • as a punctuation mark separating the base 

1853 # from the rest of the head. στάδιος/Ancient Greek, issue #176 

1854 base = base.strip() 

1855 

1856 # Check for certain endings in head (mostly for compatibility with weird 

1857 # heads, e.g. rata/Romanian "1st conj." at end) 

1858 m = re.search(head_end_re, base) 

1859 tags: Union[tuple[str, ...], list[str]] = [] 

1860 if m: 1860 ↛ 1861line 1860 didn't jump to line 1861 because the condition on line 1860 was never true

1861 tags = head_end_map[m.group(1).lower()].split() 

1862 data_extend(data, "tags", tags) 

1863 base = base[: m.start()] 

1864 

1865 # Special case: handle Hán Nôm readings for Vietnamese characters 

1866 m = re.match( 

1867 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base 

1868 ) 

1869 if m: 1869 ↛ 1870line 1869 didn't jump to line 1870 because the condition on line 1869 was never true

1870 tag, readings = m.groups() 

1871 tag = re.sub(r"\s+", "-", tag) 

1872 for reading in split_at_comma_semi(readings, skipped=links): 

1873 add_related( 

1874 wxr, 

1875 data, 

1876 [tag], 

1877 [reading], 

1878 text, 

1879 True, 

1880 is_reconstruction, 

1881 head_group, 

1882 ruby, 

1883 ) 

1884 return 

1885 

1886 # Special case: Hebrew " [pattern: nnn]" ending 

1887 m = re.search(r"\s+\[pattern: ([^]]+)\]", base) 

1888 if m: 1888 ↛ 1889line 1888 didn't jump to line 1889 because the condition on line 1888 was never true

1889 add_related( 

1890 wxr, 

1891 data, 

1892 ["class"], 

1893 [m.group(1)], 

1894 text, 

1895 True, 

1896 is_reconstruction, 

1897 head_group, 

1898 ruby, 

1899 ) 

1900 base = base[: m.start()] + base[m.end() :] 

1901 

1902 # Clean away some messy "Upload an image" template text used in 

1903 # American Sign Language: 

1904 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp 

1905 m = re.search(r"Upload .+ gif image.", base) 

1906 if m: 1906 ↛ 1907line 1906 didn't jump to line 1907 because the condition on line 1906 was never true

1907 base = base[: m.start()] + base[m.end() :] 

1908 

1909 # Split the head into alternatives. This is a complicated task, as 

1910 # we do not want to split on "or" or "," when immediately followed by more 

1911 # head-final tags, but otherwise do want to split by them. 

1912 # 20230907 added "or" to this to handle 'true or false', titles with 'or' 

1913 if wxr.wtp.title and ("," in wxr.wtp.title or " or " in wxr.wtp.title): 

1914 # A kludge to handle article titles/phrases with commas. 

1915 # Preprocess splits to first capture the title, then handle 

1916 # all the others as usual. 

1917 presplits = re.split(r"({})".format(wxr.wtp.title), base) 

1918 splits = [] 

1919 for psplit in presplits: 

1920 if psplit == wxr.wtp.title: 

1921 splits.append(psplit) 

1922 else: 

1923 splits.extend(re.split(head_split_re, psplit)) 

1924 else: 

1925 # Do the normal split; previous only-behavior. 

1926 splits = re.split(head_split_re, base) 

1927 # print("SPLITS:", splits) 

1928 alts: list[str] = [] 

1929 # print("parse_word_head: splits:", splits, 

1930 # "head_split_re_parens:", head_split_re_parens) 

1931 for i in range( 

1932 0, len(splits) - head_split_re_parens, head_split_re_parens + 1 

1933 ): 

1934 v = splits[i] 

1935 ending = splits[i + 1] or "" # XXX is this correct??? 

1936 # print("parse_word_head alts v={!r} ending={!r} alts={}" 

1937 # .format(v, ending, alts)) 

1938 if alts and (v == "" and ending): 

1939 assert ending[0] == " " 

1940 alts[-1] += " or" + ending # ending starts with a space 

1941 elif v or ending: 1941 ↛ 1931line 1941 didn't jump to line 1931 because the condition on line 1941 was always true

1942 alts.append((v or "") + (ending or "")) 

1943 last = splits[-1].strip() 

1944 conn = "" if len(splits) < 3 else splits[-2] 

1945 # print("parse_word_head alts last={!r} conn={!r} alts={}" 

1946 # .format(last, conn, alts)) 

1947 if ( 

1948 alts 

1949 and last 

1950 and ( 

1951 last.split()[0] in xlat_head_map 

1952 or ( 

1953 conn == " or " 

1954 and (alts[-1] + " or " + last).strip() in xlat_head_map 

1955 ) 

1956 ) 

1957 ): 

1958 alts[-1] += " or " + last 

1959 elif last: 

1960 alts.append(last) 

1961 

1962 # print("parse_word_head alts: {}".format(alts)) 

1963 # print(f"{base=}") 

1964 

1965 # Process the head alternatives 

1966 canonicals: list[tuple[list[str], list[str]]] = [] 

1967 mode: Optional[str] = None 

1968 for alt_i, alt in enumerate(alts): 

1969 alt = alt.strip() 

1970 if alt.startswith("compound form:"): 1970 ↛ 1971line 1970 didn't jump to line 1971 because the condition on line 1970 was never true

1971 mode = "compound-form" 

1972 alt = alt[14:].strip() 

1973 if mode == "compound-form": 1973 ↛ 1974line 1973 didn't jump to line 1974 because the condition on line 1973 was never true

1974 add_related( 

1975 wxr, 

1976 data, 

1977 ["in-compounds"], 

1978 [alt], 

1979 text, 

1980 True, 

1981 is_reconstruction, 

1982 head_group, 

1983 ruby, 

1984 ) 

1985 continue 

1986 # For non-first parts, see if it can be treated as tags-only 

1987 if alt_i == 0: 

1988 expanded_alts = [alt] 

1989 else: 

1990 expanded_alts = map_with(xlat_descs_map, [alt]) 

1991 # print("EXPANDED_ALTS:", expanded_alts) 

1992 tagsets: Optional[list[tuple[str, ...]]] 

1993 for alt in expanded_alts: 

1994 baseparts = list(m.group(0) for m in re.finditer(word_re, alt)) 

1995 if alt_i > 0: 

1996 tagsets, topics = decode_tags(" ".join(baseparts)) 

1997 if not any("error-unknown-tag" in x for x in tagsets): 

1998 data_extend(data, "topics", topics) 

1999 for tags1 in tagsets: 

2000 data_extend(data, "tags", tags1) 

2001 continue 

2002 

2003 alt, tags = parse_head_final_tags( 

2004 wxr, language or "MISSING_LANG", alt 

2005 ) 

2006 tags = list(tags) # Make sure we don't modify anything cached 

2007 tags.append("canonical") 

2008 if alt_i == 0 and "," in wxr.wtp.title: # type:ignore[operator] 

2009 # Kludge to handle article titles/phrases with commas. 

2010 # basepart's regex strips commas, which leads to a 

2011 # canonical form that is the title phrase without a comma. 

2012 # basepart in add_related is almost immediately joined with 

2013 # spaces anyhow. XXX not exactly sure why it's 

2014 # canonicals.append((tags, baseparts)) and not (tags, [alt]) 

2015 baseparts = [alt] 

2016 canonicals.append((tags, baseparts)) 

2017 for tags, baseparts in canonicals: 

2018 add_related( 

2019 wxr, 

2020 data, 

2021 tags, 

2022 baseparts, 

2023 text, 

2024 len(canonicals) > 1, 

2025 is_reconstruction, 

2026 head_group, 

2027 ruby, 

2028 ) 

2029 

2030 # Handle parenthesized descriptors for the word form and links to 

2031 # related words 

2032 text = quote_kept_parens(text) 

2033 parens = list( 

2034 m.group(2) 

2035 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text) 

2036 ) 

2037 parens.extend( 

2038 m.group(1) 

2039 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text) 

2040 ) 

2041 have_romanization = False 

2042 have_ruby = False 

2043 hiragana = "" 

2044 katakana = "" 

2045 for paren in parens: 

2046 paren = paren.strip() 

2047 if not paren: 2047 ↛ 2048line 2047 didn't jump to line 2048 because the condition on line 2047 was never true

2048 continue 

2049 if paren.startswith("see "): 

2050 continue 

2051 if paren.startswith("U+"): 2051 ↛ 2052line 2051 didn't jump to line 2052 because the condition on line 2051 was never true

2052 continue 

2053 # In some rare cases, strip the word that inflects from the form 

2054 # description, e.g. "look through rose-tinted glasses"/English. 

2055 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren) 

2056 

2057 # If it starts with hiragana or katakana, treat as such form. Note 

2058 # that each hiragana/katakana character is in separate parentheses, 

2059 # so we must concatenate them. 

2060 try: 

2061 un = unicodedata.name(paren[0]).split()[0] 

2062 except ValueError: 

2063 un = "INVALID" 

2064 if un == "KATAKANA": 2064 ↛ 2065line 2064 didn't jump to line 2065 because the condition on line 2064 was never true

2065 katakana += paren 

2066 have_ruby = True 

2067 continue 

2068 if un == "HIRAGANA": 2068 ↛ 2069line 2068 didn't jump to line 2069 because the condition on line 2068 was never true

2069 hiragana += paren 

2070 have_ruby = True 

2071 continue 

2072 

2073 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes," 

2074 # in the middle of the parenthesized expression, e.g. 薄 

2075 def strokes_repl(m: re.Match) -> str: 

2076 strokes1, tags1, strokes2, tags2 = m.groups() 

2077 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]: 

2078 tags = tags.split(", ") 

2079 tags = list( 

2080 "Mainland China" if t == "Mainland" else t for t in tags 

2081 ) 

2082 tags.append("strokes") 

2083 add_related( 

2084 wxr, 

2085 data, 

2086 tags, 

2087 [strokes], 

2088 text, 

2089 True, 

2090 is_reconstruction, 

2091 head_group, 

2092 ruby, 

2093 ) 

2094 return ", " 

2095 

2096 paren = re.sub( 

2097 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ", 

2098 strokes_repl, 

2099 paren, 

2100 ) 

2101 

2102 descriptors = map_with(xlat_descs_map, [paren]) 

2103 new_desc = [] 

2104 for desc in descriptors: 

2105 new_desc.extend( 

2106 map_with( 

2107 xlat_tags_map, 

2108 split_at_comma_semi(desc, extra=[", or "], skipped=links), 

2109 ) 

2110 ) 

2111 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None 

2112 following_tags = None # Added to prev_tags from previous parenthesized 

2113 # part, e.g. walrus/English 

2114 # "(both nonstandard, proscribed, uncommon)" 

2115 for desc_i, desc in enumerate(new_desc): 

2116 # print("HEAD DESC: {!r}".format(desc)) 

2117 

2118 # Abort on certain descriptors (assume remaining values are 

2119 # examples or uninteresting, cf. gaan/Navajo, horior/Latin) 

2120 if re.match(r"^(per |e\.g\.$)", desc): 2120 ↛ 2121line 2120 didn't jump to line 2121 because the condition on line 2120 was never true

2121 break 

2122 

2123 # If it all consists of CJK characters, add it with the 

2124 # CJK tag. This is used at least for some Vietnamese 

2125 # words (e.g., ba/Vietnamese) 

2126 try: 

2127 if all(unicodedata.name(x).startswith("CJK ") for x in desc): 2127 ↛ 2128line 2127 didn't jump to line 2128 because the condition on line 2127 was never true

2128 add_related( 

2129 wxr, 

2130 data, 

2131 ["CJK"], 

2132 [desc], 

2133 text, 

2134 True, 

2135 is_reconstruction, 

2136 head_group, 

2137 ruby, 

2138 ) 

2139 continue 

2140 except ValueError: 

2141 pass 

2142 

2143 # Handle some special cases 

2144 splitdesc = desc.split() 

2145 if ( 2145 ↛ 2154line 2145 didn't jump to line 2154

2146 len(splitdesc) >= 3 

2147 and splitdesc[1] == "superlative" 

2148 and classify_desc(splitdesc[0]) != "tags" 

2149 and prev_tags 

2150 ): 

2151 # Handle the special case of second comparative after comma, 

2152 # followed by superlative without comma. E.g. 

2153 # mal/Portuguese/Adv 

2154 for ts in prev_tags: 

2155 add_related( 

2156 wxr, 

2157 data, 

2158 ts, 

2159 [splitdesc[0]], 

2160 text, 

2161 True, 

2162 is_reconstruction, 

2163 head_group, 

2164 ruby, 

2165 ) 

2166 desc = " ".join(splitdesc[1:]) 

2167 elif ( 2167 ↛ 2175line 2167 didn't jump to line 2175

2168 len(splitdesc) == 2 

2169 and splitdesc[0] in ("also", "and") 

2170 and prev_tags 

2171 and classify_desc(splitdesc[1]) != "tags" 

2172 ): 

2173 # Sometimes alternative forms are prefixed with "also" or 

2174 # "and" 

2175 for ts in prev_tags: 

2176 add_related( 

2177 wxr, 

2178 data, 

2179 ts, 

2180 [splitdesc[1]], 

2181 text, 

2182 True, 

2183 is_reconstruction, 

2184 head_group, 

2185 ruby, 

2186 ) 

2187 continue 

2188 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2188 ↛ 2189line 2188 didn't jump to line 2189 because the condition on line 2188 was never true

2189 continue 

2190 

2191 # If only one word, assume it is comma-separated alternative 

2192 # to the previous one 

2193 if " " not in desc: 

2194 cls = classify_desc(desc) 

2195 if cls != "tags": 

2196 if prev_tags: 2196 ↛ 2198line 2196 didn't jump to line 2198 because the condition on line 2196 was never true

2197 # Assume comma-separated alternative to previous one 

2198 for ts in prev_tags: 

2199 add_related( 

2200 wxr, 

2201 data, 

2202 ts, 

2203 [desc], 

2204 text, 

2205 True, 

2206 is_reconstruction, 

2207 head_group, 

2208 ruby, 

2209 ) 

2210 continue 

2211 elif distw(titleparts, desc) <= 0.5: 2211 ↛ 2214line 2211 didn't jump to line 2214 because the condition on line 2211 was never true

2212 # Similar to head word, assume a dialectal variation of 

2213 # the base form. Cf. go/Alemannic German/Verb 

2214 add_related( 

2215 wxr, 

2216 data, 

2217 ["alternative"], 

2218 [desc], 

2219 text, 

2220 True, 

2221 is_reconstruction, 

2222 head_group, 

2223 ruby, 

2224 ) 

2225 continue 

2226 elif ( 2226 ↛ 2247line 2226 didn't jump to line 2247

2227 cls in ("romanization", "english") 

2228 and not have_romanization 

2229 and classify_desc(titleword) == "other" 

2230 and not ( 

2231 "categories" in data and desc in data["categories"] 

2232 ) 

2233 ): 

2234 # Assume it to be a romanization 

2235 add_romanization( 

2236 wxr, 

2237 data, 

2238 desc, 

2239 text, 

2240 is_reconstruction, 

2241 head_group, 

2242 ruby, 

2243 ) 

2244 have_romanization = True 

2245 continue 

2246 

2247 m = re.match(r"^(\d+) strokes?$", desc) 

2248 if m: 

2249 # Special case, used to give #strokes for Han characters 

2250 add_related( 

2251 wxr, 

2252 data, 

2253 ["strokes"], 

2254 [m.group(1)], 

2255 text, 

2256 True, 

2257 is_reconstruction, 

2258 head_group, 

2259 ruby, 

2260 ) 

2261 continue 

2262 

2263 # See if it is radical+strokes 

2264 m = re.match( 

2265 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF" 

2266 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)" 

2267 r"( in (Japanese|Chinese|traditional Chinese|" 

2268 r"simplified Chinese))?$", 

2269 desc, 

2270 ) 

2271 if m: 2271 ↛ 2274line 2271 didn't jump to line 2274 because the condition on line 2271 was never true

2272 # Special case, used to give radical + strokes for Han 

2273 # characters 

2274 radical_strokes = m.group(1) 

2275 lang = m.group(3) 

2276 t = ["radical+strokes"] 

2277 if lang: 

2278 t.extend(lang.split()) 

2279 add_related( 

2280 wxr, 

2281 data, 

2282 t, 

2283 [radical_strokes], 

2284 text, 

2285 True, 

2286 is_reconstruction, 

2287 head_group, 

2288 ruby, 

2289 ) 

2290 prev_tags = None 

2291 following_tags = None 

2292 continue 

2293 

2294 # See if it indicates historical Katakana orthography (←) or 

2295 # just otherwise katakana/hiragana form 

2296 m = re.match(r"←\s*|kana\s+", desc) 

2297 if m: 2297 ↛ 2298line 2297 didn't jump to line 2298 because the condition on line 2297 was never true

2298 if desc.startswith("←"): 

2299 t1 = "historical " 

2300 else: 

2301 t1 = "" 

2302 x = desc[m.end() :] 

2303 if x.endswith("?"): 

2304 x = x[:-1] 

2305 # XXX should we add a tag indicating uncertainty? 

2306 if x: 

2307 name = unicodedata.name(x[0]) 

2308 if name.startswith("HIRAGANA "): 

2309 desc = t1 + "hiragana " + x 

2310 elif name.startswith("KATAKANA "): 

2311 desc = t1 + "katakana " + x 

2312 

2313 # See if it is "n strokes in Chinese" or similar 

2314 m = re.match( 

2315 r"(\d+) strokes in (Chinese|Japanese|" 

2316 r"traditional Chinese|simplified Chinese)$", 

2317 desc, 

2318 ) 

2319 if m: 2319 ↛ 2321line 2319 didn't jump to line 2321 because the condition on line 2319 was never true

2320 # Special case, used to give just strokes for some Han chars 

2321 strokes = m.group(1) 

2322 lang = m.group(2) 

2323 t = ["strokes"] 

2324 t.extend(lang.split()) 

2325 add_related( 

2326 wxr, 

2327 data, 

2328 t, 

2329 [strokes], 

2330 text, 

2331 True, 

2332 is_reconstruction, 

2333 head_group, 

2334 ruby, 

2335 ) 

2336 prev_tags = None 

2337 following_tags = None 

2338 continue 

2339 

2340 # American Sign Language has images (or requests for image) 

2341 # as heads, + this ASL gloss after. 

2342 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text) 

2343 if m2: 2343 ↛ 2344line 2343 didn't jump to line 2344 because the condition on line 2343 was never true

2344 add_related( 

2345 wxr, 

2346 data, 

2347 ["ASL-gloss"], 

2348 [m2.group(1)], 

2349 text, 

2350 True, 

2351 is_reconstruction, 

2352 head_group, 

2353 ruby, 

2354 ) 

2355 continue 

2356 

2357 parts = list(m.group(0) for m in re.finditer(word_re, desc)) 

2358 if not parts: 2358 ↛ 2359line 2358 didn't jump to line 2359 because the condition on line 2358 was never true

2359 prev_tags = None 

2360 following_tags = None 

2361 continue 

2362 

2363 # Check for certain language-specific header part starts that 

2364 # modify the tags of the following related form 

2365 if len(parts) == 2 and language in lang_specific_head_map: 2365 ↛ 2366line 2365 didn't jump to line 2366 because the condition on line 2365 was never true

2366 ht = lang_specific_head_map[language] 

2367 if parts[0] in ht: 

2368 rem_tags, add_tags = ht[parts[0]] 

2369 new_prev_tags1: list[list[str]] = [] 

2370 tags2: Union[tuple[str, ...], list[str]] 

2371 for tags2 in prev_tags or [()]: 

2372 if rem_tags is True: # Remove all old tags 

2373 tsets = set() 

2374 else: 

2375 tsets = set(tags2) - set(rem_tags.split()) 

2376 tsets = tsets | set(add_tags.split()) 

2377 tags = list(sorted(tsets)) 

2378 add_related( 

2379 wxr, 

2380 data, 

2381 tags, 

2382 [parts[1]], 

2383 text, 

2384 True, 

2385 is_reconstruction, 

2386 head_group, 

2387 ruby, 

2388 ) 

2389 new_prev_tags1.append(tags) 

2390 prev_tags = new_prev_tags1 

2391 following_tags = None 

2392 continue 

2393 

2394 # Handle the special case of descriptors that are parenthesized, 

2395 # e.g., (archaic or Scotland) 

2396 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc) 

2397 if m is not None and classify_desc(m.group(1)) == "tags": 2397 ↛ 2398line 2397 didn't jump to line 2398 because the condition on line 2397 was never true

2398 tagpart = m.group(1) 

2399 related = [m.group(2)] 

2400 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True) 

2401 if topics: 

2402 wxr.wtp.debug( 

2403 "parenthesized head part {!r} contains topics: {}".format( 

2404 tagpart, topics 

2405 ), 

2406 sortid="form_descriptions/1647", 

2407 ) 

2408 elif m is not None and re.match(r"in the sense ", m.group(1)): 2408 ↛ 2411line 2408 didn't jump to line 2411 because the condition on line 2408 was never true

2409 # Handle certain ignored cases 

2410 # e.g. bord/Danish: in the sense "plank" 

2411 related = [m.group(2)] 

2412 tagsets = [()] 

2413 else: 

2414 # Normal parsing of the descriptor 

2415 alt_related = None 

2416 alt_tagsets = None 

2417 tagsets = None 

2418 for i in range(len(parts), 0, -1): 2418 ↛ 2451line 2418 didn't jump to line 2451 because the loop on line 2418 didn't complete

2419 related = parts[i:] 

2420 tagparts = parts[:i] 

2421 # print(" i={} related={} tagparts={}" 

2422 # .format(i, related, tagparts)) 

2423 tagsets, topics = decode_tags( 

2424 " ".join(tagparts), no_unknown_starts=True 

2425 ) 

2426 # print("tagparts={!r} tagsets={} topics={} related={} " 

2427 # "alt_related={} distw={:.2f}" 

2428 # .format(tagparts, tagsets, topics, related, 

2429 # alt_related, 

2430 # distw(titleparts, parts[i - 1]))) 

2431 if ( 

2432 topics 

2433 or not tagsets 

2434 or any("error-unknown-tag" in x for x in tagsets) 

2435 ): 

2436 if alt_related is not None: 2436 ↛ 2437line 2436 didn't jump to line 2437 because the condition on line 2436 was never true

2437 break 

2438 continue 

2439 if ( 2439 ↛ 2444line 2439 didn't jump to line 2444

2440 i > 1 

2441 and len(parts[i - 1]) >= 4 

2442 and distw(titleparts, parts[i - 1]) <= 0.4 

2443 ): 

2444 alt_related = related 

2445 alt_tagsets = tagsets 

2446 continue 

2447 alt_related = None 

2448 alt_tagsets = None 

2449 break 

2450 else: 

2451 if alt_related is None: 

2452 # Check if the parenthesized part is likely a 

2453 # romanization 

2454 if ( 

2455 (have_ruby or classify_desc(base) == "other") 

2456 and classify_desc(paren) == "romanization" 

2457 and not ( 

2458 "categories" in data 

2459 and desc in data["categories"] 

2460 ) 

2461 ): 

2462 for r in split_at_comma_semi( 

2463 paren, extra=[" or "], skipped=links 

2464 ): 

2465 add_romanization( 

2466 wxr, 

2467 data, 

2468 r, 

2469 text, 

2470 is_reconstruction, 

2471 head_group, 

2472 ruby, 

2473 ) 

2474 have_romanization = True 

2475 continue 

2476 tagsets = [("error-unrecognized-head-form",)] 

2477 wxr.wtp.debug( 

2478 "unrecognized head form: {}".format(desc), 

2479 sortid="form_descriptions/1698", 

2480 ) 

2481 continue 

2482 

2483 if alt_related is not None: 2483 ↛ 2484line 2483 didn't jump to line 2484 because the condition on line 2483 was never true

2484 related = alt_related 

2485 tagsets = alt_tagsets 

2486 

2487 # print("FORM END: tagsets={} related={}".format(tagsets, related)) 

2488 if not tagsets: 2488 ↛ 2489line 2488 didn't jump to line 2489 because the condition on line 2488 was never true

2489 continue 

2490 

2491 assert isinstance(related, (list, tuple)) 

2492 related_str = " ".join(related) 

2493 if "or" in titleparts: 

2494 alts = [related_str] 

2495 else: 

2496 alts = split_at_comma_semi( 

2497 related_str, separators=[" or "], skipped=links 

2498 ) 

2499 if not alts: 

2500 alts = [""] 

2501 for related_str in alts: 

2502 if related_str: 

2503 if prev_tags and ( 

2504 all( 

2505 all( 

2506 t in ["nonstandard", "dialectal"] 

2507 or valid_tags[t] == "dialect" 

2508 for t in ts 

2509 ) 

2510 for ts in tagsets 

2511 ) 

2512 or ( 

2513 any("participle" in ts for ts in prev_tags) 

2514 and all( 

2515 "attributive" in ts 

2516 or any(valid_tags[t] == "gender" for t in ts) 

2517 for ts in tagsets 

2518 ) 

2519 ) 

2520 ): 

2521 # Merged with previous tags. Don't update previous 

2522 # tags here; cf. burn/English/Verb 

2523 for tags_l in tagsets: 

2524 for ts in prev_tags: 

2525 tags_l1 = list(sorted(set(tags_l) | set(ts))) 

2526 add_related( 

2527 wxr, 

2528 data, 

2529 tags_l1, 

2530 [related_str], 

2531 text, 

2532 True, 

2533 is_reconstruction, 

2534 head_group, 

2535 ruby, 

2536 ) 

2537 else: 

2538 # Not merged with previous tags 

2539 for tags_l in tagsets: 

2540 if following_tags is not None: 2540 ↛ 2541line 2540 didn't jump to line 2541 because the condition on line 2540 was never true

2541 for ts in following_tags: 

2542 tags_l1 = list( 

2543 sorted(set(tags_l) | set(ts)) 

2544 ) 

2545 add_related( 

2546 wxr, 

2547 data, 

2548 tags_l1, 

2549 [related_str], 

2550 text, 

2551 True, 

2552 is_reconstruction, 

2553 head_group, 

2554 ruby, 

2555 ) 

2556 else: 

2557 ret = add_related( 

2558 wxr, 

2559 data, 

2560 tags_l, 

2561 [related_str], 

2562 text, 

2563 True, 

2564 is_reconstruction, 

2565 head_group, 

2566 ruby, 

2567 ) 

2568 if ret is not None: 2568 ↛ 2569line 2568 didn't jump to line 2569 because the condition on line 2568 was never true

2569 following_tags = ret 

2570 prev_tags = tagsets 

2571 else: 

2572 if desc_i < len(new_desc) - 1 and all( 2572 ↛ 2579line 2572 didn't jump to line 2579 because the condition on line 2572 was never true

2573 "participle" in ts or "infinitive" in ts 

2574 for ts in tagsets 

2575 ): 

2576 # Interpret it as a standalone form description 

2577 # in the middle, probably followed by forms or 

2578 # language-specific descriptors. cf. drikke/Danish 

2579 new_prev_tags2 = [] 

2580 for ts1 in prev_tags or [()]: 

2581 for ts2 in tagsets: 

2582 ts = tuple(sorted(set(ts1) | set(ts2))) 

2583 new_prev_tags2.append(ts) 

2584 prev_tags = new_prev_tags2 

2585 continue 

2586 for tags in tagsets: 

2587 data_extend(data, "tags", tags) 

2588 prev_tags = tagsets 

2589 following_tags = None 

2590 

2591 # Finally, if we collected hiragana/katakana, add them now 

2592 if hiragana: 2592 ↛ 2593line 2592 didn't jump to line 2593 because the condition on line 2592 was never true

2593 add_related( 

2594 wxr, 

2595 data, 

2596 ["hiragana"], 

2597 [hiragana], 

2598 text, 

2599 True, 

2600 is_reconstruction, 

2601 head_group, 

2602 ruby, 

2603 ) 

2604 if katakana: 2604 ↛ 2605line 2604 didn't jump to line 2605 because the condition on line 2604 was never true

2605 add_related( 

2606 wxr, 

2607 data, 

2608 ["katakana"], 

2609 [katakana], 

2610 text, 

2611 True, 

2612 is_reconstruction, 

2613 head_group, 

2614 ruby, 

2615 ) 

2616 

2617 # XXX check if this is actually relevant, tags in word root data 

2618 # are extremely rare (not sure where they slip through). 

2619 tags = data.get("tags", []) # type:ignore 

2620 if len(tags) > 0: 

2621 # wxr.wtp.debug( 

2622 # f"Tags appear in word root data: {data['tags']=}", # type:ignore 

2623 # sortid="form_descriptions/2620/20240606", 

2624 # ) # Messes up tests. 

2625 data["tags"] = list(sorted(set(tags))) # type:ignore 

2626 

2627 
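# --- Illustrative sketch (not part of the original source code) ---
# parse_word_head() above collects parenthesized descriptors with two
# regexes: one for parentheses preceded by whitespace or start of string,
# and one for parentheses glued to the preceding token.  A standalone
# demonstration on a made-up head line (`re` is already imported above):
def _sketch_collect_parens() -> list[str]:
    text = "perro m (plural perros, feminine perra)"
    parens = list(
        m.group(2)
        for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text)
    )
    parens.extend(
        m.group(1)
        for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text)
    )
    # returns ["plural perros, feminine perra"]
    return parens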

2628def parse_sense_qualifier( 

2629 wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData] 

2630) -> None: 

2631 """Parses tags or topics for a sense or some other data. The values are 

2632 added into the dictionary ``data``.""" 

2633 assert isinstance(wxr, WiktextractContext) 

2634 assert isinstance(text, str) 

2635 assert isinstance(data, dict) 

2636 # print("parse_sense_qualifier:", text) 

2637 if re.match(r"\([^()]+\)$", text): 2637 ↛ 2638line 2637 didn't jump to line 2638 because the condition on line 2637 was never true

2638 text = text[1:-1] 

2639 if re.match(r'"[^"]+"$', text): 2639 ↛ 2640line 2639 didn't jump to line 2640 because the condition on line 2639 was never true

2640 text = text[1:-1] 

2641 lst = map_with(xlat_descs_map, [text]) 

2642 sense_tags: list[str] = [] 

2643 for text in lst: 

2644 for semi in split_at_comma_semi(text): 

2645 if not semi: 2645 ↛ 2646line 2645 didn't jump to line 2646 because the condition on line 2645 was never true

2646 continue 

2647 orig_semi = semi 

2648 idx = semi.find(":") 

2649 if idx >= 0: 2649 ↛ 2650line 2649 didn't jump to line 2650 because the condition on line 2649 was never true

2650 semi = semi[:idx] 

2651 cls = classify_desc(semi, allow_unknown_tags=True) 

2652 # print("parse_sense_qualifier: classify_desc: {} -> {}" 

2653 # .format(semi, cls)) 

2654 if cls == "tags": 2654 ↛ 2663line 2654 didn't jump to line 2663 because the condition on line 2654 was always true

2655 tagsets, topics = decode_tags(semi) 

2656 data_extend(data, "topics", topics) 

2657 # XXX should think how to handle distinct options better, 

2658 # e.g., "singular and plural genitive"; that can't really be 

2659 # done without changing the calling convention of this function. 

2660 # Should split sense if more than one category of tags differs. 

2661 for tags in tagsets: 

2662 sense_tags.extend(tags) 

2663 elif cls == "taxonomic": 

2664 if re.match(r"×[A-Z]", semi): 

2665 sense_tags.append("extinct") 

2666 semi = semi[1:] 

2667 data["taxonomic"] = semi 

2668 elif cls == "english": 

2669 if "qualifier" in data and data["qualifier"] != orig_semi: 

2670 data["qualifier"] += "; " + orig_semi 

2671 else: 

2672 data["qualifier"] = orig_semi 

2673 else: 

2674 wxr.wtp.debug( 

2675 "unrecognized sense qualifier: {}".format(text), 

2676 sortid="form_descriptions/1831", 

2677 ) 

2678 sense_tags = list(sorted(set(sense_tags))) 

2679 data_extend(data, "tags", sense_tags) 

2680 

2681 
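# --- Illustrative usage sketch (not part of the original source code) ---
# Rough idea of how parse_sense_qualifier() is used: the caller passes a
# sense/linkage dict together with the qualifier text, and tags, topics or
# a "qualifier" field are appended in place.  The input string is
# hypothetical and `wxr` is assumed to be an initialized WiktextractContext:
def _sketch_sense_qualifier(wxr: WiktextractContext) -> dict:
    sense: dict = {}  # SenseData-like dict
    parse_sense_qualifier(wxr, "archaic, dialectal", sense)
    # sense might now look like {"tags": ["archaic", "dialectal"]}
    return sense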

2682def parse_pronunciation_tags( 

2683 wxr: WiktextractContext, text: str, data: SoundData 

2684) -> None: 

2685 assert isinstance(wxr, WiktextractContext) 

2686 assert isinstance(text, str) 

2687 assert isinstance(data, dict) 

2688 text = text.strip() 

2689 if not text: 2689 ↛ 2690line 2689 didn't jump to line 2690 because the condition on line 2689 was never true

2690 return 

2691 cls = classify_desc(text) 

2692 notes = [] 

2693 if cls == "tags": 

2694 tagsets, topics = decode_tags(text) 

2695 data_extend(data, "topics", topics) 

2696 for tagset in tagsets: 

2697 for t in tagset: 

2698 if " " in t: 2698 ↛ 2699line 2698 didn't jump to line 2699 because the condition on line 2698 was never true

2699 notes.append(t) 

2700 else: 

2701 data_append(data, "tags", t) 

2702 else: 

2703 notes.append(text) 

2704 if notes: 

2705 data["note"] = "; ".join(notes) 

2706 

2707 
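# --- Illustrative usage sketch (not part of the original source code) ---
# parse_pronunciation_tags() either appends decoded tags to the SoundData
# dict or, when the text does not decode to tags, stores it verbatim under
# "note".  Hypothetical inputs; `wxr` is assumed to be an initialized
# WiktextractContext:
def _sketch_pronunciation_tags(wxr: WiktextractContext) -> tuple[dict, dict]:
    sound1: dict = {}
    sound2: dict = {}
    parse_pronunciation_tags(wxr, "UK", sound1)  # likely {"tags": ["UK"]}
    parse_pronunciation_tags(wxr, "before a vowel", sound2)  # likely {"note": "before a vowel"}
    return sound1, sound2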

2708def parse_translation_desc( 

2709 wxr: WiktextractContext, lang: str, text: str, tr: TranslationData 

2710) -> None: 

2711 assert isinstance(wxr, WiktextractContext) 

2712 assert isinstance(lang, str) # The language of ``text`` 

2713 assert isinstance(text, str) 

2714 assert isinstance(tr, dict) 

2715 # print("parse_translation_desc:", text) 

2716 

2717 # Process all parenthesized parts from the translation item 

2718 note = None 

2719 restore_beginning = "" 

2720 restore_end = "" 

2721 while True: 

2722 beginning = False 

2723 # See if we can find a parenthesized expression at the end 

2724 m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text) 

2725 if m: 

2726 par = m.group(1) 

2727 text = text[: m.start()] 

2728 if par.startswith(("literally ", "lit.")): 2728 ↛ 2729line 2728 didn't jump to line 2729 because the condition on line 2728 was never true

2729 continue # Not useful for disambiguation in many idioms 

2730 else: 

2731 # See if we can find a parenthesized expression at the start 

2732 m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text) 

2733 if m: 2733 ↛ 2734line 2733 didn't jump to line 2734 because the condition on line 2733 was never true

2734 par = m.group(1) 

2735 text = text[m.end() :] 

2736 beginning = True 

2737 if re.match(r"^(\d|\s|,| or | and )+$", par): 

2738 # Looks like this beginning parenthesized expression only 

2739 # contains digits or their combinations. We assume such 

2740 # to be sense descriptions if no sense has been selected, 

2741 # or otherwise just ignore them. 

2742 if not tr.get("sense"): 

2743 tr["sense"] = par 

2744 continue 

2745 else: 

2746 # See if we can find a parenthesized expression in the middle. 

2747 # Romanizations are sometimes between word and gender marker, 

2748 # e.g. wife/English/Tr/Yiddish. 

2749 m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text) 

2750 if m: 2750 ↛ 2751line 2750 didn't jump to line 2751 because the condition on line 2750 was never true

2751 par = m.group(1) 

2752 text = text[: m.start()] + text[m.end() :] 

2753 else: 

2754 # No more parenthesized expressions - break out of the loop 

2755 break 

2756 

2757 # Some cleanup of artifacts that may result from skipping some templates 

2758 # in earlier stages 

2759 if par.startswith(": "): 2759 ↛ 2760line 2759 didn't jump to line 2760 because the condition on line 2759 was never true

2760 par = par[2:] 

2761 if par.endswith(","): 2761 ↛ 2762line 2761 didn't jump to line 2762 because the condition on line 2761 was never true

2762 par = par[:-1] 

2763 if re.match(r'^[“"]([^“”"]*)[“”"]$', par): 2763 ↛ 2764line 2763 didn't jump to line 2764 because the condition on line 2763 was never true

2764 par = par[1:-1] 

2765 par = par.strip() 

2766 

2767 # Check for special script pronunciation followed by romanization, 

2768 # used in many Asian languages. 

2769 lst = par.split(", ") 

2770 if len(lst) == 2: 2770 ↛ 2771line 2770 didn't jump to line 2771 because the condition on line 2770 was never true

2771 a, r = lst 

2772 if classify_desc(a) == "other": 

2773 cls = classify_desc(r) 

2774 # print("parse_translation_desc: r={} cls={}".format(r, cls)) 

2775 if cls == "romanization" or ( 

2776 cls == "english" and len(r.split()) == 1 and r[0].islower() 

2777 ): 

2778 if tr.get("alt") and tr.get("alt") != a: 

2779 wxr.wtp.debug( 

2780 'more than one value in "alt": {} vs. {}'.format( 

2781 tr["alt"], a 

2782 ), 

2783 sortid="form_descriptions/1930", 

2784 ) 

2785 tr["alt"] = a 

2786 if tr.get("roman") and tr.get("roman") != r: 

2787 wxr.wtp.debug( 

2788 'more than one value in "roman": ' 

2789 "{} vs. {}".format(tr["roman"], r), 

2790 sortid="form_descriptions/1936", 

2791 ) 

2792 tr["roman"] = r 

2793 continue 

2794 

2795 # Check for certain comma-separated tags combined with English text 

2796 # at the beginning or end of a comma-separated parenthesized list 

2797 while len(lst) > 1: 2797 ↛ 2798line 2797 didn't jump to line 2798 because the condition on line 2797 was never true

2798 cls = classify_desc(lst[0]) 

2799 if cls == "tags": 

2800 tagsets, topics = decode_tags(lst[0]) 

2801 for t in tagsets: 

2802 data_extend(tr, "tags", t) 

2803 data_extend(tr, "topics", topics) 

2804 lst = lst[1:] 

2805 continue 

2806 cls = classify_desc(lst[-1]) 

2807 if cls == "tags": 

2808 tagsets, topics = decode_tags(lst[-1]) 

2809 for t in tagsets: 

2810 data_extend(tr, "tags", t) 

2811 data_extend(tr, "topics", topics) 

2812 lst = lst[:-1] 

2813 continue 

2814 break 

2815 par = ", ".join(lst) 

2816 

2817 if not par: 2817 ↛ 2818line 2817 didn't jump to line 2818 because the condition on line 2817 was never true

2818 continue 

2819 if re.search(tr_ignored_parens_re, par): 2819 ↛ 2820line 2819 didn't jump to line 2820 because the condition on line 2819 was never true

2820 continue 

2821 if par.startswith("numeral:"): 2821 ↛ 2822line 2821 didn't jump to line 2822 because the condition on line 2821 was never true

2822 par = par[8:].strip() 

2823 

2824 # Classify the part in parenthesis and process accordingly 

2825 cls = classify_desc(par) 

2826 # print("parse_translation_desc classify: {!r} -> {}" 

2827 # .format(par, cls)) 

2828 if par == text: 2828 ↛ 2829line 2828 didn't jump to line 2829 because the condition on line 2828 was never true

2829 pass 

2830 if par == "f": 2830 ↛ 2831line 2830 didn't jump to line 2831 because the condition on line 2830 was never true

2831 data_append(tr, "tags", "feminine") 

2832 elif par == "m": 2832 ↛ 2833line 2832 didn't jump to line 2833 because the condition on line 2832 was never true

2833 data_append(tr, "tags", "masculine") 

2834 elif cls == "tags": 2834 ↛ 2835line 2834 didn't jump to line 2835 because the condition on line 2834 was never true

2835 tagsets, topics = decode_tags(par) 

2836 for tags in tagsets: 

2837 data_extend(tr, "tags", tags) 

2838 data_extend(tr, "topics", topics) 

2839 elif cls == "english": 

2840 # If the text contains any of certain grammatical words, treat it 

2841 # as a "note" instead of "english" 

2842 if re.search(tr_note_re, par): 2842 ↛ 2843line 2842 didn't jump to line 2843 because the condition on line 2842 was never true

2843 if par.endswith(":"): 

2844 par = par[:-1] 

2845 if par not in ("see entry for forms",): 

2846 if note: 

2847 note = note + ";" + par 

2848 else: 

2849 note = par 

2850 else: 

2851 # There can be more than one parenthesized english item, see 

2852 # e.g. Aunt/English/Translations/Tamil 

2853 if tr.get("english"): 2853 ↛ 2854line 2853 didn't jump to line 2854 because the condition on line 2853 was never true

2854 tr["english"] += "; " + par 

2855 else: 

2856 tr["english"] = par 

2857 elif cls == "romanization": 2857 ↛ 2877line 2857 didn't jump to line 2877 because the condition on line 2857 was always true

2858 # print("roman text={!r} text cls={}" 

2859 # .format(text, classify_desc(text))) 

2860 if classify_desc(text) in ( 2860 ↛ 2864line 2860 didn't jump to line 2864 because the condition on line 2860 was never true

2861 "english", 

2862 "romanization", 

2863 ) and lang not in ("Egyptian",): 

2864 if beginning: 

2865 restore_beginning += "({}) ".format(par) 

2866 else: 

2867 restore_end = " ({})".format(par) + restore_end 

2868 else: 

2869 if tr.get("roman"): 2869 ↛ 2870line 2869 didn't jump to line 2870 because the condition on line 2869 was never true

2870 wxr.wtp.debug( 

2871 'more than one value in "roman": {} vs. {}'.format( 

2872 tr["roman"], par 

2873 ), 

2874 sortid="form_descriptions/2013", 

2875 ) 

2876 tr["roman"] = par 

2877 elif cls == "taxonomic": 

2878 if tr.get("taxonomic"): 

2879 wxr.wtp.debug( 

2880 'more than one value in "taxonomic": {} vs. {}'.format( 

2881 tr["taxonomic"], par 

2882 ), 

2883 sortid="form_descriptions/2019", 

2884 ) 

2885 if re.match(r"×[A-Z]", par): 

2886 data_append(tr, "tags", "extinct") 

2887 par = par[1:] 

2888 tr["taxonomic"] = par 

2889 elif cls == "other": 

2890 if tr.get("alt"): 

2891 wxr.wtp.debug( 

2892 'more than one value in "alt": {} vs. {}'.format( 

2893 tr["alt"], par 

2894 ), 

2895 sortid="form_descriptions/2028", 

2896 ) 

2897 tr["alt"] = par 

2898 else: 

2899 wxr.wtp.debug( 

2900 "parse_translation_desc unimplemented cls {}: {}".format( 

2901 cls, par 

2902 ), 

2903 sortid="form_descriptions/2033", 

2904 ) 

2905 

2906 # Check for gender indications in suffix 

2907 text, final_tags = parse_head_final_tags(wxr, lang, text) 

2908 data_extend(tr, "tags", final_tags) 

2909 

2910 # Restore those parts that we did not want to remove (they are often 

2911 # optional words or words that are always used with the given translation) 

2912 text = restore_beginning + text + restore_end 

2913 

2914 if note: 2914 ↛ 2915line 2914 didn't jump to line 2915 because the condition on line 2914 was never true

2915 tr["note"] = note.strip() 

2916 if text and text not in ignored_translations: 2916 ↛ 2921line 2916 didn't jump to line 2921 because the condition on line 2916 was always true

2917 tr["word"] = text.strip() 

2918 

2919 # Sometimes gender seems to be at the end of "roman" field, see e.g. 

2920 # fire/English/Noun/Translations/Egyptian (for "oxidation reaction") 

2921 roman = tr.get("roman") 

2922 if roman: 

2923 if roman.endswith(" f"): 2923 ↛ 2924line 2923 didn't jump to line 2924 because the condition on line 2923 was never true

2924 data_append(tr, "tags", "feminine") 

2925 tr["roman"] = roman[:-2].strip() 

2926 elif roman.endswith(" m"): 2926 ↛ 2927line 2926 didn't jump to line 2927 because the condition on line 2926 was never true

2927 data_append(tr, "tags", "masculine") 

2928 tr["roman"] = roman[:-2].strip() 

2929 

2930 # If the word now has "english" field but no "roman" field, and 

2931 # the word would be classified "other" (generally non-latin 

2932 # characters), and the value in "english" is only one lowercase 

2933 # word, move it to "roman". This happens semi-frequently when the 

2934 # translation is transliterated the same as some English word. 

2935 roman = tr.get("roman") 

2936 english = tr.get("english") 

2937 if english and not roman and "word" in tr: 

2938 cls = classify_desc(tr["word"]) 

2939 if cls == "other" and " " not in english and english[0].islower(): 2939 ↛ 2946line 2939 didn't jump to line 2946 because the condition on line 2939 was always true

2940 del tr["english"] 

2941 tr["roman"] = english 

2942 

2943 # If the entry now has both tr["roman"] and tr["word"] and they have 

2944 # the same value, delete tr["roman"] (e.g., man/English/Translations 

2945 # Evenki) 

2946 if tr.get("word") and tr.get("roman") == tr.get("word"): 2946 ↛ 2947line 2946 didn't jump to line 2947 because the condition on line 2946 was never true

2947 del tr["roman"] 

2948 

2949 
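# --- Illustrative sketch (not part of the original source code) ---
# parse_translation_desc() above peels parenthesized expressions off the
# translation text one at a time: first from the end, then from the start,
# then from the middle, until none remain.  A simplified standalone version
# of that loop, on a made-up translation item (`re` is imported above):
def _sketch_peel_parens() -> tuple[str, list[str]]:
    text = "жена (žena) f (dialectal)"
    parts: list[str] = []
    while True:
        m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text)  # at the end
        if m:
            text = text[: m.start()]
        else:
            m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text)  # at the start
            if m:
                text = text[m.end() :]
            else:
                m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text)  # in the middle
                if not m:
                    break
                text = text[: m.start()] + text[m.end() :]
        parts.append(m.group(1))
    # returns ("жена f", ["dialectal", "žena"])
    return text, parts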

2950def parse_alt_or_inflection_of( 

2951 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str] 

2952) -> Optional[tuple[list[str], Optional[list[AltOf]]]]: 

2953 """Tries to parse an inflection-of or alt-of description. If successful, 

2954 this returns (tags, alt-of/inflection-of-dict). If the description cannot 

2955 be parsed, this returns None. This may also return (tags, None) when the 

2956 gloss describes a form (or some other tags were extracted from it), but 

2957 there was no alt-of/form-of/synonym-of word.""" 

2958 # print("parse_alt_or_inflection_of: {!r}".format(gloss)) 

2959 # Occasionally inflection_of/alt_of have "A(n) " etc. at the beginning. 

2960 

2961 # Never interpret a gloss that is equal to the word itself as a tag 

2962 # (e.g., instrumental/Romanian, instrumental/Spanish). 

2963 if gloss.lower() == wxr.wtp.title.lower() or ( # type:ignore[union-attr] 2963 ↛ 2966line 2963 didn't jump to line 2966 because the condition on line 2963 was never true

2964 len(gloss) >= 5 and distw([gloss.lower()], wxr.wtp.title.lower()) < 0.2 # type:ignore[union-attr] 

2965 ): 

2966 return None 

2967 

2968 # First try parsing it as-is 

2969 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args) 

2970 if parsed is not None: 

2971 return parsed 

2972 

2973 # Next try parsing it with the first character converted to lowercase if 

2974 # it was previously uppercase. 

2975 if gloss and gloss[0].isupper(): 

2976 gloss = gloss[0].lower() + gloss[1:] 

2977 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args) 

2978 if parsed is not None: 2978 ↛ 2979line 2978 didn't jump to line 2979 because the condition on line 2978 was never true

2979 return parsed 

2980 

2981 return None 

2982 

2983 
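# --- Illustrative usage sketch (not part of the original source code) ---
# Rough idea of what parse_alt_or_inflection_of() returns for a typical
# form-of gloss.  The gloss is hypothetical and `wxr` is assumed to be an
# initialized WiktextractContext:
def _sketch_alt_or_inflection_of(wxr: WiktextractContext):
    ret = parse_alt_or_inflection_of(wxr, "genitive singular of talo", set())
    # ret might be (["form-of", "genitive", "singular"], [{"word": "talo"}])
    return ret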

2984# These tags are not allowed in alt-or-inflection-of parsing 

2985alt_infl_disallowed: set[str] = set( 

2986 [ 

2987 "error-unknown-tag", 

2988 "place", # Not in inflected forms and causes problems e.g. house/English 

2989 ] 

2990) 

2991 

2992 

2993def parse_alt_or_inflection_of1( 

2994 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str] 

2995) -> Optional[tuple[list[str], Optional[list[AltOf]]]]: 

2996 """Helper function for parse_alt_or_inflection_of. This handles a single 

2997 capitalization.""" 

2998 if not gloss or not gloss.strip(): 2998 ↛ 2999line 2998 didn't jump to line 2999 because the condition on line 2998 was never true

2999 return None 

3000 

3001 # Prevent some common errors where we would parse something we shouldn't 

3002 if re.search(r"(?i)form of address ", gloss): 3002 ↛ 3003line 3002 didn't jump to line 3003 because the condition on line 3002 was never true

3003 return None 

3004 

3005 gloss = re.sub(r"only used in [^,]+, ", "", gloss) 

3006 

3007 # First try all formats ending with "of" (or other known last words that 

3008 # can end a form description) 

3009 matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss)) 

3010 m: Optional[re.Match] 

3011 for m in reversed(matches): 

3012 desc = gloss[: m.end()].strip() 

3013 base = gloss[m.end() :].strip() 

3014 tagsets, topics = decode_tags(desc, no_unknown_starts=True) 

3015 if not topics and any( 

3016 not (alt_infl_disallowed & set(ts)) for ts in tagsets 

3017 ): 

3018 # Successfully parsed, including "of" etc. 

3019 tags: list[str] = [] 

3020 # If you have ("Western-Armenian", ..., "form-of") as your 

3021 # tag set, it's most probable that it's something like 

3022 # "Western Armenian form of խոսել (xosel)", which should 

3023 # get "alt-of" instead of "form-of" (inflection). 

3024 # խօսիլ/Armenian 

3025 for ts_t in tagsets: 

3026 if "form-of" in ts_t and any( 

3027 valid_tags.get(tk) == "dialect" for tk in ts_t 

3028 ): 

3029 ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"} 

3030 else: 

3031 ts_s = set(ts_t) 

3032 if not (alt_infl_disallowed & ts_s): 3032 ↛ 3025line 3032 didn't jump to line 3025 because the condition on line 3032 was always true

3033 tags.extend(ts_s) 

3034 if ( 

3035 "alt-of" in tags 

3036 or "form-of" in tags 

3037 or "synonym-of" in tags 

3038 or "compound-of" in tags 

3039 ): 

3040 break 

3041 if m.group(1) == "of": 

3042 # Try parsing without the final "of". This is commonly used in 

3043 # various form-of expressions. 

3044 desc = gloss[: m.start()] 

3045 base = gloss[m.end() :] 

3046 tagsets, topics = decode_tags(desc, no_unknown_starts=True) 

3047 # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}" 

3048 # .format(desc, base, tagsets, topics)) 

3049 if not topics and any( 

3050 not (alt_infl_disallowed & set(t)) for t in tagsets 

3051 ): 

3052 tags = [] 

3053 for t in tagsets: 

3054 if not (alt_infl_disallowed & set(t)): 3054 ↛ 3053line 3054 didn't jump to line 3053 because the condition on line 3054 was always true

3055 tags.extend(t) 

3056 # It must have at least one tag from form_of_tags 

3057 if set(tags) & form_of_tags: 3057 ↛ 3061line 3057 didn't jump to line 3061 because the condition on line 3057 was always true

3058 # Accept this as form-of 

3059 tags.append("form-of") 

3060 break 

3061 if set(tags) & alt_of_tags: 

3062 # Accept this as alt-of 

3063 tags.append("alt-of") 

3064 break 

3065 

3066 else: 

3067 # Did not find a form description based on last word; see if the 

3068 # whole description is tags 

3069 tagsets, topics = decode_tags(gloss, no_unknown_starts=True) 

3070 if not topics and any( 

3071 not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts) 

3072 for ts in tagsets 

3073 ): 

3074 tags = [] 

3075 for ts in tagsets: 

3076 if not (alt_infl_disallowed & set(ts)) and form_of_tags & set( 3076 ↛ 3075line 3076 didn't jump to line 3075 because the condition on line 3076 was always true

3077 ts 

3078 ): 

3079 tags.extend(ts) 

3080 base = "" 

3081 else: 

3082 return None 

3083 

3084 # kludge for Spanish (again): 'x of [word] combined with [clitic]' 

3085 m = re.search(r"combined with \w+$", base) 

3086 if m: 3086 ↛ 3087line 3086 didn't jump to line 3087 because the condition on line 3086 was never true

3087 tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True) 

3088 if not topics: 

3089 for ts in tagsets: 

3090 tags.extend(ts) 

3091 base = base[: m.start()] 

3092 

3093 # It is fairly common for form_of glosses to end with something like 

3094 # "ablative case" or "in instructive case". Parse that ending. 

3095 base = base.strip() 

3096 lst = base.split() 

3097 # print("parse_alt_or_inflection_of: lst={}".format(lst)) 

3098 if len(lst) >= 3 and lst[-1] in ("case", "case."): 3098 ↛ 3099line 3098 didn't jump to line 3099 because the condition on line 3098 was never true

3099 node = valid_sequences.children.get(lst[-2]) 

3100 if node and node.end: 

3101 for s in node.tags: 

3102 tags.extend(s.split(" ")) 

3103 lst = lst[:-2] 

3104 if lst[-1] == "in" and len(lst) > 1: 

3105 lst = lst[:-1] 

3106 

3107 # Eliminate empty and duplicate tags 

3108 tags = list(sorted(set(t for t in tags if t))) 

3109 

3110 # Clean up some extra stuff from the linked word, separating the text 

3111 # into ``base`` (the linked word) and ``extra`` (additional information, 

3112 # such as English translation or clarifying word sense information). 

3113 orig_base = base 

3114 base = re.sub(alt_of_form_of_clean_re, "", orig_base) 

3115 base = re.sub(r" [(⟨][^()]*[)⟩]", "", base) # Remove all (...) groups 

3116 extra = orig_base[len(base) :] 

3117 extra = re.sub(r"^[- :;.,,—]+", "", extra) 

3118 if extra.endswith(".") and extra.count(".") == 1: 3118 ↛ 3119line 3118 didn't jump to line 3119 because the condition on line 3118 was never true

3119 extra = extra[:-1].strip() 

3120 m = re.match(r"^\(([^()]*)\)$", extra) 

3121 if m: 3121 ↛ 3122line 3121 didn't jump to line 3122 because the condition on line 3121 was never true

3122 extra = m.group(1) 

3123 else: 

3124 # These weird brackets are used in "slash mark" 

3125 m = re.match(r"^⟨([^()]*)⟩$", extra) 

3126 if m: 3126 ↛ 3127line 3126 didn't jump to line 3127 because the condition on line 3126 was never true

3127 extra = m.group(1) 

3128 m = re.match(r'^[“"]([^"“”]*)["”]$', extra) 

3129 if m: 3129 ↛ 3130line 3129 didn't jump to line 3130 because the condition on line 3129 was never true

3130 extra = m.group(1) 

3131 # Note: base might still contain comma-separated values and values 

3132 # separated by "and" 

3133 base = base.strip() 

3134 if base.endswith(",") and len(base) > 2: 3134 ↛ 3135line 3134 didn't jump to line 3135 because the condition on line 3134 was never true

3135 base = base[:-1].strip() 

3136 while ( 

3137 base.endswith(".") 

3138 and not wxr.wtp.page_exists(base) 

3139 and base not in gloss_template_args 

3140 ): 

3141 base = base[:-1].strip() 

3142 if base.endswith('(\u201cconjecture")'): 3142 ↛ 3143line 3142 didn't jump to line 3143 because the condition on line 3142 was never true

3143 base = base[:-14].strip() 

3144 tags.append("conjecture") 

3145 while ( 3145 ↛ 3150line 3145 didn't jump to line 3150

3146 base.endswith(".") 

3147 and not wxr.wtp.page_exists(base) 

3148 and base not in gloss_template_args 

3149 ): 

3150 base = base[:-1].strip() 

3151 if ( 3151 ↛ 3156line 3151 didn't jump to line 3156

3152 base.endswith(".") 

3153 and base not in gloss_template_args 

3154 and base[:-1] in gloss_template_args 

3155 ): 

3156 base = base[:-1] 

3157 base = base.strip() 

3158 if not base: 

3159 return tags, None 

3160 

3161 # Kludge: Spanish verb forms seem to have a dot added at the end. 

3162 # Remove it; we know of no Spanish verbs ending with a dot. 

3163 language = wxr.wtp.section 

3164 pos = wxr.wtp.subsection 

3165 # print("language={} pos={} base={}".format(language, pos, base)) 

3166 if ( 3166 ↛ 3172line 3166 didn't jump to line 3172

3167 base.endswith(".") 

3168 and len(base) > 1 

3169 and base[-2].isalpha() 

3170 and (language == "Spanish" and pos == "Verb") 

3171 ): 

3172 base = base[:-1] 

3173 

3174 # Split base to alternatives when multiple alternatives provided 

3175 parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "]) 

3176 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "") 

3177 if ( 3177 ↛ 3186line 3177 didn't jump to line 3186

3178 len(parts) <= 1 

3179 or base.startswith("/") 

3180 or base.endswith("/") 

3181 or "/" in titleword 

3182 ): 

3183 parts = [base] 

3184 # Split base to alternatives when of form "a or b" and "a" and "b" are 

3185 # similar (generally spelling variants of the same word or similar words) 

3186 if len(parts) == 1: 3186 ↛ 3192line 3186 didn't jump to line 3192 because the condition on line 3186 was always true

3187 pp = base.split() 

3188 if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4: 

3189 parts = [pp[0], pp[2]] 

3190 
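# Illustrative example of the "a or b" split above: a base like
# "colour or color" would be expected to become ["colour", "color"],
# assuming distw() scores the pair well below the 0.4 threshold.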

3191 # Create form-of/alt-of entries based on the extracted data 

3192 dt_lst: list[AltOf] = [] 

3193 for p in parts: 

3194 # Check for some suspicious base forms 

3195 m = re.search(r"[.,] |[{}()]", p) 

3196 if m and not wxr.wtp.page_exists(p): 

3197 wxr.wtp.debug( 

3198 "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p), 

3199 sortid="form_descriptions/2278", 

3200 ) 

3201 if p.startswith("*") and len(p) >= 3 and p[1].isalpha(): 

3202 p = p[1:] 

3203 dt: AltOf = {"word": p} 

3204 if extra: 

3205 dt["extra"] = extra 

3206 dt_lst.append(dt) 

3207 # print("alt_or_infl_of returning tags={} lst={} base={!r}" 

3208 # .format(tags, lst, base)) 

3209 return tags, dt_lst 

3210 

3211 
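# Rough sketch of the list built above (hypothetical input, and assuming the
# cleanup regexes leave the word itself untouched): a base such as
# "foo (obsolete spelling)" would be expected to produce
#     dt_lst == [{"word": "foo", "extra": "obsolete spelling"}]
# with the "extra" key present only when trailing parenthesized or quoted
# text was captured.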

3212@functools.lru_cache(maxsize=65536) 

3213def classify_desc( 

3214 desc: str, 

3215 allow_unknown_tags=False, 

3216 no_unknown_starts=False, 

3217 accepted: Union[tuple[str, ...], frozenset[str]] = tuple(), 

3218) -> str: 

3219 """Determines whether the given description is most likely tags, English, 

3220 a taxonomic name, a romanization, or something else. Returns one of: 

3221 "tags", "english", "taxonomic", "romanization", or "other". If 

3222 ``allow_unknown_tags`` is True, then allow "tags" classification even 

3223 when the only tags are those starting with a word in allowed_unknown_starts.""" 

3224 assert isinstance(desc, str) 

3225 # Empty and whitespace-only strings are treated as "other" 

3226 desc = desc.strip() 

3227 if not desc: 

3228 return "other" 

3229 

3230 normalized_desc = unicodedata.normalize("NFKD", desc) 

3231 

3232 # If it can be fully decoded as tags without errors, treat as tags 

3233 tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts) 

3234 for tagset in tagsets: 

3235 assert isinstance(tagset, (list, tuple, set)) 

3236 if "error-unknown-tag" not in tagset and ( 

3237 topics or allow_unknown_tags or any(" " not in x for x in tagset) 

3238 ): 

3239 return "tags" 

3240 

3241 # Check if it looks like the taxonomic name of a species 

3242 if desc in known_species: 

3243 return "taxonomic" 

3244 desc1 = re.sub(r"^×([A-Z])", r"\1", desc) 

3245 desc1 = re.sub(r"\s*×.*", "", desc1) 

3246 lst = desc1.split() 

3247 if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts: 

3248 have_non_english = 1 if lst[0].lower() not in english_words else 0 

3249 for x in lst[1:]: 

3250 if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"): 

3251 continue 

3252 if x[0].isupper(): 

3253 break 

3254 if x not in english_words: 

3255 have_non_english += 1 

3256 else: 

3257 # Starts with known taxonomic term, does not contain uppercase 

3258 # words (except allowed letters) and at least one word is not 

3259 # English 

3260 if have_non_english >= len(lst) - 1 and have_non_english > 0: 

3261 return "taxonomic" 

3262 
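# Hypothetical example of the heuristic above: "Quercus robur" would be
# expected to come out as "taxonomic", assuming "Quercus" is listed in
# known_firsts and "robur" is not in english_words.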

3263 # If all words are in our English dictionary, interpret as English. 

3264 # [ -~] is a regex character range covering all printable ASCII 

3265 # characters, i.e. everything from space to tilde. 

3266 if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1: 

3267 if desc in english_words and desc[0].isalpha(): 

3268 return "english" # Handles ones containing whitespace 

3269 desc1 = re.sub( 

3270 tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc 

3271 ) 

3272 tokens = tokenizer.tokenize(desc1) 

3273 if not tokens: 

3274 return "other" 

3275 lst_bool = list( 

3276 x not in not_english_words 

3277 and 

3278 # not x.isdigit() and 

3279 ( 

3280 x in english_words 

3281 or x.lower() in english_words 

3282 or x in known_firsts 

3283 or x[0].isdigit() 

3284 or x in accepted 

3285 or 

3286 # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or 

3287 ( 

3288 x.endswith("s") and len(x) >= 4 and x[:-1] in english_words 

3289 ) # Plural 

3290 or ( 

3291 x.endswith("ies") 

3292 and len(x) >= 5 

3293 and x[:-3] + "y" in english_words 

3294 ) # E.g. lily - lilies 

3295 or ( 

3296 x.endswith("ing") 

3297 and len(x) >= 5 

3298 and x[:-3] in english_words 

3299 ) # E.g. bring - bringing 

3300 or ( 

3301 x.endswith("ing") 

3302 and len(x) >= 5 

3303 and x[:-3] + "e" in english_words 

3304 ) # E.g., tone - toning 

3305 or ( 

3306 x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words 

3307 ) # E.g. hang - hanged 

3308 or ( 

3309 x.endswith("ed") 

3310 and len(x) >= 5 

3311 and x[:-2] + "e" in english_words 

3312 ) # E.g. atone - atoned 

3313 or (x.endswith("'s") and x[:-2] in english_words) 

3314 or (x.endswith("s'") and x[:-2] in english_words) 

3315 or ( 

3316 x.endswith("ise") 

3317 and len(x) >= 5 

3318 and x[:-3] + "ize" in english_words 

3319 ) 

3320 or ( 

3321 x.endswith("ised") 

3322 and len(x) >= 6 

3323 and x[:-4] + "ized" in english_words 

3324 ) 

3325 or ( 

3326 x.endswith("ising") 

3327 and len(x) >= 7 

3328 and x[:-5] + "izing" in english_words 

3329 ) 

3330 or ( 

3331 re.search(r"[-/]", x) 

3332 and all( 

3333 ((y in english_words and len(y) > 2) or not y) 

3334 for y in re.split(r"[-/]", x) 

3335 ) 

3336 ) 

3337 ) 

3338 for x in tokens 

3339 ) 

3340 cnt = lst_bool.count(True) 

3341 rejected_words = tuple( 

3342 x for i, x in enumerate(tokens) if not lst_bool[i] 

3343 ) 

3344 if ( 

3345 any( 

3346 lst_bool[i] and x[0].isalpha() and len(x) > 1 

3347 for i, x in enumerate(tokens) 

3348 ) 

3349 and not desc.startswith("-") 

3350 and not desc.endswith("-") 

3351 and re.search(r"\w+", desc) 

3352 and ( 

3353 cnt == len(lst_bool) 

3354 or ( 

3355 any( 

3356 lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens) 

3357 ) 

3358 and cnt >= len(lst_bool) - 1 

3359 ) 

3360 or cnt / len(lst_bool) >= 0.8 

3361 or ( 

3362 all(x in potentially_english_words for x in rejected_words) 

3363 and cnt / len(lst_bool) >= 0.50 

3364 ) 

3365 ) 

3366 ): 

3367 return "english" 

3368 # Some translations have apparent pronunciation descriptions in /.../ 

3369 # which we'll put in the romanization field (even though they probably are 

3370 # not exactly romanizations). 

3371 if desc.startswith("/") and desc.endswith("/"): 

3372 return "romanization" 

3373 # If all characters are in classes that could occur in romanizations, 

3374 # treat as romanization 

3375 classes = list( 

3376 unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK" 

3377 for x in normalized_desc 

3378 ) 

3379 classes1 = [] 

3380 num_latin = 0 

3381 num_greek = 0 

3382 # part = "" 

3383 # for ch, cl in zip(normalized_desc, classes): 

3384 # part += f"{ch}({cl})" 

3385 # print(part) 

3386 for ch, cl in zip(normalized_desc, classes): 

3387 if ch in ( 

3388 "'", # ' in Arabic, / in IPA-like parenthesized forms 

3389 ".", # e.g., "..." in translations 

3390 ";", 

3391 ":", 

3392 "!", 

3393 "‘", 

3394 "’", 

3395 '"', 

3396 "“", 

3397 "”", 

3398 "/", 

3399 "?", 

3400 "…", # alternative to "..." 

3401 "⁉", # 見る/Japanese automatic transcriptions... 

3402 "?", 

3403 "!", 

3404 "⁻", # superscript minus, used in some Cantonese romanizations, e.g. "we" 

3405 "ʔ", 

3406 "ʼ", 

3407 "ʾ", 

3408 "ʹ", 

3409 ): # ʹ e.g. in understand/English/verb Russian transl 

3410 classes1.append("OK") 

3411 continue 

3412 if cl not in ("Ll", "Lu"): 

3413 classes1.append(cl) 

3414 continue 

3415 try: 

3416 name = unicodedata.name(ch) 

3417 first = name.split()[0] 

3418 if first == "LATIN": 

3419 num_latin += 1 

3420 elif first == "GREEK": 

3421 num_greek += 1 

3422 elif first == "COMBINING": # Combining diacritic 

3423 cl = "OK" 

3424 elif re.match(non_latin_scripts_re, name): 

3425 cl = "NO" # Not acceptable in romanizations 

3426 except ValueError: 

3427 cl = "NO" # Not acceptable in romanizations 

3428 classes1.append(cl) 

3429 # print("classify_desc: {!r} classes1: {}".format(desc, classes1)) 

3430 # print(set(classes1) ) 

3431 if all( 

3432 x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK") 

3433 for x in classes1 

3434 ): 

3435 if ( 

3436 (num_latin >= num_greek + 2 or num_greek == 0) 

3437 and classes1.count("OK") < len(classes1) 

3438 and classes1.count("Nd") < len(classes1) 

3439 ): 

3440 return "romanization" 

3442 # Otherwise it is something else, such as a hanji version of the word 

3442 return "other" 

3443 

3444 
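# Illustrative expectations for classify_desc() (hypothetical inputs, not
# verified test cases; actual results depend on the tag and word lists):
#     classify_desc("nominative plural")  -> "tags"
#     classify_desc("a small dog")        -> "english"
#     classify_desc("犬")                  -> "other"   (CJK script rejected)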

3445def remove_text_in_parentheses(text: str) -> str: 

3446 parentheses = 0 

3447 new_text = "" 

3448 for c in text: 

3449 if c == "(": 

3450 parentheses += 1 

3451 elif c == ")": 

3452 parentheses -= 1 

3453 elif parentheses == 0: 

3454 new_text += c 

3455 return new_text
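

# Usage sketch for remove_text_in_parentheses(), derived from the loop above:
#     remove_text_in_parentheses("foo (bar) baz")  ->  "foo  baz"
# Parenthesized spans (including nested ones) are dropped along with the
# parentheses themselves; surrounding whitespace is left untouched, which is
# why two spaces remain in the example.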