Coverage for src/wiktextract/extractor/en/form_descriptions.py: 76%
1327 statements
1# Code for parsing linguistic form descriptions and tags for word senses
2# (both the word entry head - initial part and parenthesized parts -
3# and tags at the beginning of word senses)
4#
5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
7import functools
8import re
9import unicodedata
10from typing import (
11 Any,
12 Literal,
13 Optional,
14 Sequence,
15 Union,
16)
18import Levenshtein
19from nltk import TweetTokenizer # type:ignore[import-untyped]
21from ...datautils import data_append, data_extend, split_at_comma_semi
22from ...tags import (
23 alt_of_tags,
24 form_of_tags,
25 head_final_bantu_langs,
26 head_final_bantu_map,
27 head_final_numeric_langs,
28 head_final_other_langs,
29 head_final_other_map,
30 head_final_semitic_langs,
31 head_final_semitic_map,
32 uppercase_tags,
33 valid_tags,
34 xlat_descs_map,
35 xlat_head_map,
36 xlat_tags_map,
37)
38from ...topics import topic_generalize_map, valid_topics
39from ...wxr_context import WiktextractContext
40from .english_words import (
41 english_words,
42 not_english_words,
43 potentially_english_words,
44)
45from .form_descriptions_known_firsts import known_firsts
46from .taxondata import known_species
47from .type_utils import (
48 AltOf,
49 FormData,
50 LinkageData,
51 SenseData,
52 SoundData,
53 TranslationData,
54 WordData,
55)
57# Tokenizer for classify_desc()
58tokenizer = TweetTokenizer()
60# These are ignored as the value of a related form in form head.
61IGNORED_RELATED: set[str] = set(
62 [
63 "-",
64 "־",
65 "᠆",
66 "‐",
67 "‑",
68 "‒",
69 "–",
70 "—",
71 "―",
72 "−",
73 "⸺",
74 "⸻",
75 "﹘",
76 "﹣",
77 "-",
78 "?",
79 "(none)",
80 ]
81)
84# First words of unicodedata.name() that indicate scripts that cannot be
85# accepted in romanizations or english (i.e., should be considered "other"
86# in classify_desc()).
87non_latin_scripts: list[str] = [
88 "ADLAM",
89 "ARABIC",
90 "ARABIC-INDIC",
91 "ARMENIAN",
92 "BALINESE",
93 "BENGALI",
94 "BRAHMI",
95 "BRAILLE",
96 "CANADIAN",
97 "CHAKMA",
98 "CHAM",
99 "CHEROKEE",
100 "CJK",
101 "COPTIC",
102 "COUNTING ROD",
103 "CUNEIFORM",
104 "CYRILLIC",
105 "DOUBLE-STRUCK",
106 "EGYPTIAN",
107 "ETHIOPIC",
108 "EXTENDED ARABIC-INDIC",
109 "GEORGIAN",
110 "GLAGOLITIC",
111 "GOTHIC",
112 "GREEK",
113 "GUJARATI",
114 "GURMUKHI",
115 "HANGUL",
116 "HANIFI ROHINGYA",
117 "HEBREW",
118 "HIRAGANA",
119 "JAVANESE",
120 "KANNADA",
121 "KATAKANA",
122 "KAYAH LI",
123 "KHMER",
124 "KHUDAWADI",
125 "LAO",
126 "LEPCHA",
127 "LIMBU",
128 "MALAYALAM",
129 "MEETEI",
130 "MYANMAR",
131 "NEW TAI LUE",
132 "NKO",
133 "OL CHIKI",
134 "OLD PERSIAN",
135 "OLD SOUTH ARABIAN",
136 "ORIYA",
137 "OSMANYA",
138 "PHOENICIAN",
139 "SAURASHTRA",
140 "SHARADA",
141 "SINHALA",
142 "SUNDANESE",
143 "SYLOTI",
144 "TAI THAM",
145 "TAKRI",
146 "TAMIL",
147 "TELUGU",
148 "THAANA",
149 "THAI",
150 "TIBETAN",
151 "TIFINAGH",
152 "TIRHUTA",
153 "UGARITIC",
154 "WARANG CITI",
155 "YI",
156]
157non_latin_scripts_re = re.compile(
158 r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b"
159)
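# For example, unicodedata.name("п") is "CYRILLIC SMALL LETTER PE"; the word
# "CYRILLIC" matches this regexp, so classify_desc() (defined later in this
# module) treats text containing such a character as "other" rather than as a
# romanization or as English.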
161# Sanity check xlat_head_map values
162for k, v in xlat_head_map.items():
163 if v.startswith("?"):
164 v = v[1:]
165 for tag in v.split():
166        if tag not in valid_tags:
167 print(
168 "WARNING: xlat_head_map[{}] contains unrecognized tag {}".format(
169 k, tag
170 )
171 )
173# Regexp for finding nested translations from translation items (these are
174# used in, e.g., year/English/Translations/Arabic). This is actually used
175# in page.py.
176nested_translations_re = re.compile(
177 r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format(
178 "|".join(
179 re.escape(x.removeprefix("?"))
180 for x in sorted(xlat_head_map.values(), key=len, reverse=True)
181 if x and not x.startswith("class-")
182 )
183 )
184)
186# Regexp that matches head tag specifiers. Used to match tags from end of
187# translations and linkages
188head_final_re_text = r"( -)?( ({}))+".format(
189 "|".join(
190 re.escape(x)
191 for x in
192 # The sort is to put longer ones first, preferring them in
193 # the regexp match
194 sorted(xlat_head_map.keys(), key=len, reverse=True)
195 )
196)
197head_final_re = re.compile(head_final_re_text + "$")
199# Regexp used to match head tag specifiers at end of a form for certain
200# Bantu languages (particularly Swahili and similar languages).
201head_final_bantu_re_text = r" ({})".format(
202 "|".join(re.escape(x) for x in head_final_bantu_map.keys())
203)
204head_final_bantu_re = re.compile(head_final_bantu_re_text + "$")
206# Regexp used to match head tag specifiers at end of a form for certain
207# Semitic languages (particularly Arabic and similar languages).
208head_final_semitic_re_text = r" ({})".format(
209 "|".join(re.escape(x) for x in head_final_semitic_map.keys())
210)
211head_final_semitic_re = re.compile(head_final_semitic_re_text + "$")
213# Regexp used to match head tag specifiers at end of a form for certain
214# other languages (e.g., Lithuanian, Finnish, French).
215head_final_other_re_text = r" ({})".format(
216 "|".join(re.escape(x) for x in head_final_other_map.keys())
217)
218head_final_other_re = re.compile(head_final_other_re_text + "$")
220# Regexp for splitting heads. See parse_word_head().
221head_split_re_text = (
222 "("
223 + head_final_re_text
224 + "|"
225 + head_final_bantu_re_text
226 + "|"
227 + head_final_semitic_re_text
228 + "|"
229 + head_final_other_re_text
230 + ")?( or |[,;]+)"
231)
232head_split_re = re.compile(head_split_re_text)
233head_split_re_parens = 0
234for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text):
235 head_split_re_parens += m.group(0).count("(")
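# head_split_re_parens ends up counting the unescaped "(" characters, i.e. the
# capturing groups, in head_split_re_text; presumably this lets code that
# splits on head_split_re skip over the extra group entries that re.split()
# interleaves into its result.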
237# Parenthesized parts that are ignored in translations
238tr_ignored_parens: set[str] = set(
239 [
240 "please verify",
241 "(please verify)",
242 "transliteration needed",
243 "(transliteration needed)",
244 "in words with back vowel harmony",
245 "(in words with back vowel harmony)",
246 "in words with front vowel harmony",
247 "(in words with front vowel harmony)",
248 "see below",
249 "see usage notes below",
250 ]
251)
252tr_ignored_parens_re = re.compile(
253 r"^("
254 + "|".join(re.escape(x) for x in tr_ignored_parens)
255 + ")$"
256 + r"|^(Can we clean up|Can we verify|for other meanings see "
257 r"lit\. )"
258)
260# Translations that are ignored
261ignored_translations: set[str] = set(
262 [
263 "[script needed]",
264 "please add this translation if you can",
265 ]
266)
268# Put english text into the "note" field in a translation if it contains one
269# of these words
270tr_note_re = re.compile(
271 r"(\b(article|definite|indefinite|superlative|comparative|pattern|"
272 r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|"
273 r"postposition|postp|action|actions|articles|"
274 r"adverb|adverbs|noun|nouns|verb|verbs|before|"
275 r"after|placed|prefix|suffix|used with|translated|"
276 r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|"
277 r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|"
278 r"conjugation|declension|class|category|plural|singular|positive|"
279 r"seldom used|formal|informal|familiar|unspoken|spoken|written|"
280 r"indicative|progressive|conditional|potential|"
281 r"accusative|adessive|inessive|superessive|elative|allative|"
282 r"dialect|dialects|object|subject|predicate|movies|recommended|language|"
283 r"locative|continuous|simple|continuousness|gerund|subjunctive|"
284 r"periphrastically|no equivalent|not used|not always used|"
285 r"used only with|not applicable|use the|signifying|wordplay|pronounced|"
286 r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|"
287 r"may be replaced|stricter sense|for nonhumans|"
288 r"sense:|used:|in full:|informally used|followed by|"
289 r"not restricted to|pertaining to|or optionally with|are optional|"
290 r"in conjunction with|in compounds|depending on the relationship|"
291 r"person addressed|one person|multiple persons|may be replaced with|"
292 r"optionally completed with|in the phrase|in response to|"
293 r"before a|before an|preceded by|verbs ending|very common|after a verb|"
294 r"with verb|with uncountable|with the objects|with stative|"
295 r"can be replaced by|often after|used before|used after|"
296 r"used in|clipping of|spoken|somewhat|capitalized|"
297 r"short form|shortening of|shortened form|initialism of|"
298 r"said to|rare:|rarer also|is rarer|negatively connoted|"
299 r"previously mentioned|uncountable noun|countable noun|"
300 r"countable nouns|uncountable nouns|"
301 r"with predicative|with -|with imperfect|with a negated|"
302 r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|"
303 r'"|'
304 r"general term|after a vowel|before a vowel|"
305 r"form|regular|irregular|alternative)"
306 r")($|[) ])|^("
307 # Following are only matched at the beginning of the string
308 r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:|e\.g\.,|cf\.|compare|such as|"
309 r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|"
310 r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|"
311 r"mainly|from|for|also|also:|acronym|"
312 r"\+|with) "
313)
314# \b does not work at the end???
316# Related forms matching this regexp will be considered suspicious if the
317# page title does not also match one of these.
318suspicious_related_re = re.compile(
319 r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)"
320 r"|[][:=<>&#*|]"
321 r"| \d+$"
322)
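# For example, a related form consisting only of gender/number abbreviations
# such as "f" or "pl 3", or one containing wiki markup characters, matches
# this. French translations containing the word "or" ("gold") also trip the
# check, which is why several of them are whitelisted in ok_suspicious_forms
# below.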
324# Word forms (head forms, translations, etc) that will be considered ok and
325# silently accepted even if they would otherwise trigger a suspicious
326# form warning.
327ok_suspicious_forms: set[str] = set(
328 [
329 "but en or", # "golden goal"/English/Tr/French
330 "cœur en or", # "heart of gold"/Eng/Tr/French
331 "en or", # golden/Eng/Tr/French
332 "men du", # jet/Etym2/Noun/Tr/Cornish
333 "parachute en or", # "golden parachute"/Eng/Tr/French
334 "vieil or", # "old gold"/Eng/Tr/French
335 # "all that glitters is not gold"/Eng/Tr/French
336 "tout ce qui brille n’est pas or",
337 "μη αποκλειστικό or", # inclusive or/Eng/Tr/Greek
338 "period or full stop",
339 ]
340)
343# Replacements to be done in classify_desc before tokenizing. This is a
344# workaround for shortcomings in TweetTokenizer.
345tokenizer_fixup_map = {
346 r"a.m.": "AM",
347 r"p.m.": "PM",
348}
349tokenizer_fixup_re = re.compile(
350 r"\b("
351 + "|".join(
352 re.escape(x)
353 for x in sorted(
354 tokenizer_fixup_map.keys(), key=lambda x: len(x), reverse=True
355 )
356 )
357 + r")"
358)
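# Per the comment above, occurrences of "a.m." / "p.m." are rewritten to
# "AM" / "PM" in classify_desc() before tokenization, as a workaround for how
# TweetTokenizer handles the dotted forms.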
360# Unknown tags starting with these words will be silently ignored.
361ignored_unknown_starts: set[str] = set(
362 [
363 "originally",
364 "e.g.",
365 "c.f.",
366 "supplanted by",
367 "supplied by",
368 ]
369)
371ignored_unknown_starts_re = re.compile(
372 r"^("
373 + "|".join(
374 re.escape(x)
375 for x in sorted(ignored_unknown_starts, key=lambda x: -len(x))
376 )
377 + ") "
378)
380# If an unknown sequence starts with one of these, it will continue as an
381# unknown sequence until the end, unless it turns out to have a replacement.
382allowed_unknown_starts: set[str] = set(
383 [
384 "Relating",
385 "accompanied",
386 "added",
387 "after",
388 "answering",
389 "as",
390 "based",
391 "before",
392 "conjugated",
393 "conjunction",
394 "construed",
395 "especially",
396 "expression:",
397 "figurative:",
398 "followed",
399 "for",
400 "forms",
401 "from",
402 "governs",
403 "in",
404 "indicating",
405 "modifying",
406 "normally",
407 "not",
408 "of",
409 "preceding",
410 "prefixed",
411 "referring",
412 "relating",
413 "revived",
414 "said",
415 "since",
416 "takes",
417 "used",
418 "with",
419 "With",
420 "without",
421 ]
422)
423# Allow the ignored unknown starts without complaining
424allowed_unknown_starts.update(ignored_unknown_starts)
426# Full unknown tags that will be ignored in decode_tags()
427# XXX this is unused, ask Tatu where the contents are now
428ignored_unknown_tags: set[str] = set([])
430# Head endings that are mapped to tags
431head_end_map = {
432 " 1st conj.": "conjugation-1",
433 " 2nd conj.": "conjugation-2",
434 " 3rd conj.": "conjugation-3",
435 " 4th conj.": "conjugation-4",
436 " 5th conj.": "conjugation-5",
437 " 6th conj.": "conjugation-6",
438 " 7th conj.": "conjugation-7",
439}
440head_end_re = re.compile(
441 r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$"
442)
445# Dictionary of language-specific parenthesized head part starts that
446# either introduce new tags or modify previous tags. The value for each
447# language is a dictionary that maps the first word of the head part to
448# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous
449# tags or a space-separated string of tags to remove, and ``add_tags`` should
450# be a string of tags to add.
451lang_specific_head_map: dict[
452 str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]]
453] = {
454 "Danish": {
455 # prefix: (rem_tags space separate string/True, add_tags s-sep str)
456 "c": ("neuter", "common-gender"),
457 "n": ("common-gender", "neuter"),
458 "pl": ("singular neuter common-gender", "plural"),
459 "sg": ("plural neuter common-gender", "singular"),
460 },
461}
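# For example, with the Danish mapping above: a parenthesized head part whose
# first word is "pl" removes any previously collected "singular", "neuter" and
# "common-gender" tags and adds "plural"; a rem_tags value of True would mean
# "remove all previous tags".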
464# Regular expression used to strip additional stuff from the end of alt_of and
465# form_of.
466alt_of_form_of_clean_re = re.compile(
467 r"(?s)("
468 + "|".join(
469 [
470 r":",
471 r'[“"]',
472 r";",
473 r" \(",
474 r" - ",
475 r" ־ ",
476 r" ᠆ ",
477 r" ‐ ",
478 r" ‑ ",
479 r" ‒ ",
480 r" – ",
481 r" — ",
482 r" ― ",
483 r" − ",
484 r" ⸺ ",
485 r" ⸻ ",
486 r" ﹘ ",
487 r" ﹣ ",
488 r" - ",
489 r" \+ ",
490 r" \(with ",
491 r" with -ra/-re",
492 r"\. Used ",
493 r"\. Also ",
494 r"\. Since ",
495 r"\. A ",
496 r"\.\. A ",
497 r"\. An ",
498 r"\.\. An ",
499 r"\. an ",
500 r"\. The ",
501 r"\. Spanish ",
502 r"\. Language ",
503 r"\. former name of ",
504 r"\. AIM",
505 r"\. OT",
506 r"\. Not ",
507 r"\. Now ",
508 r"\. Nowadays ",
509 r"\. Early ",
510 r"\. ASEAN",
511 r"\. UN",
512 r"\. IMF",
513 r"\. WHO",
514 r"\. WIPO",
515 r"\. AC",
516 r"\. DC",
517 r"\. DNA",
518 r"\. RNA",
519 r"\. SOB",
520 r"\. IMO",
521 r"\. Behavior",
522 r"\. Income ",
523 r"\. More ",
524 r"\. Most ",
525 r"\. Only ",
526 r"\. Also ",
527 r"\. From ",
528 r"\. Of ",
529 r"\.\. Of ",
530 r"\. To ",
531 r"\. For ",
532 r"\. If ",
533 r"\. Praenominal ",
534 r"\. This ",
535 r"\. Replaced ",
536 r"\. CHCS is the ",
537 r"\. Equivalent ",
538 r"\. Initialism ",
539 r"\. Note ",
540 r"\. Alternative ",
541 r"\. Compare ",
542 r"\. Cf\. ",
543 r"\. Comparable ",
544 r"\. Involves ",
545 r"\. Sometimes ",
546 r"\. Commonly ",
547 r"\. Often ",
548 r"\. Typically ",
549 r"\. Possibly ",
550 r"\. Although ",
551 r"\. Rare ",
552 r"\. Instead ",
553 r"\. Integrated ",
554 r"\. Distinguished ",
555 r"\. Given ",
556 r"\. Found ",
557 r"\. Was ",
558 r"\. In ",
559 r"\. It ",
560 r"\.\. It ",
561 r"\. One ",
562 r"\. Any ",
563 r"\. They ",
564 r"\. Members ",
565 r"\. Each ",
566 r"\. Original ",
567 r"\. Especially ",
568 r"\. Usually ",
569 r"\. Known ",
570 r"\.\. Known ",
571 r"\. See ",
572 r"\. see ",
573 r"\. target was not ",
574 r"\. Popular ",
575 r"\. Pedantic ",
576 r"\. Positive ",
577 r"\. Society ",
578 r"\. Plan ",
579 r"\. Environmentally ",
580 r"\. Affording ",
581 r"\. Encompasses ",
582 r"\. Expresses ",
583 r"\. Indicates ",
584 r"\. Text ",
585 r"\. Large ",
586 r"\. Sub-sorting ",
587 r"\. Sax",
588 r"\. First-person ",
589 r"\. Second-person ",
590 r"\. Third-person ",
591 r"\. 1st ",
592 r"\. 2nd ",
593 r"\. 3rd ",
594 r"\. Term ",
595 r"\. Northeastern ",
596 r"\. Northwestern ",
597 r"\. Southeast ",
598 r"\. Egyptian ",
599 r"\. English ",
600 r"\. Cape Province was split into ",
601 r"\. Pañcat",
602 r"\. of the ",
603 r"\. is ",
604 r"\. after ",
605 r"\. or ",
606 r"\. chromed",
607 r"\. percussion",
608 r"\. with his ",
609 r"\. a\.k\.a\. ",
610 r"\. comparative form ",
611 r"\. singular ",
612 r"\. plural ",
613 r"\. present ",
614 r"\. his ",
615 r"\. her ",
616 r"\. equivalent ",
617 r"\. measuring ",
618 r"\. used in ",
619 r"\. cutely ",
620 r"\. Protects",
621 r'\. "',
622 r"\.^",
623 r"\. \+ ",
624 r"\., ",
625 r". — ",
626 r", a ",
627 r", an ",
628 r", the ",
629 r", obsolete ",
630 r", possessed", # 'd/English
631 r", imitating", # 1/English
632 r", derived from",
633 r", called ",
634 r", especially ",
635 r", slang for ",
636 r" corresponding to ",
637 r" equivalent to ",
638 r" popularized by ",
639 r" denoting ",
640 r" in its various senses\.",
641 r" used by ",
642 r" but not for ",
643 r" since ",
644 r" i\.e\. ",
645 r" i\. e\. ",
646 r" e\.g\. ",
647 r" eg\. ",
648 r" etc\. ",
649 r"\[http",
650 r" — used as ",
651 r" by K\. Forsyth ",
652 r" by J\. R\. Allen ",
653 r" by S\. Ferguson ",
654 r" by G\. Donaldson ",
655 r" May refer to ",
656 r" An area or region ",
657 ]
658 )
659 + r").*$"
660)
663class ValidNode:
664 """Node in the valid_sequences tree. Each node is part of a chain
665 or chains that form sequences built out of keys in key->tags
666 maps like xlat_tags, etc. The ValidNode's 'word' is the key
667    by which it is referred to in the root dict or a `children` dict,
668 `end` marks that the node is the end-terminus of a sequence (but
669 it can still continue if the sequence is shared by the start of
670 other sequences: "nominative$" and "nominative plural$" for example),
671 `tags` and `topics` are the dicts containing tag and topic strings
672 for terminal nodes (end==True)."""
674 __slots__ = (
675 "end",
676 "tags",
677 "topics",
678 "children",
679 )
681 def __init__(
682 self,
683 end=False,
684 tags: Optional[list[str]] = None,
685 topics: Optional[list[str]] = None,
686 children: Optional[dict[str, "ValidNode"]] = None,
687 ) -> None:
688 self.end = end
689 self.tags: list[str] = tags or []
690 self.topics: list[str] = topics or []
691 self.children: dict[str, "ValidNode"] = children or {}
694def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None:
695 """Helper function for building trees of valid tags/sequences during
696 initialization."""
697 assert isinstance(tree, ValidNode)
698 assert isinstance(desc, str)
699 assert v is None or isinstance(v, str)
700 node = tree
702 # Build the tree structure: each node has children nodes
703 # whose names are denoted by their dict key.
704 for w in desc.split(" "):
705 if w in node.children:
706 node = node.children[w]
707 else:
708 new_node = ValidNode()
709 node.children[w] = new_node
710 node = new_node
711 if not node.end:
712 node.end = True
713 if not v:
714 return None # Terminate early because there are no tags
716 tagslist = []
717 topicslist = []
718 for vv in v.split():
719 if vv in valid_tags:
720 tagslist.append(vv)
721        elif vv in valid_topics:
722 topicslist.append(vv)
723 else:
724 print(
725 "WARNING: tag/topic {!r} maps to unknown {!r}".format(desc, vv)
726 )
727 topics = " ".join(topicslist)
728 tags = " ".join(tagslist)
729 # Changed to "_tags" and "_topics" to avoid possible key-collisions.
730 if topics:
731 node.topics.extend([topics])
732 if tags:
733 node.tags.extend([tags])
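# A small illustration of the resulting tree shape (comment only, assuming
# both "nominative" and "plural" are in valid_tags):
#     add_to_valid_tree(tree, "nominative", "nominative")
#     add_to_valid_tree(tree, "nominative plural", "nominative plural")
# leaves tree.children["nominative"] as an end node with tags == ["nominative"],
# and its children["plural"] as a further end node with
# tags == ["nominative plural"] (each .tags entry is a single space-joined
# string, per the extend([tags]) call above).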
736def add_to_valid_tree1(
737 tree: ValidNode,
738 k: str,
739 v: Union[list[str], tuple[str, ...], str],
740 valid_values: Union[set[str], dict[str, Any]],
741) -> list[str]:
742 assert isinstance(tree, ValidNode)
743 assert isinstance(k, str)
744 assert v is None or isinstance(v, (list, tuple, str))
745 assert isinstance(valid_values, (set, dict))
746    if not v:
747 add_to_valid_tree(valid_sequences, k, None)
748 return []
749 elif isinstance(v, str):
750 v = [v]
751 q = []
752 for vv in v:
753 assert isinstance(vv, str)
754 add_to_valid_tree(valid_sequences, k, vv)
755 vvs = vv.split()
756 for x in vvs:
757 q.append(x)
758 # return each individual tag
759 return q
762def add_to_valid_tree_mapping(
763 tree: ValidNode,
764 mapping: Union[dict[str, Union[list[str], str]], dict[str, str]],
765 valid_values: Union[set[str], dict[str, Any]],
766 recurse: bool,
767) -> None:
768 assert isinstance(tree, ValidNode)
769 assert isinstance(mapping, dict)
770 assert isinstance(valid_values, (set, dict))
771 assert recurse in (True, False)
772 for k, v in mapping.items():
773 assert isinstance(k, str)
774 assert isinstance(v, (list, str))
775 if isinstance(v, str):
776 q = add_to_valid_tree1(tree, k, [v], valid_values)
777 else:
778 q = add_to_valid_tree1(tree, k, v, valid_values)
779 if recurse:
780 visited = set()
781 while q:
782 v = q.pop()
783 if v in visited:
784 continue
785 visited.add(v)
786 if v not in mapping:
787 continue
788 vv = mapping[v]
789 qq = add_to_valid_tree1(tree, k, vv, valid_values)
790 q.extend(qq)
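# Note on recurse=True: the mapping is applied transitively. If key k maps to
# a tag v and v is itself a key of the same mapping, then mapping[v] is also
# added under k, and so on. This is what lets topic_generalize_map (used with
# recurse=True below) attach the more general topics to each specific topic.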
793# Tree of sequences considered to be tags (includes sequences that are
794# mapped to something that becomes one or more valid tags)
795valid_sequences = ValidNode()
796sequences_with_slashes: set[str] = set()
797for tag in valid_tags:
798    # The basic tags used in our tag system; some are a bit weird, but it is
799    # easier to implement this with 'false' positives than to filter out stuff
800    # no one else uses.
801 if "/" in tag:
802 sequences_with_slashes.add(tag)
803 add_to_valid_tree(valid_sequences, tag, tag)
804for tag in uppercase_tags:
805 hyphenated = re.sub(r"\s+", "-", tag)
806    if hyphenated in valid_tags:
807 print(
808 "DUPLICATE TAG: {} (from uppercase tag {!r})".format(
809 hyphenated, tag
810 )
811 )
812 assert hyphenated not in valid_tags
813 # Might as well, while we're here: Add hyphenated location tag.
814 valid_tags[hyphenated] = "dialect"
815 add_to_valid_tree(valid_sequences, hyphenated, hyphenated)
816for tag in uppercase_tags:
817 hyphenated = re.sub(r"\s+", "-", tag)
818 # XXX Move to above loop? Or is this here for readability?
819 if "/" in tag:
820 sequences_with_slashes.add(tag)
821 add_to_valid_tree(valid_sequences, tag, hyphenated)
822# xlat_tags_map!
823add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False)
824for k in xlat_tags_map:
825 if "/" in k:
826 sequences_with_slashes.add(k)
827# Add topics to the same table, with all generalized topics also added
828for topic in valid_topics:
829 assert " " not in topic
830    if "/" in topic:
831 sequences_with_slashes.add(topic)
832 add_to_valid_tree(valid_sequences, topic, topic)
833# Let each original topic value stand alone. These are not generally on
834# valid_topics. We add the original topics with spaces replaced by hyphens.
835for topic in topic_generalize_map.keys():
836 hyphenated = topic.replace(" ", "-")
837 valid_topics.add(hyphenated)
838    if "/" in topic:
839        sequences_with_slashes.add(topic)
840 add_to_valid_tree(valid_sequences, topic, hyphenated)
841# Add canonicalized/generalized topic values
842add_to_valid_tree_mapping(
843 valid_sequences, topic_generalize_map, valid_topics, True
844)
846# Regex used to divide a decode candidate into parts that shouldn't
847# have their slashes turned into spaces
848slashes_re = re.compile(
849 r"(" + "|".join((re.escape(s) for s in sequences_with_slashes)) + r")"
850)
852# Regexp used to find "words" from word heads and linguistic descriptions
853word_pattern = (
854 r"[^ ,;()\u200e]+|"
855 r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|"
856 r"[\u2800-\u28ff]|" # Braille characters
857 r"\(([^()]|\([^()]*\))*\)"
858)
860word_re_global = re.compile(word_pattern)
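# Roughly, this tokenizes a head into words: "word (sense) another" yields
# "word", "(sense)" and "another"; a parenthesized prefix glued to a word,
# e.g. "(un)done", stays a single token, and each Braille cell is its own
# token.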
863def distw(titleparts: Sequence[str], word: str) -> float:
864 """Computes how distinct ``word`` is from the most similar word in
865 ``titleparts``. Returns 1 if words completely distinct, 0 if
866 identical, or otherwise something in between."""
867 assert isinstance(titleparts, (list, tuple))
868 assert isinstance(word, str)
869 w = min(
870 Levenshtein.distance(word, tw) / max(len(tw), len(word))
871 for tw in titleparts
872 )
873 return w
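# Illustrative values (comment only; Levenshtein.distance("best", "test") is 1
# and both words have four characters):
#     distw(["test"], "test")  -> 0.0    (identical)
#     distw(["test"], "best")  -> 0.25   (one edit / four characters)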
876def map_with(
877 ht: Union[dict[str, Union[str, list[str]]], dict[str, str]],
878 lst: Sequence[str],
879) -> list[str]:
880 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or
881 more alternatives each, and returns a combined list of alternatives."""
882 assert isinstance(ht, dict)
883 assert isinstance(lst, (list, tuple))
884 ret = []
885 for x in lst:
886 assert isinstance(x, str)
887 x = x.strip()
888 x = ht.get(x, x)
889        if isinstance(x, str):
890            if x:
891 ret.append(x)
892 elif isinstance(x, (list, tuple)):
893 ret.extend(x)
894 else:
895 raise RuntimeError("map_with unexpected value: {!r}".format(x))
896 return ret
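# For example (the mapping here is made up for illustration):
#     map_with({"m": ["masculine"], "f": "feminine"}, ["m", "f", "x"])
#     -> ["masculine", "feminine", "x"]
# i.e. unmapped items pass through unchanged.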
899TagList = list[str]
900PosPathStep = tuple[int, TagList, TagList]
903def check_unknown(
904 from_i: int,
905 to_i: int,
906 i: int,
907 wordlst: Sequence[str],
908 allow_any: bool,
909 no_unknown_starts: bool,
910) -> list[PosPathStep]:
911 """Check if the current section from_i->to_i is actually unknown
912 or if it needs some special handling. We already presupposed that
913 this is UNKNOWN; this is just called to see what *kind* of UNKNOWN."""
914 assert isinstance(to_i, int)
915 assert isinstance(from_i, int)
916 assert isinstance(i, int)
917 # Adds unknown tag if needed. Returns new last_i
918 # print("check_unknown to_i={} from_i={} i={}"
919 # .format(to_i, from_i, i))
920 if from_i >= to_i:
921 return []
922 words = wordlst[from_i:to_i]
923 tag = " ".join(words)
924 assert tag
925 # print(f"{tag=}")
926 if re.match(ignored_unknown_starts_re, tag):
927 # Tags with this start are to be ignored
928 return [(from_i, ["UNKNOWN"], [])]
929    if tag in ignored_unknown_tags:
930 return [] # One of the tags listed as to be ignored
931 if tag in ("and", "or"):
932 return []
933 if (
934 not allow_any
935 and not words[0].startswith("~")
936 and (
937 no_unknown_starts
938 or words[0] not in allowed_unknown_starts
939 or len(words) <= 1
940 )
941 ):
942 # print("ERR allow_any={} words={}"
943 # .format(allow_any, words))
944 return [
945 (from_i, ["UNKNOWN"], ["error-unknown-tag"])
946 ] # Add ``tag`` here to include
947 else:
948 return [(from_i, ["UNKNOWN"], [tag])]
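# Summary of the return shapes above: an ignored start yields an UNKNOWN step
# with no payload; text listed as ignorable (or a bare "and"/"or") yields no
# step at all; disallowed unknown text yields an UNKNOWN step carrying
# "error-unknown-tag"; otherwise the raw text itself is carried along in the
# last slot of the step.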
951def add_new1(
952 node: ValidNode,
953 i: int,
954 start_i: int,
955 last_i: int,
956 new_paths: list[list[PosPathStep]],
957 new_nodes: list[tuple[ValidNode, int, int]],
958 pos_paths: list[list[list[PosPathStep]]],
959 wordlst: list[str],
960 allow_any: bool,
961 no_unknown_starts: bool,
962 max_last_i: int,
963) -> int:
964 assert isinstance(new_paths, list)
965 # print("add_new: start_i={} last_i={}".format(start_i, last_i))
966 # print("$ {} last_i={} start_i={}"
967 # .format(w, last_i, start_i))
968 max_last_i = max(max_last_i, last_i) # if last_i has grown
969 if (node, start_i, last_i) not in new_nodes:
970 new_nodes.append((node, start_i, last_i))
971 if node.end:
972 # We can see a terminal point in the search tree.
973 u = check_unknown(
974 last_i, start_i, i, wordlst, allow_any, no_unknown_starts
975 )
976 # Create new paths candidates based on different past possible
977 # paths; pos_path[last_i] contains possible paths, so add this
978 # new one at the beginning(?)
979 # The list comprehension inside the parens generates an iterable
980 # of lists, so this is .extend( [(last_i...)], [(last_i...)], ... )
981 # XXX: this is becoming impossible to annotate, nodes might
982 # need to become classed objects and not just dicts, or at least
983 # a TypedDict with a "children" node
984 new_paths.extend(
985 [(last_i, node.tags, node.topics)] + u + x
986 for x in pos_paths[last_i]
987 )
988 max_last_i = i + 1
989 return max_last_i
992@functools.lru_cache(maxsize=65536)
993def decode_tags(
994 src: str,
995 allow_any=False,
996 no_unknown_starts=False,
997) -> tuple[list[tuple[str, ...]], list[str]]:
998 tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts)
999 # print(f"decode_tags: {src=}, {tagsets=}")
1001 # Insert retry-code here that modifies the text source
1002 if (
1003 any(s.startswith("error-") for tagset in tagsets for s in tagset)
1004 # I hate Python's *nested* list comprehension syntax ^
1005 or any(s.startswith("error-") for s in topics)
1006 ):
1007 new_tagsets: list[tuple[str, ...]] = []
1008 new_topics: list[str] = []
1010 if "/" in src:
1011 # slashes_re contains valid key entries with slashes; we're going
1012 # to skip them by splitting the string and skipping handling every
1013 # second entry, which contains the splitting group like "masculine/
1014 # feminine" style keys.
1015 split_parts = re.split(slashes_re, src)
1016 new_parts: list[str] = []
1017 if len(split_parts) > 1:
1018 for i, s in enumerate(split_parts):
1019 if i % 2 == 0:
1020 new_parts.append(s.replace("/", " "))
1021 else:
1022 new_parts.append(s)
1023 new_src = "".join(new_parts)
1024 else:
1025 new_src = src
1026 new_tagsets, new_topics = decode_tags1(
1027 new_src, allow_any, no_unknown_starts
1028 )
1029 elif " or " in src or " and " in src:
1030 # Annoying kludge.
1031 new_src = src.replace(" and ", " ")
1032 new_src = new_src.replace(" or ", " ")
1033 new_tagsets, new_topics = decode_tags1(
1034 new_src, allow_any, no_unknown_starts
1035 )
1036 # print(f"{new_tagsets=}")
1038 if new_tagsets or new_topics:
1039 old_errors = sum(
1040 1 for tagset in tagsets for s in tagset if s.startswith("error")
1041 )
1042 old_errors += sum(1 for s in topics if s.startswith("error"))
1043 new_errors = sum(
1044 1
1045 for new_tagset in new_tagsets
1046 for s in new_tagset
1047 if s.startswith("error")
1048 )
1049 new_errors += sum(1 for s in new_topics if s.startswith("error"))
1051            if new_errors <= old_errors:
1052 return new_tagsets, new_topics
1054 return tagsets, topics
1057def decode_tags1(
1058 src: str,
1059 allow_any=False,
1060 no_unknown_starts=False,
1061) -> tuple[list[tuple[str, ...]], list[str]]:
1062 """Decodes tags, doing some canonicalizations. This returns a list of
1063 lists of tags and a list of topics."""
1064 assert isinstance(src, str)
1066 # print("decode_tags: src={!r}".format(src))
1068 pos_paths: list[list[list[PosPathStep]]] = [[[]]]
1069 wordlst: list[str] = []
1070 max_last_i = 0 # pre-initialized here so that it can be used as a ref
1072 add_new = functools.partial(
1073 add_new1, # pre-set parameters and references for function
1074 pos_paths=pos_paths,
1075 wordlst=wordlst,
1076 allow_any=allow_any,
1077 no_unknown_starts=no_unknown_starts,
1078 max_last_i=max_last_i,
1079 )
1080 # First split the tags at commas and semicolons. Their significance is that
1081 # a multi-word sequence cannot continue across them.
1082 parts = split_at_comma_semi(src, extra=[";", ":"])
1084 for part in parts:
1085 max_last_i = len(wordlst) # "how far have we gone?"
1086 lst1 = part.split()
1087 if not lst1:
1088 continue
1089 wordlst.extend(lst1)
1090 cur_nodes: list[tuple[ValidNode, int, int]] = [] # Currently seen
1091 for w in lst1:
1092 i = len(pos_paths) - 1
1093 new_nodes: list[tuple[ValidNode, int, int]] = []
1094 # replacement nodes for next loop
1095 new_paths: list[list[PosPathStep]] = []
1096 # print("ITER i={} w={} max_last_i={} wordlst={}"
1097 # .format(i, w, max_last_i, wordlst))
1098 node: ValidNode
1099 start_i: int
1100 last_i: int
1101 for node, start_i, last_i in cur_nodes:
1102 # ValidNodes are part of a search tree that checks if a
1103 # phrase is found in xlat_tags_map and other text->tags dicts.
1104 if w in node.children:
1105 # the phrase continues down the tree
1106 # print("INC", w)
1107 max_last_i = add_new(
1108 node.children[w],
1109 i,
1110 start_i,
1111 last_i,
1112 new_paths,
1113 new_nodes,
1114 )
1115 if node.end:
1116 # we've hit an end point, the tags and topics have already
1117 # been gathered at some point, don't do anything with the
1118 # old stuff
1119 if w in valid_sequences.children:
1120 # This starts a *new* possible section
1121 max_last_i = add_new(
1122 valid_sequences.children[w], # root->
1123 i,
1124 i,
1125 i,
1126 new_paths,
1127 new_nodes,
1128 )
1129 if w not in node.children and not node.end:
1130 # print("w not in node and $: i={} last_i={} wordlst={}"
1131 # .format(i, last_i, wordlst))
1132 # If i == last_i == 0, for example (beginning)
1133 if (
1134 i == last_i
1135 or no_unknown_starts
1136 or wordlst[last_i] not in allowed_unknown_starts
1137 ):
1138 # print("NEW", w)
1139 if w in valid_sequences.children:
1140 # Start new sequences here
1141 max_last_i = add_new(
1142 valid_sequences.children[w],
1143 i,
1144 i,
1145 last_i,
1146 new_paths,
1147 new_nodes,
1148 )
1149 if not new_nodes:
1150 # This is run at the start when i == max_last_i == 0,
1151 # which is what populates the first node in new_nodes.
1152 # Some initial words cause the rest to be interpreted as unknown
1153 # print("not new nodes: i={} last_i={} wordlst={}"
1154 # .format(i, max_last_i, wordlst))
1155 if (
1156 i == max_last_i
1157 or no_unknown_starts
1158 or wordlst[max_last_i] not in allowed_unknown_starts
1159 ):
1160 # print("RECOVER w={} i={} max_last_i={} wordlst={}"
1161 # .format(w, i, max_last_i, wordlst))
1162 if w in valid_sequences.children:
1163 max_last_i = add_new(
1164 # new sequence from root
1165 valid_sequences.children[w],
1166 i,
1167 i,
1168 max_last_i,
1169 new_paths,
1170 new_nodes,
1171 )
1172 cur_nodes = new_nodes # Completely replace nodes!
1173 # 2023-08-18, fix to improve performance
1174 # Decode tags does a big search of the best-shortest matching
1175 # sequences of tags, but the original algorithm didn't have
1176 # any culling happen during operation, so in a case with
1177 # a lot of tags (for example, big blocks of text inserted
1178 # somewhere by mistake that is processed by decode_tags),
1179 # it would lead to exponential growth of new_paths contents.
1180 # This culling, using the same weighting algorithm code as
1181 # in the original is just applied to new_paths before it is
1182 # added to pos_paths. Basically it's "take the 10 best paths".
1183 # This *can* cause bugs if it gets stuck in a local minimum
1184 # or something, but this whole process is one-dimensional
1185 # and not that complex, so hopefully it works out...
1186 pw = []
1187 path: list[PosPathStep]
1188 for path in new_paths:
1189 weight = len(path)
1190 if any(x[1] == ["UNKNOWN"] for x in path):
1191 weight += 100 # Penalize unknown paths
1192 pw.append((weight, path))
1193 new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]]
1194 pos_paths.append(new_paths)
1196 # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}"
1197 # .format(max_last_i, len(wordlst), len(pos_paths)))
1199 if cur_nodes:
1200 # print("END HAVE_NODES")
1201 for node, start_i, last_i in cur_nodes:
1202 if node.end:
1203 # print("$ END start_i={} last_i={}"
1204 # .format(start_i, last_i))
1205 for path in pos_paths[start_i]:
1206 pos_paths[-1].append(
1207 [(last_i, node.tags, node.topics)] + path
1208 )
1209 else:
1210 # print("UNK END start_i={} last_i={} wordlst={}"
1211 # .format(start_i, last_i, wordlst))
1212 u = check_unknown(
1213 last_i,
1214 len(wordlst),
1215 len(wordlst),
1216 wordlst,
1217 allow_any,
1218 no_unknown_starts,
1219 )
1220 if pos_paths[start_i]:
1221 for path in pos_paths[start_i]:
1222 pos_paths[-1].append(u + path)
1223 else:
1224 pos_paths[-1].append(u)
1225 else:
1226 # Check for a final unknown tag
1227 # print("NO END NODES max_last_i={}".format(max_last_i))
1228 paths = pos_paths[max_last_i] or [[]]
1229 u = check_unknown(
1230 max_last_i,
1231 len(wordlst),
1232 len(wordlst),
1233 wordlst,
1234 allow_any,
1235 no_unknown_starts,
1236 )
1237 if u:
1238 # print("end max_last_i={}".format(max_last_i))
1239 for path in list(paths): # Copy in case it is the last pos
1240 pos_paths[-1].append(u + path)
1242 # import json
1243 # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True))
1245 if not pos_paths[-1]:
1246 # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src))
1247 return [], []
1249 # Find the best path
1250 pw = []
1251 for path in pos_paths[-1]:
1252 weight = len(path)
1253 if any(x[1] == ["UNKNOWN"] for x in path):
1254 weight += 100 # Penalize unknown paths
1255 pw.append((weight, path))
1256 path = min(pw)[1]
1258 # Convert the best path to tagsets and topics
1259 tagsets: list[list[str]] = [[]]
1260 topics: list[str] = []
1261 for i, tagspec, topicspec in path:
1262 if len(tagsets or "") > 16:
1263 # ctx.error("Too many tagsets! This is probably exponential",
1264 # sortid="form_descriptions/20230818")
1265 return [("error-unknown-tag", "error-exponential-tagsets")], []
1266 if tagspec == ["UNKNOWN"]:
1267 new_tagsets = []
1268 for x in tagsets:
1269 new_tagsets.append(x + topicspec)
1270 tagsets = new_tagsets
1271 continue
1272 if tagspec:
1273 new_tagsets = []
1274 for x in tagsets:
1275 for t in tagspec:
1276                    if t:
1277 new_tags = list(x)
1278 for tag in t.split():
1279 if tag not in new_tags:
1280 new_tags.append(tag)
1281 new_tagsets.append(new_tags)
1282 else:
1283 new_tagsets.append(x)
1284 tagsets = new_tagsets
1285 if topicspec:
1286 for t in topicspec:
1287 for topic in t.split():
1288 if topic not in topics:
1289 topics.append(topic)
1291 # print("unsorted tagsets:", tagsets)
1292 ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets))
1293 # topics = list(sorted(set(topics))) XXX tests expect not sorted
1294 # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics))
1295 # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST
1296 # of tags. Turning topics into a tuple breaks tests, turning the tuples
1297 # inside tagsets into lists breaks tests, I'm leaving them mismatched
1298 # for now. XXX
1299 return ret_tagsets, topics
1302def parse_head_final_tags(
1303 wxr: WiktextractContext, lang: str, form: str
1304) -> tuple[str, list[str]]:
1305 """Parses tags that are allowed at the end of a form head from the end
1306 of the form. This can also be used for parsing the final gender etc tags
1307 from translations and linkages."""
1308 assert isinstance(wxr, WiktextractContext)
1309 assert isinstance(lang, str) # Should be language that "form" is for
1310 assert isinstance(form, str)
1312 # print("parse_head_final_tags: lang={} form={!r}".format(lang, form))
1314 # Make sure there are no double spaces in the form as this code does not
1315 # handle them otherwise.
1316 form = re.sub(r"\s+", " ", form.strip())
1317 if not form:
1318 return form, []
1320 origform = form
1322 tags = []
1324 # If parsing for certain Bantu languages (e.g., Swahili), handle
1325 # some extra head-final tags first
1326 if lang in head_final_bantu_langs:
1327 m = re.search(head_final_bantu_re, form)
1328 if m is not None:
1329 tagkeys = m.group(1)
1330            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
1331 form = form[: m.start()]
1332 v = head_final_bantu_map[tagkeys]
1333                if v.startswith("?"):
1334 v = v[1:]
1335 wxr.wtp.debug(
1336 "suspicious suffix {!r} in language {}: {}".format(
1337 tagkeys, lang, origform
1338 ),
1339 sortid="form_descriptions/1028",
1340 )
1341 tags.extend(v.split())
1343 # If parsing for certain Semitic languages (e.g., Arabic), handle
1344 # some extra head-final tags first
1345 if lang in head_final_semitic_langs:
1346 m = re.search(head_final_semitic_re, form)
1347 if m is not None:
1348 tagkeys = m.group(1)
1349            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
1350 form = form[: m.start()]
1351 v = head_final_semitic_map[tagkeys]
1352                if v.startswith("?"):
1353 v = v[1:]
1354 wxr.wtp.debug(
1355 "suspicious suffix {!r} in language {}: {}".format(
1356 tagkeys, lang, origform
1357 ),
1358 sortid="form_descriptions/1043",
1359 )
1360 tags.extend(v.split())
1362 # If parsing for certain other languages (e.g., Lithuanian,
1363 # French, Finnish), handle some extra head-final tags first
1364 if lang in head_final_other_langs:
1365 m = re.search(head_final_other_re, form)
1366 if m is not None:
1367 tagkeys = m.group(1)
1368            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
1369 form = form[: m.start()]
1370 tags.extend(head_final_other_map[tagkeys].split(" "))
1372 # Handle normal head-final tags
1373 m = re.search(head_final_re, form)
1374 if m is not None:
1375 tagkeys = m.group(3)
1376 # Only replace tags ending with numbers in languages that have
1377 # head-final numeric tags (e.g., Bantu classes); also, don't replace
1378 # tags if the main title ends with them (then presume they are part
1379 # of the word)
1380 # print("head_final_tags form={!r} tagkeys={!r} lang={}"
1381 # .format(form, tagkeys, lang))
1382 tagkeys_contains_digit = re.search(r"\d", tagkeys)
1383 if (
1384 (not tagkeys_contains_digit or lang in head_final_numeric_langs)
1385 and not wxr.wtp.title.endswith(" " + tagkeys) # type:ignore[union-attr]
1386 and
1387 # XXX the above test does not capture when the whole word is a
1388 # xlat_head_map key, so I added the below test to complement
1389 # it; does this break anything?
1390 not wxr.wtp.title == tagkeys
1391 ): # defunct/English,
1392 # "more defunct" -> "more" ["archaic"]
1393            if not tagkeys_contains_digit or lang in head_final_numeric_langs:
1394 form = form[: m.start()]
1395 v = xlat_head_map[tagkeys]
1396                if v.startswith("?"):
1397 v = v[1:]
1398 wxr.wtp.debug(
1399 "suspicious suffix {!r} in language {}: {}".format(
1400 tagkeys, lang, origform
1401 ),
1402 sortid="form_descriptions/1077",
1403 )
1404 tags.extend(v.split())
1406 # Generate warnings about words ending in " or" after processing
1407 if (
1408 (form.endswith(" or") and not origform.endswith(" or"))
1409 or re.search(
1410 r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|"
1411 r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)"
1412 r"($|/| (f|m|sg|pl|anim|inan))",
1413 form,
1414 )
1415 or form.endswith(" du")
1416 ):
1417 if form not in ok_suspicious_forms:
1418 wxr.wtp.debug(
1419 "suspicious unhandled suffix in {}: {!r}, originally {!r}".format(
1420 lang, form, origform
1421 ),
1422 sortid="form_descriptions/1089",
1423 )
1425 # print("parse_head_final_tags: form={!r} tags={}".format(form, tags))
1426 return form, tags
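# A hedged example (assuming "m" is an xlat_head_map key mapping to
# "masculine"; that table is defined elsewhere): for a French linkage
# "chat m", head_final_re matches the trailing " m", so this returns
# ("chat", ["masculine"]) provided the page title does not end in " m" and is
# not itself "m".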
1429def quote_kept_parens(s: str) -> str:
1430 """Changes certain parenthesized expressions so that they won't be
1431 interpreted as parentheses. This is used for parts that are kept as
1432    part of the word, such as "rear admiral (upper half)"."""
1433 return re.sub(
1434 r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|"
1435 r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)",
1436 r"__lpar__\1__rpar__",
1437 s,
1438 )
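# For example, "rear admiral (upper half)" becomes
# "rear admiral __lpar__upper half__rpar__", so the parenthesized part is not
# stripped together with ordinary parenthesized descriptions and can later be
# restored by unquote_kept_parens() below.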
1441def quote_kept_ruby(
1442 wxr: WiktextractContext,
1443 ruby_tuples: list[
1444 tuple[
1445 str,
1446 str,
1447 ]
1448 ],
1449 s: str,
1450) -> str:
1451 if len(ruby_tuples) < 1:
1452 wxr.wtp.debug(
1453 "quote_kept_ruby called with no ruby",
1454 sortid="form_description/1114/20230517",
1455 )
1456 return s
1457 ks = []
1458 rs = []
1459 for k, r in ruby_tuples:
1460 ks.append(re.escape(k))
1461 rs.append(re.escape(r))
1462 if not (ks and rs):
1463 wxr.wtp.debug(
1464 f"empty column in ruby_tuples: {ruby_tuples}",
1465 sortid="form_description/1124/20230606",
1466 )
1467 return s
1468 newm = re.compile(
1469 r"({})\s*\(\s*({})\s*\)".format("|".join(ks), "|".join(rs))
1470 )
1471 rub_re = re.compile(
1472 r"({})".format(
1473 r"|".join(
1474 r"{}\(*{}\)*".format(
1475 re.escape(k),
1476 re.escape(r),
1477 )
1478 for k, r in ruby_tuples
1479 )
1480 )
1481 )
1483 def paren_replace(m: re.Match) -> str:
1484 return re.sub(newm, r"\1__lrub__\2__rrub__", m.group(0))
1486 return re.sub(rub_re, paren_replace, s)
1489def unquote_kept_parens(s: str) -> str:
1490    """Converts the quoted parentheses back to normal parentheses."""
1491 return re.sub(r"__lpar__(.*?)__rpar__", r"(\1)", s)
1494def add_romanization(
1495 wxr: WiktextractContext,
1496 data: WordData,
1497 roman: str,
1498 text: str,
1499 is_reconstruction: bool,
1500 head_group: Optional[int],
1501 ruby: Sequence[tuple[str, str]],
1502) -> None:
1503 tags_lst = ["romanization"]
1504 m = re.match(r"([^:]+):(.+)", roman)
1505 # This function's purpose is to intercept broken romanizations,
1506 # like "Yale: hēnpyeng" style tags. Most romanization styles
1507 # are already present as tags, so we can use decode_tags to find
1508 # them.
1509    if m:
1510 tagsets, topics = decode_tags(m.group(1))
1511 if tagsets:
1512 for tags in tagsets:
1513 tags_lst.extend(tags)
1514 roman = m.group(2)
1515 add_related(
1516 wxr,
1517 data,
1518 tags_lst,
1519 [roman],
1520 text,
1521 True,
1522 is_reconstruction,
1523 head_group,
1524 ruby,
1525 )
1528def add_related(
1529 wxr: WiktextractContext,
1530 data: WordData,
1531 tags_lst: Union[list[str], tuple[str, ...]],
1532 related_list: list[str],
1533 origtext: str,
1534 add_all_canonicals: bool,
1535 is_reconstruction: bool,
1536 head_group: Optional[int],
1537 ruby_data: Optional[Sequence[tuple[str, str]]] = None,
1538) -> Optional[list[tuple[str, ...]]]:
1539 """Internal helper function for some post-processing entries for related
1540    forms (e.g., in word head). This returns a list of lists of tags to be
1541 added to following related forms or None (cf. walrus/English word head,
1542 parenthesized part starting with "both")."""
1543 assert isinstance(wxr, WiktextractContext)
1544 assert isinstance(tags_lst, (list, tuple))
1545 for x in tags_lst:
1546 assert isinstance(x, str)
1547 assert isinstance(related_list, (list, tuple))
1548 assert isinstance(origtext, str)
1549 assert add_all_canonicals in (True, False)
1550 assert isinstance(ruby_data, (list, tuple)) or ruby_data is None
1551    if ruby_data is None:
1552 ruby_data = []
1553 related = " ".join(related_list)
1554 # print("add_related: tags_lst={} related={}".format(tags_lst, related))
1555    if related == "[please provide]":
1556 return None
1557    if related in IGNORED_RELATED:
1558 return None
1559 if is_reconstruction and related.startswith("*") and len(related) > 1:
1560 related = related[1:]
1562 # Get title word, with any reconstruction prefix removed
1563 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title) # type:ignore[arg-type]
1565 def check_related(related: str) -> None:
1566 # Warn about some suspicious related forms
1567 m = re.search(suspicious_related_re, related)
1568 if (m and m.group(0) not in titleword) or (
1569 related in ("f", "m", "n", "c") and len(titleword) >= 3
1570 ):
1571            if "eumhun" in tags_lst:
1572 return
1573            if "cangjie-input" in tags_lst:
1574 return
1575            if "class" in tags_lst:
1576 return
1577            if wxr.wtp.section == "Korean" and re.search(
1578 r"^\s*\w*>\w*\s*$", related
1579 ):
1580 # ignore Korean "i>ni" / "라>나" values
1581 return
1582            if (
1583 wxr.wtp.section == "Burmese"
1584 and "romanization" in tags_lst
1585 and re.search(r":", related)
1586 ):
1587            # ignore Burmese with ":", which is used in Burmese
1588            # transliteration of "း", the high-tone visarga.
1589 return
1590 wxr.wtp.debug(
1591 "suspicious related form tags {}: {!r} in {!r}".format(
1592 tags_lst, related, origtext
1593 ),
1594 sortid="form_descriptions/1147",
1595 )
1597 following_tagsets = None # Tagsets to add to following related forms
1598 roman = None
1599 tagsets1: list[tuple[str, ...]] = [tuple()]
1600 topics1: list[str] = []
1602 m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related)
1603 if m:
1604 paren = m.group(1)
1605 related = related[m.end() :]
1606 m = re.match(r"^(all|both) (.*)", paren)
1607        if m:
1608 tagsets1, topics1 = decode_tags(m.group(2))
1609 following_tagsets = tagsets1
1610 else:
1611 tagsets1, topics1 = decode_tags(paren)
1612 else:
1613 m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related)
1614 if m:
1615 paren = m.group(1)
1616            if paren.startswith("U+"):
1617 related = related[: m.start()]
1618 else:
1619 cls = classify_desc(paren)
1620                if (
1621 cls in ("romanization", "english")
1622 and classify_desc(related[: m.start()]) == "other"
1623 ):
1624 roman = paren
1625 related = related[: m.start()]
1626 else:
1627 related = related[: m.start()]
1628 tagsets1, topics1 = decode_tags(paren)
1629    if related and related.startswith("{{"):
1630 wxr.wtp.debug(
1631 "{{ in word head form - possible Wiktionary error: {!r}".format(
1632 related
1633 ),
1634 sortid="form_descriptions/1177",
1635 )
1636 return None # Likely Wiktionary coding error
1637 related = unquote_kept_parens(related)
1638 # Split related by "/" (e.g., grande/Spanish) superlative in head
1639 # Do not split if / in word title, see π//Japanese
1640 if len(related) > 5 and "/" not in wxr.wtp.title: # type:ignore[operator]
1641 alts = split_at_comma_semi(related, separators=["/"])
1642 else:
1643 alts = [related]
1644    if ruby_data:
1645 # prepare some regex stuff in advance
1646 ks, rs = [], []
1647 for k, r in ruby_data:
1648 ks.append(re.escape(k))
1649 rs.append(re.escape(r))
1650 splitter = r"((?:{})__lrub__(?:{})__rrub__)".format(
1651 "|".join(ks), "|".join(rs)
1652 )
1653 for related in alts:
1654 ruby: list[tuple[str, str]] = []
1655        if ruby_data:
1656 new_related = []
1657 rub_split = re.split(splitter, related)
1658 for s in rub_split:
1659 m = re.match(r"(.+)__lrub__(.+)__rrub__", s)
1660 if m:
1661 # add ruby with (\1, \2)
1662 ruby.append((m.group(1), m.group(2)))
1663 new_related.append(m.group(1))
1664 else:
1665 new_related.append(s)
1666 related = "".join(new_related)
1667 tagsets2, topics2 = decode_tags(" ".join(tags_lst))
1668 for tags1 in tagsets1:
1669 assert isinstance(tags1, (list, tuple))
1670 for tags2 in tagsets2:
1671                assert isinstance(tags2, (list, tuple))
1672 dt: LinkageData = {"word": related}
1673 if roman:
1674 dt["roman"] = roman
1675                if ruby:
1676 dt["ruby"] = ruby
1677                if "alt-of" in tags2:
1678 check_related(related)
1679 data_extend(data, "tags", tags1)
1680 data_extend(data, "tags", tags2)
1681 data_extend(data, "topics", topics1)
1682 data_extend(data, "topics", topics2)
1683 data_append(data, "alt_of", dt)
1684                elif "form-of" in tags2:
1685 check_related(related)
1686 data_extend(data, "tags", tags1)
1687 data_extend(data, "tags", tags2)
1688 data_extend(data, "topics", topics1)
1689 data_extend(data, "topics", topics2)
1690 data_append(data, "form_of", dt)
1691                elif "compound-of" in tags2:
1692 check_related(related)
1693 data_extend(data, "tags", tags1)
1694 data_extend(data, "tags", tags2)
1695 data_extend(data, "topics", topics1)
1696 data_extend(data, "topics", topics2)
1697 data_append(data, "compound", related)
1698 else:
1699 lang = wxr.wtp.section or "LANG_MISSING"
1700 related, final_tags = parse_head_final_tags(
1701 wxr, lang, related
1702 )
1703 # print("add_related: related={!r} tags1={!r} tags2={!r} "
1704 # "final_tags={!r}"
1705 # .format(related, tags1, tags2, final_tags))
1706 tags = list(tags1) + list(tags2) + list(final_tags)
1707 check_related(related)
1708 form: FormData = {"form": related}
1709 if head_group:
1710 form["head_nr"] = head_group
1711 if roman:
1712 form["roman"] = roman
1713                    if ruby:
1714 form["ruby"] = ruby
1715 data_extend(form, "topics", topics1)
1716 data_extend(form, "topics", topics2)
1717                    if topics1 or topics2:
1718 wxr.wtp.debug(
1719 "word head form has topics: {}".format(form),
1720 sortid="form_descriptions/1233",
1721 )
1722 # Add tags from canonical form into the main entry
1723 if "canonical" in tags:
1724                        if related in ("m", "f") and len(titleword) > 1:
1725 wxr.wtp.debug(
1726 "probably incorrect canonical form "
1727 "{!r} ignored (probably tag combination "
1728 "missing from xlat_head_map)".format(related),
1729 sortid="form_descriptions/1241",
1730 )
1731 continue
1732 if (
1733 related != titleword
1734 or add_all_canonicals
1735 or topics1
1736 or topics2
1737 or ruby
1738 ):
1739 data_extend(form, "tags", list(sorted(set(tags))))
1740 else:
1741 # We won't add canonical form here
1742 filtered_tags = list(
1743 x for x in tags if x != "canonical"
1744 )
1745 data_extend(data, "tags", filtered_tags)
1746 continue
1747 else:
1748 data_extend(form, "tags", list(sorted(set(tags))))
1749 # Only insert if the form is not already there
1750 for old in data.get("forms", ()):
1751                        if form == old:
1752 break
1753 else:
1754 data_append(data, "forms", form)
1756 # If this form had pre-tags that started with "both" or "all", add those
1757 # tags also to following related forms that don't have their own tags
1758 # specified.
1759 return following_tagsets
1762# Issue #967: in English entries, word forms are sometimes skipped because
1763# they are taggable words and their distw() is too big, like "clipping" from "clip"
1764WORDS_WITH_FALSE_POSITIVE_TAGS: dict[str, list[str]] = {
1765 "clip": ["clipping"], # XXX remember to change me back to clipping after
1766 "English": ["English", "Englishes"],
1767 "common": ["common", "commoner"],
1768 # tests.
1769}
1771WORDS_WITH_FALSE_POSITIVE_FORMS: dict[str, list[str]] = {
1772 "unaccountability": ["countable", "uncountable"],
1773 "uncountability": ["countable", "uncountable"],
1774}
1776FALSE_POSITIVE_MISSING_FORMS: dict[str, list[str]] = {}
1778FORM_ASSOCIATED_TAG_WORDS: set[str] = {
1779 "participle",
1780 "past",
1781 "present",
1782 "singular",
1783 "plural",
1784 "first-person",
1785 "second-person",
1786 "third-person",
1787 "gerund",
1788}
1791def parse_word_head(
1792 wxr: WiktextractContext,
1793 pos: str,
1794 text: str,
1795 data: WordData,
1796 is_reconstruction: bool,
1797 head_group: Optional[int],
1798 ruby=None,
1799 links=None,
1800) -> None:
1801 """Parses the head line for a word for in a particular language and
1802 part-of-speech, extracting tags and related forms."""
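# Illustrative sketch only (assumed example, not from the source): for an
# English noun head such as
#     text = "walrus (plural walruses)"
# this function would typically append something like
#     {"form": "walruses", "tags": ["plural"]}
# to data["forms"] via add_related(); the exact tags depend on decode_tags()
# and the xlat_* tables, so treat the output as indicative only.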
1803 assert isinstance(wxr, WiktextractContext)
1804 assert isinstance(pos, str)
1805 assert isinstance(text, str)
1806 assert isinstance(data, dict)
1807 assert isinstance(ruby, (list, tuple)) or ruby is None
1808 if ruby is None:
1809 ruby = []
1810 assert is_reconstruction in (True, False)
1811 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text))
1812 # print(f"PARSE_WORD_HEAD: {data=}")
1813 if links is None:
1814 links = []
1816 if len(links) > 0:
1817 # if we have link data (that is, links with stuff like commas and
1818 # spaces), replace word_re with a modified local-scope pattern
1819 # print(f"links {list((c, ord(c)) for link in links for c in link)=}")
1820 word_re = re.compile(
1821 r"\b" # In case we have forms that are longer and contain links
1822 +
1823 # or words as a substring...
1824 r"\b|\b".join(
1825 sorted((re.escape(s) for s in links), key=lambda x: -len(x))
1826 )
1827 + r"\b|"
1828 + word_pattern
1829 )
1830 else:
1831 word_re = word_re_global
1833 if "Lua execution error" in text or "Lua timeout error" in text: 1833 ↛ 1834line 1833 didn't jump to line 1834 because the condition on line 1833 was never true
1834 return
1836 # In Aug 2021, some words had spurious Template:en at the end of head forms
1837 # due to a Wiktionary error.
1838 text = re.sub(r"\s+Template:[-a-zA-Z]+\s*$", "", text)
1840 # Fix words with "superlative:" or "comparative:" at end of head
1841 # e.g. grande/Spanish/Adj
1842 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text)
1844 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb
1845 m = re.search(r", non-past ([^)]+ \([^)]+\))", text)
1846 if m:
1847 add_related(
1848 wxr,
1849 data,
1850 ["non-past"],
1851 [m.group(1)],
1852 text,
1853 True,
1854 is_reconstruction,
1855 head_group,
1856 ruby,
1857 )
1858 text = text[: m.start()] + text[m.end() :]
1860 language = wxr.wtp.section
1861 titleword = re.sub(
1862 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE"
1863 )
1864 titleparts = list(
1865 m.group(0)
1866 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE")
1867 )
1868 if not titleparts: 1868 ↛ 1869line 1868 didn't jump to line 1869 because the condition on line 1868 was never true
1869 return
1871 # Remove " or" from the end to prevent weird canonical forms
1872 if text.endswith(" or"):
1873 for tp in titleparts:
1874 if text.endswith(tp): 1874 ↛ 1875line 1874 didn't jump to line 1875 because the condition on line 1874 was never true
1875 break
1876 else:
1877 text = text.removesuffix(" or").rstrip()
1879 # Handle the part of the head that is not in parentheses. However, certain
1880 # parenthesized parts are part of word, and those must be handled
1881 # specially here.
1882 if ruby: 1882 ↛ 1883line 1882 didn't jump to line 1883 because the condition on line 1882 was never true
1883 text = quote_kept_ruby(wxr, ruby, text)
1884 base = text
1885 base = quote_kept_parens(base)
1886 base = remove_text_in_parentheses(base)
1887 base = base.replace("?", "") # Removes uncertain articles etc
1888 base = re.sub(r"\s+", " ", base)
1889 base = re.sub(r" ([,;])", r"\1", base)
1890 base = re.sub(r"(.*) •.*", r"\1", base)
1891 # Many languages use • as a punctuation mark separating the base
1892 # from the rest of the head. στάδιος/Ancient Greek, issue #176
1893 base = base.strip()
1895 # Check for certain endings in head (mostly for compatibility with weird
1896 # heads, e.g. rata/Romanian "1st conj." at end)
1897 m = re.search(head_end_re, base)
1898 tags: Union[tuple[str, ...], list[str]] = []
1899 if m: 1899 ↛ 1900line 1899 didn't jump to line 1900 because the condition on line 1899 was never true
1900 tags = head_end_map[m.group(1).lower()].split()
1901 data_extend(data, "tags", tags)
1902 base = base[: m.start()]
1904 # Special case: handle Hán Nôm readings for Vietnamese characters
1905 m = re.match(
1906 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base
1907 )
1908 if m: 1908 ↛ 1909line 1908 didn't jump to line 1909 because the condition on line 1908 was never true
1909 tag, readings = m.groups()
1910 tag = re.sub(r"\s+", "-", tag)
1911 for reading in split_at_comma_semi(readings, skipped=links):
1912 add_related(
1913 wxr,
1914 data,
1915 [tag],
1916 [reading],
1917 text,
1918 True,
1919 is_reconstruction,
1920 head_group,
1921 ruby,
1922 )
1923 return
1925 # Special case: Hebrew " [pattern: nnn]" ending
1926 m = re.search(r"\s+\[pattern: ([^]]+)\]", base)
1927 if m: 1927 ↛ 1928line 1927 didn't jump to line 1928 because the condition on line 1927 was never true
1928 add_related(
1929 wxr,
1930 data,
1931 ["class"],
1932 [m.group(1)],
1933 text,
1934 True,
1935 is_reconstruction,
1936 head_group,
1937 ruby,
1938 )
1939 base = base[: m.start()] + base[m.end() :]
1941 # Clean away some messy "Upload an image" template text used in
1942 # American Sign Language:
1943 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp
1944 m = re.search(r"Upload .+ gif image.", base)
1945 if m: 1945 ↛ 1946line 1945 didn't jump to line 1946 because the condition on line 1945 was never true
1946 base = base[: m.start()] + base[m.end() :]
1948 # Split the head into alternatives. This is a complicated task, as
1949 # we do not want to split on "or" or "," when immediately followed by more
1950 # head-final tags, but otherwise do want to split by them.
1951 # 20230907 added "or" to this to handle 'true or false', titles with 'or'
1952 if wxr.wtp.title and ("," in wxr.wtp.title or " or " in wxr.wtp.title):
1953 # A kludge to handle article titles/phrases with commas.
1954 # Preprocess splits to first capture the title, then handle
1955 # all the others as usual.
1956 presplits = re.split(r"({})".format(wxr.wtp.title), base)
1957 splits = []
1958 for psplit in presplits:
1959 if psplit == wxr.wtp.title:
1960 splits.append(psplit)
1961 else:
1962 splits.extend(re.split(head_split_re, psplit))
1963 else:
1964 # Do the normal split; previously the only behavior.
1965 splits = re.split(head_split_re, base)
1966 # print("SPLITS:", splits)
1967 alts: list[str] = []
1968 # print("parse_word_head: splits:", splits,
1969 # "head_split_re_parens:", head_split_re_parens)
1970 for i in range(
1971 0, len(splits) - head_split_re_parens, head_split_re_parens + 1
1972 ):
1973 v = splits[i]
1974 ending = splits[i + 1] or "" # XXX is this correct???
1975 # print("parse_word_head alts v={!r} ending={!r} alts={}"
1976 # .format(v, ending, alts))
1977 if alts and (v == "" and ending):
1978 assert ending[0] == " "
1979 alts[-1] += " or" + ending # ending starts with a space
1980 elif v or ending: 1980 ↛ 1970line 1980 didn't jump to line 1970 because the condition on line 1980 was always true
1981 alts.append((v or "") + (ending or ""))
1982 last = splits[-1].strip()
1983 conn = "" if len(splits) < 3 else splits[-2]
1984 # print("parse_word_head alts last={!r} conn={!r} alts={}"
1985 # .format(last, conn, alts))
1986 if (
1987 alts
1988 and last
1989 and (
1990 last.split()[0] in xlat_head_map
1991 or (
1992 conn == " or "
1993 and (alts[-1] + " or " + last).strip() in xlat_head_map
1994 )
1995 )
1996 ):
1997 alts[-1] += " or " + last
1998 elif last:
1999 alts.append(last)
2001 # print("parse_word_head alts: {}".format(alts))
2002 # print(f"{base=}")
2004 # Process the head alternatives
2005 canonicals: list[tuple[list[str], list[str]]] = []
2006 mode: Optional[str] = None
2007 for alt_i, alt in enumerate(alts):
2008 alt = alt.strip()
2009 if alt.startswith("compound form:"): 2009 ↛ 2010line 2009 didn't jump to line 2010 because the condition on line 2009 was never true
2010 mode = "compound-form"
2011 alt = alt[14:].strip()
2012 if mode == "compound-form": 2012 ↛ 2013line 2012 didn't jump to line 2013 because the condition on line 2012 was never true
2013 add_related(
2014 wxr,
2015 data,
2016 ["in-compounds"],
2017 [alt],
2018 text,
2019 True,
2020 is_reconstruction,
2021 head_group,
2022 ruby,
2023 )
2024 continue
2025 # For non-first parts, see if it can be treated as tags-only
2026 if alt_i == 0:
2027 expanded_alts = [alt]
2028 else:
2029 expanded_alts = map_with(xlat_descs_map, [alt])
2030 # print("EXPANDED_ALTS:", expanded_alts)
2031 tagsets: Optional[list[tuple[str, ...]]]
2032 for alt in expanded_alts:
2033 baseparts = list(m.group(0) for m in word_re.finditer(alt))
2034 if alt_i > 0:
2035 tagsets, topics = decode_tags(" ".join(baseparts))
2036 if not any("error-unknown-tag" in x for x in tagsets):
2037 data_extend(data, "topics", topics)
2038 for tags1 in tagsets:
2039 data_extend(data, "tags", tags1)
2040 continue
2042 alt, tags = parse_head_final_tags(
2043 wxr, language or "MISSING_LANG", alt
2044 )
2045 tags = list(tags) # Make sure we don't modify anything cached
2046 tags.append("canonical")
2047 if alt_i == 0 and "," in wxr.wtp.title: # type:ignore[operator]
2048 # Kludge to handle article titles/phrases with commas.
2049 # basepart's regex strips commas, which leads to a
2050 # canonical form that is the title phrase without a comma.
2051 # basepart in add_related is almost immediately joined with
2052 # spaces anyhow. XXX not exactly sure why it's
2053 # canonicals.append((tags, baseparts)) and not (tags, [alt])
2054 baseparts = [alt]
2055 canonicals.append((tags, baseparts))
2056 for tags, baseparts in canonicals:
2057 add_related(
2058 wxr,
2059 data,
2060 tags,
2061 baseparts,
2062 text,
2063 len(canonicals) > 1,
2064 is_reconstruction,
2065 head_group,
2066 ruby,
2067 )
2069 # Handle parenthesized descriptors for the word form and links to
2070 # related words
2071 text = quote_kept_parens(text)
2072 parens = list(
2073 m.group(2)
2074 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text)
2075 )
2076 parens.extend(
2077 m.group(1)
2078 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text)
2079 )
2080 have_romanization = False
2081 have_ruby = False
2082 hiragana = ""
2083 katakana = ""
2084 for paren in parens:
2085 paren = paren.strip()
2086 if not paren: 2086 ↛ 2087line 2086 didn't jump to line 2087 because the condition on line 2086 was never true
2087 continue
2088 if paren.startswith("see "):
2089 continue
2090 if paren.startswith("U+"): 2090 ↛ 2091line 2090 didn't jump to line 2091 because the condition on line 2090 was never true
2091 continue
2092 # In some rare cases, strip the word that inflects from the form
2093 # description, e.g. "look through rose-tinted glasses"/English.
2094 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren)
2096 # If it starts with hiragana or katakana, treat as such form. Note
2097 # that each hiragana/katakana character is in separate parentheses,
2098 # so we must concatenate them.
2099 try:
2100 un = unicodedata.name(paren[0]).split()[0]
2101 except ValueError:
2102 un = "INVALID"
2103 if un == "KATAKANA": 2103 ↛ 2104line 2103 didn't jump to line 2104 because the condition on line 2103 was never true
2104 katakana += paren
2105 have_ruby = True
2106 continue
2107 if un == "HIRAGANA": 2107 ↛ 2108line 2107 didn't jump to line 2108 because the condition on line 2107 was never true
2108 hiragana += paren
2109 have_ruby = True
2110 continue
2112 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes,"
2113 # in the middle of the parenthesized expression, e.g. 薄
2114 def strokes_repl(m: re.Match) -> str:
2115 strokes1, tags1, strokes2, tags2 = m.groups()
2116 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]:
2117 tags = tags.split(", ")
2118 tags = list(
2119 "Mainland China" if t == "Mainland" else t for t in tags
2120 )
2121 tags.append("strokes")
2122 add_related(
2123 wxr,
2124 data,
2125 tags,
2126 [strokes],
2127 text,
2128 True,
2129 is_reconstruction,
2130 head_group,
2131 ruby,
2132 )
2133 return ", "
2135 paren = re.sub(
2136 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ",
2137 strokes_repl,
2138 paren,
2139 )
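# Illustrative sketch (assumed example): for a parenthesized part like
#     ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes, "
# the substitution above would typically produce form entries roughly like
#     {"form": "16", "tags": ["Japan", "Mainland China", "strokes"]}
#     {"form": "17", "tags": ["Hong Kong", "Taiwan", "strokes"]}
# through add_related(); the exact tag lists come from the comma split and
# sorting inside add_related(), so take them as indicative only.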
2141 descriptors = map_with(xlat_descs_map, [paren])
2142 new_desc = []
2143 for desc in descriptors:
2144 new_desc.extend(
2145 map_with(
2146 xlat_tags_map,
2147 split_at_comma_semi(desc, extra=[", or "], skipped=links),
2148 )
2149 )
2150 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None
2151 following_tags = None # Added to prev_tags from previous parenthesized
2152 # part, e.g. walrus/English
2153 # "(both nonstandard, proscribed, uncommon)"
2154 for desc_i, desc in enumerate(new_desc):
2155 # print("HEAD DESC: {!r}".format(desc))
2157 # Abort on certain descriptors (assume remaining values are
2158 # examples or uninteresting, cf. gaan/Navajo, horior/Latin)
2159 if re.match(r"^(per |e\.g\.$)", desc): 2159 ↛ 2160line 2159 didn't jump to line 2160 because the condition on line 2159 was never true
2160 break
2162 # If it all consists of CJK characters, add it with the
2163 # CJK tag. This is used at least for some Vietnamese
2164 # words (e.g., ba/Vietnamese)
2165 try:
2166 if all(unicodedata.name(x).startswith("CJK ") for x in desc): 2166 ↛ 2167line 2166 didn't jump to line 2167 because the condition on line 2166 was never true
2167 add_related(
2168 wxr,
2169 data,
2170 ["CJK"],
2171 [desc],
2172 text,
2173 True,
2174 is_reconstruction,
2175 head_group,
2176 ruby,
2177 )
2178 continue
2179 except ValueError:
2180 pass
2182 # Handle some special cases
2183 splitdesc = desc.split()
2184 if ( 2184 ↛ 2193line 2184 didn't jump to line 2193 because the condition on line 2184 was never true
2185 len(splitdesc) >= 3
2186 and splitdesc[1] == "superlative"
2187 and classify_desc(splitdesc[0]) != "tags"
2188 and prev_tags
2189 ):
2190 # Handle the special case of second comparative after comma,
2191 # followed by superlative without comma. E.g.
2192 # mal/Portuguese/Adv
2193 for ts in prev_tags:
2194 add_related(
2195 wxr,
2196 data,
2197 ts,
2198 [splitdesc[0]],
2199 text,
2200 True,
2201 is_reconstruction,
2202 head_group,
2203 ruby,
2204 )
2205 desc = " ".join(splitdesc[1:])
2206 elif ( 2206 ↛ 2214line 2206 didn't jump to line 2214 because the condition on line 2206 was never true
2207 len(splitdesc) == 2
2208 and splitdesc[0] in ("also", "and")
2209 and prev_tags
2210 and classify_desc(splitdesc[1]) != "tags"
2211 ):
2212 # Sometimes alternative forms are prefixed with "also" or
2213 # "and"
2214 for ts in prev_tags:
2215 add_related(
2216 wxr,
2217 data,
2218 ts,
2219 [splitdesc[1]],
2220 text,
2221 True,
2222 is_reconstruction,
2223 head_group,
2224 ruby,
2225 )
2226 continue
2227 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2227 ↛ 2228line 2227 didn't jump to line 2228 because the condition on line 2227 was never true
2228 continue
2230 # If only one word, assume it is a comma-separated alternative
2231 # to the previous one
2232 if " " not in desc:
2233 cls = classify_desc(desc)
2234 if cls != "tags":
2235 if prev_tags: 2235 ↛ 2237line 2235 didn't jump to line 2237 because the condition on line 2235 was never true
2236 # Assume comma-separated alternative to previous one
2237 for ts in prev_tags:
2238 add_related(
2239 wxr,
2240 data,
2241 ts,
2242 [desc],
2243 text,
2244 True,
2245 is_reconstruction,
2246 head_group,
2247 ruby,
2248 )
2249 continue
2250 elif distw(titleparts, desc) <= 0.5: 2250 ↛ 2253line 2250 didn't jump to line 2253 because the condition on line 2250 was never true
2251 # Similar to head word, assume a dialectal variation to
2252 # the base form. Cf. go/Alemannic German/Verb
2253 add_related(
2254 wxr,
2255 data,
2256 ["alternative"],
2257 [desc],
2258 text,
2259 True,
2260 is_reconstruction,
2261 head_group,
2262 ruby,
2263 )
2264 continue
2265 elif (
2266 cls in ("romanization", "english")
2267 and not have_romanization
2268 and classify_desc(titleword) == "other"
2269 and not (
2270 "categories" in data and desc in data["categories"]
2271 )
2272 ):
2273 # Assume it to be a romanization
2274 add_romanization(
2275 wxr,
2276 data,
2277 desc,
2278 text,
2279 is_reconstruction,
2280 head_group,
2281 ruby,
2282 )
2283 have_romanization = True
2284 continue
2286 m = re.match(r"^(\d+) strokes?$", desc)
2287 if m:
2288 # Special case, used to give #strokes for Han characters
2289 add_related(
2290 wxr,
2291 data,
2292 ["strokes"],
2293 [m.group(1)],
2294 text,
2295 True,
2296 is_reconstruction,
2297 head_group,
2298 ruby,
2299 )
2300 continue
2302 # See if it is radical+strokes
2303 m = re.match(
2304 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF"
2305 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)"
2306 r"( in (Japanese|Chinese|traditional Chinese|"
2307 r"simplified Chinese))?$",
2308 desc,
2309 )
2310 if m: 2310 ↛ 2313line 2310 didn't jump to line 2313 because the condition on line 2310 was never true
2311 # Special case, used to give radical + strokes for Han
2312 # characters
2313 radical_strokes = m.group(1)
2314 lang = m.group(3)
2315 t = ["radical+strokes"]
2316 if lang:
2317 t.extend(lang.split())
2318 add_related(
2319 wxr,
2320 data,
2321 t,
2322 [radical_strokes],
2323 text,
2324 True,
2325 is_reconstruction,
2326 head_group,
2327 ruby,
2328 )
2329 prev_tags = None
2330 following_tags = None
2331 continue
2333 # See if it indicates historical Katakana orthography (←) or
2334 # just an otherwise katakana/hiragana form
2335 m = re.match(r"←\s*|kana\s+", desc)
2336 if m: 2336 ↛ 2337line 2336 didn't jump to line 2337 because the condition on line 2336 was never true
2337 if desc.startswith("←"):
2338 t1 = "historical "
2339 else:
2340 t1 = ""
2341 x = desc[m.end() :]
2342 if x.endswith("?"):
2343 x = x[:-1]
2344 # XXX should we add a tag indicating uncertainty?
2345 if x:
2346 name = unicodedata.name(x[0])
2347 if name.startswith("HIRAGANA "):
2348 desc = t1 + "hiragana " + x
2349 elif name.startswith("KATAKANA "):
2350 desc = t1 + "katakana " + x
2352 # See if it is "n strokes in Chinese" or similar
2353 m = re.match(
2354 r"(\d+) strokes in (Chinese|Japanese|"
2355 r"traditional Chinese|simplified Chinese)$",
2356 desc,
2357 )
2358 if m: 2358 ↛ 2360line 2358 didn't jump to line 2360 because the condition on line 2358 was never true
2359 # Special case, used to give just strokes for some Han chars
2360 strokes = m.group(1)
2361 lang = m.group(2)
2362 t = ["strokes"]
2363 t.extend(lang.split())
2364 add_related(
2365 wxr,
2366 data,
2367 t,
2368 [strokes],
2369 text,
2370 True,
2371 is_reconstruction,
2372 head_group,
2373 ruby,
2374 )
2375 prev_tags = None
2376 following_tags = None
2377 continue
2379 # American Sign Language has images (or requests for image)
2380 # as heads, + this ASL gloss after.
2381 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text)
2382 if m2: 2382 ↛ 2383line 2382 didn't jump to line 2383 because the condition on line 2382 was never true
2383 add_related(
2384 wxr,
2385 data,
2386 ["ASL-gloss"],
2387 [m2.group(1)],
2388 text,
2389 True,
2390 is_reconstruction,
2391 head_group,
2392 ruby,
2393 )
2394 continue
2396 parts = list(m.group(0) for m in re.finditer(word_re, desc))
2397 if not parts: 2397 ↛ 2398line 2397 didn't jump to line 2398 because the condition on line 2397 was never true
2398 prev_tags = None
2399 following_tags = None
2400 continue
2402 # Check for certain language-specific header part starts that
2403 # modify the tags applied to the following form
2404 if len(parts) == 2 and language in lang_specific_head_map: 2404 ↛ 2405line 2404 didn't jump to line 2405 because the condition on line 2404 was never true
2405 ht = lang_specific_head_map[language]
2406 if parts[0] in ht:
2407 rem_tags, add_tags = ht[parts[0]]
2408 new_prev_tags1: list[list[str]] = []
2409 tags2: Union[tuple[str, ...], list[str]]
2410 for tags2 in prev_tags or [()]:
2411 if rem_tags is True: # Remove all old tags
2412 tsets = set()
2413 else:
2414 tsets = set(tags2) - set(rem_tags.split())
2415 tsets = tsets | set(add_tags.split())
2416 tags = list(sorted(tsets))
2417 add_related(
2418 wxr,
2419 data,
2420 tags,
2421 [parts[1]],
2422 text,
2423 True,
2424 is_reconstruction,
2425 head_group,
2426 ruby,
2427 )
2428 new_prev_tags1.append(tags)
2429 prev_tags = new_prev_tags1
2430 following_tags = None
2431 continue
2433 # Handle the special case of descriptors that are parenthesized,
2434 # e.g., (archaic or Scotland)
2435 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc)
2436 if m is not None and classify_desc(m.group(1)) == "tags": 2436 ↛ 2437line 2436 didn't jump to line 2437 because the condition on line 2436 was never true
2437 tagpart = m.group(1)
2438 related = [m.group(2)]
2439 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True)
2440 if topics:
2441 wxr.wtp.debug(
2442 "parenthized head part {!r} contains topics: {}".format(
2443 tagpart, topics
2444 ),
2445 sortid="form_descriptions/1647",
2446 )
2447 elif m is not None and re.match(r"in the sense ", m.group(1)): 2447 ↛ 2450line 2447 didn't jump to line 2450 because the condition on line 2447 was never true
2448 # Handle certain ignored cases
2449 # e.g. bord/Danish: in the sense "plank"
2450 related = [m.group(2)]
2451 tagsets = [()]
2452 else:
2453 # Normal parsing of the descriptor
2454 alt_related = None
2455 alt_tagsets = None
2456 tagsets = None
2457 for i in range(len(parts), 0, -1):
2458 related = parts[i:]
2459 tagparts = parts[:i]
2460 # print(" i={} related={} tagparts={}"
2461 # .format(i, related, tagparts))
2462 tagsets, topics = decode_tags(
2463 " ".join(tagparts), no_unknown_starts=True
2464 )
2465 # print("tagparts={!r} tagsets={} topics={} related={} "
2466 # "alt_related={} distw={:.2f}"
2467 # .format(tagparts, tagsets, topics, related,
2468 # alt_related,
2469 # distw(titleparts, parts[i - 1])))
2470 if (
2471 topics
2472 or not tagsets
2473 or any("error-unknown-tag" in x for x in tagsets)
2474 ):
2475 if alt_related is not None: 2475 ↛ 2477line 2475 didn't jump to line 2477 because the condition on line 2475 was never true
2476 # We already had a good division, so let's stop.
2477 break
2478 # Bad division, try deeper
2479 continue
2480 # print(f"{parts[i-1]=}, {parts=}")
2481 if (
2482 i > 1
2483 and len(parts[i - 1]) >= 4
2484 and (
2485 distw(titleparts, parts[i - 1]) <= 0.4
2486 or (
2487 wxr.wtp.section == "English"
2488 and wxr.wtp.title
2489 in WORDS_WITH_FALSE_POSITIVE_TAGS
2490 and parts[i - 1]
2491 in WORDS_WITH_FALSE_POSITIVE_TAGS[wxr.wtp.title]
2492 )
2493 )
2494 # Fixes 'unaccountability' wiktext #1196
2495 and not (
2496 wxr.wtp.section == "English"
2497 and wxr.wtp.title in WORDS_WITH_FALSE_POSITIVE_FORMS
2498 and parts[i - 1]
2499 in WORDS_WITH_FALSE_POSITIVE_FORMS[wxr.wtp.title]
2500 )
2501 # Fixes wiktextract #983, where "participle"
2502 # was too close to "Martinize" and so this accepted
2503 # ["participle", "Martinize"] as matching; this
2504 # kludge prevents this from happening if titleparts
2505 # is shorter than what would be 'related'.
2506 # This breaks if we want to detect stuff that
2507 # actually gets an extra space-separated word when
2508 # 'inflected'.
2509 and (
2510 len(titleparts) >= len(parts[i - 1 :])
2511 or "or" in parts[i - 1 :]
2512 )
2513 ):
2514 # print(f"Reached; {parts=}, {parts[i-1]=}")
2515 alt_related = related
2516 alt_tagsets = tagsets
2517 continue
2518 alt_related = None
2519 alt_tagsets = None
2520 break
2521 else:
2522 if alt_related is None: 2522 ↛ 2554line 2522 didn't jump to line 2554 because the condition on line 2522 was always true
2523 # Check if the parenthesized part is likely a
2524 # romanization
2525 if ( 2525 ↛ 2533line 2525 didn't jump to line 2533 because the condition on line 2525 was never true
2526 (have_ruby or classify_desc(base) == "other")
2527 and classify_desc(paren) == "romanization"
2528 and not (
2529 "categories" in data
2530 and desc in data["categories"]
2531 )
2532 ):
2533 for r in split_at_comma_semi(
2534 paren, extra=[" or "], skipped=links
2535 ):
2536 add_romanization(
2537 wxr,
2538 data,
2539 r,
2540 text,
2541 is_reconstruction,
2542 head_group,
2543 ruby,
2544 )
2545 have_romanization = True
2546 continue
2547 tagsets = [("error-unrecognized-head-form",)]
2548 wxr.wtp.debug(
2549 "unrecognized head form: {}".format(desc),
2550 sortid="form_descriptions/1698",
2551 )
2552 continue
2554 if alt_related is not None: 2554 ↛ 2555line 2554 didn't jump to line 2555 because the condition on line 2554 was never true
2555 related = alt_related
2556 tagsets = alt_tagsets
2558 # print("FORM END: tagsets={} related={}".format(tagsets, related))
2559 # print("==================")
2561 if ( 2561 ↛ 2582line 2561 didn't jump to line 2582 because the condition on line 2561 was never true
2562 len(related) <= 0
2563 and wxr.wtp.section == "English"
2564 and tagsets is not None
2565 and len(tagsets) > 0
2566 and not any(
2567 s.startswith("error-") for tagset in tagsets for s in tagset
2568 )
2569 and any(
2570 s in FORM_ASSOCIATED_TAG_WORDS
2571 for tagset in tagsets
2572 for s in tagset
2573 )
2574 and (
2575 wxr.wtp.title not in FALSE_POSITIVE_MISSING_FORMS
2576 and not any(
2577 rel in FALSE_POSITIVE_MISSING_FORMS[wxr.wtp.title or ""]
2578 for rel in related
2579 )
2580 )
2581 ):
2582 wxr.wtp.debug(
2583 f"Form tags without form: {desc=}, {tagsets=}",
2584 sortid="form_description/20250107",
2585 )
2586 if not tagsets: 2586 ↛ 2587line 2586 didn't jump to line 2587 because the condition on line 2586 was never true
2587 continue
2589 # print(f"{alts=}, {related=}")
2591 assert isinstance(related, (list, tuple))
2592 related_str = " ".join(related)
2593 if "or" in titleparts:
2594 alts = [related_str]
2595 else:
2596 alts = split_at_comma_semi(
2597 related_str, separators=[r"\bor\b"], skipped=links
2598 )
2599 # print(f"{related_str=}, {alts=}")
2600 if not alts:
2601 alts = [""]
2602 for related_str in alts:
2603 if related_str:
2604 if prev_tags and (
2605 all(
2606 all(
2607 t in ["nonstandard", "dialectal"]
2608 or valid_tags[t] == "dialect"
2609 for t in ts
2610 )
2611 for ts in tagsets
2612 )
2613 or (
2614 any("participle" in ts for ts in prev_tags)
2615 and all(
2616 "attributive" in ts
2617 or any(valid_tags[t] == "gender" for t in ts)
2618 for ts in tagsets
2619 )
2620 )
2621 ):
2622 # Merged with previous tags. Don't update previous
2623 # tags here; cf. burn/English/Verb
2624 for tags_l in tagsets:
2625 for ts in prev_tags:
2626 tags_l1 = list(sorted(set(tags_l) | set(ts)))
2627 add_related(
2628 wxr,
2629 data,
2630 tags_l1,
2631 [related_str],
2632 text,
2633 True,
2634 is_reconstruction,
2635 head_group,
2636 ruby,
2637 )
2638 else:
2639 # Not merged with previous tags
2640 for tags_l in tagsets:
2641 if following_tags is not None: 2641 ↛ 2642line 2641 didn't jump to line 2642 because the condition on line 2641 was never true
2642 for ts in following_tags:
2643 tags_l1 = list(
2644 sorted(set(tags_l) | set(ts))
2645 )
2646 add_related(
2647 wxr,
2648 data,
2649 tags_l1,
2650 [related_str],
2651 text,
2652 True,
2653 is_reconstruction,
2654 head_group,
2655 ruby,
2656 )
2657 else:
2658 ret = add_related(
2659 wxr,
2660 data,
2661 tags_l,
2662 [related_str],
2663 text,
2664 True,
2665 is_reconstruction,
2666 head_group,
2667 ruby,
2668 )
2669 if ret is not None: 2669 ↛ 2670line 2669 didn't jump to line 2670 because the condition on line 2669 was never true
2670 following_tags = ret
2671 prev_tags = tagsets
2672 else:
2673 if desc_i < len(new_desc) - 1 and all( 2673 ↛ 2680line 2673 didn't jump to line 2680 because the condition on line 2673 was never true
2674 "participle" in ts or "infinitive" in ts
2675 for ts in tagsets
2676 ):
2677 # Interpret it as a standalone form description
2678 # in the middle, probably followed by forms or
2679 # language-specific descriptors. cf. drikke/Danish
2680 new_prev_tags2 = []
2681 for ts1 in prev_tags or [()]:
2682 for ts2 in tagsets:
2683 ts = tuple(sorted(set(ts1) | set(ts2)))
2684 new_prev_tags2.append(ts)
2685 prev_tags = new_prev_tags2
2686 continue
2687 for tags in tagsets:
2688 data_extend(data, "tags", tags)
2689 prev_tags = tagsets
2690 following_tags = None
2692 # Finally, if we collected hiragana/katakana, add them now
2693 if hiragana: 2693 ↛ 2694line 2693 didn't jump to line 2694 because the condition on line 2693 was never true
2694 add_related(
2695 wxr,
2696 data,
2697 ["hiragana"],
2698 [hiragana],
2699 text,
2700 True,
2701 is_reconstruction,
2702 head_group,
2703 ruby,
2704 )
2705 if katakana: 2705 ↛ 2706line 2705 didn't jump to line 2706 because the condition on line 2705 was never true
2706 add_related(
2707 wxr,
2708 data,
2709 ["katakana"],
2710 [katakana],
2711 text,
2712 True,
2713 is_reconstruction,
2714 head_group,
2715 ruby,
2716 )
2718 # XXX check if this is actually relevant; tags in word root data
2719 # are extremely rare (not sure where they slip through).
2720 tags = data.get("tags", []) # type:ignore
2721 if len(tags) > 0:
2722 # wxr.wtp.debug(
2723 # f"Tags appear in word root data: {data['tags']=}", # type:ignore
2724 # sortid="form_descriptions/2620/20240606",
2725 # ) # Messes up tests.
2726 data["tags"] = list(sorted(set(tags))) # type:ignore
2729def parse_sense_qualifier(
2730 wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData]
2731) -> None:
2732 """Parses tags or topics for a sense or some other data. The values are
2733 added into the dictionary ``data``."""
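# Illustrative sketch (assumed examples): a qualifier like "archaic, dialectal"
# would normally end up in data["tags"], whereas free English text such as
# "of a ship" would be stored in data["qualifier"]; which path is taken depends
# on classify_desc() and decode_tags(), so these are indicative outcomes only.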
2734 assert isinstance(wxr, WiktextractContext)
2735 assert isinstance(text, str)
2736 assert isinstance(data, dict)
2737 # print("parse_sense_qualifier:", text)
2738 if re.match(r"\([^()]+\)$", text): 2738 ↛ 2739line 2738 didn't jump to line 2739 because the condition on line 2738 was never true
2739 text = text[1:-1]
2740 if re.match(r'"[^"]+"$', text): 2740 ↛ 2741line 2740 didn't jump to line 2741 because the condition on line 2740 was never true
2741 text = text[1:-1]
2742 lst = map_with(xlat_descs_map, [text])
2743 sense_tags: list[str] = []
2744 for text in lst:
2745 for semi in split_at_comma_semi(text):
2746 if not semi: 2746 ↛ 2747line 2746 didn't jump to line 2747 because the condition on line 2746 was never true
2747 continue
2748 orig_semi = semi
2749 idx = semi.find(":")
2750 if idx >= 0: 2750 ↛ 2751line 2750 didn't jump to line 2751 because the condition on line 2750 was never true
2751 semi = semi[:idx]
2752 cls = classify_desc(semi, allow_unknown_tags=True)
2753 # print("parse_sense_qualifier: classify_desc: {} -> {}"
2754 # .format(semi, cls))
2755 if cls == "tags":
2756 tagsets, topics = decode_tags(semi)
2757 data_extend(data, "topics", topics)
2758 # XXX should think how to handle distinct options better,
2759 # e.g., "singular and plural genitive"; that can't really be
2760 # done with changing the calling convention of this function.
2761 # Should split sense if more than one category of tags differs.
2762 for tags in tagsets:
2763 sense_tags.extend(tags)
2764 elif cls == "taxonomic": 2764 ↛ 2765line 2764 didn't jump to line 2765 because the condition on line 2764 was never true
2765 if re.match(r"×[A-Z]", semi):
2766 sense_tags.append("extinct")
2767 semi = semi[1:]
2768 data["taxonomic"] = semi
2769 elif cls == "english":
2770 if "qualifier" in data and data["qualifier"] != orig_semi: 2770 ↛ 2771line 2770 didn't jump to line 2771 because the condition on line 2770 was never true
2771 data["qualifier"] += "; " + orig_semi
2772 else:
2773 data["qualifier"] = orig_semi
2774 else:
2775 wxr.wtp.debug(
2776 "unrecognized sense qualifier: {}".format(text),
2777 sortid="form_descriptions/1831",
2778 )
2779 sense_tags = list(sorted(set(sense_tags)))
2780 data_extend(data, "tags", sense_tags)
2783def parse_pronunciation_tags(
2784 wxr: WiktextractContext, text: str, data: SoundData
2785) -> None:
2786 assert isinstance(wxr, WiktextractContext)
2787 assert isinstance(text, str)
2788 assert isinstance(data, dict)
2789 text = text.strip()
2790 if not text: 2790 ↛ 2791line 2790 didn't jump to line 2791 because the condition on line 2790 was never true
2791 return
2792 cls = classify_desc(text)
2793 notes = []
2794 if cls == "tags":
2795 tagsets, topics = decode_tags(text)
2796 data_extend(data, "topics", topics)
2797 for tagset in tagsets:
2798 for t in tagset:
2799 if " " in t: 2799 ↛ 2800line 2799 didn't jump to line 2800 because the condition on line 2799 was never true
2800 notes.append(t)
2801 else:
2802 data_append(data, "tags", t)
2803 else:
2804 notes.append(text)
2805 if notes:
2806 data["note"] = "; ".join(notes)
2809def parse_translation_desc(
2810 wxr: WiktextractContext, lang: str, text: str, tr: TranslationData
2811) -> None:
2812 assert isinstance(wxr, WiktextractContext)
2813 assert isinstance(lang, str) # The language of ``text``
2814 assert isinstance(text, str)
2815 assert isinstance(tr, dict)
2816 # print("parse_translation_desc:", text)
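# Illustrative sketch (assumed example, not from the source): for a
# translation item such as
#     text = "слово (slovo) (obsolete)"
# the loop below would typically strip the trailing parenthesized parts,
# putting "slovo" into tr["roman"] and the "obsolete" tag into tr["tags"],
# leaving tr["word"] = "слово"; actual results depend on classify_desc().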
2818 # Process all parenthesized parts from the translation item
2819 note = None
2820 restore_beginning = ""
2821 restore_end = ""
2822 while True:
2823 beginning = False
2824 # See if we can find a parenthesized expression at the end
2825 m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text)
2826 if m:
2827 par = m.group(1)
2828 text = text[: m.start()]
2829 if par.startswith(("literally ", "lit.")):
2830 continue # Not useful for disambiguation in many idioms
2831 else:
2832 # See if we can find a parenthesized expression at the start
2833 m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text)
2834 if m:
2835 par = m.group(1)
2836 text = text[m.end() :]
2837 beginning = True
2838 if re.match(r"^(\d|\s|,| or | and )+$", par): 2838 ↛ 2843line 2838 didn't jump to line 2843 because the condition on line 2838 was never true
2839 # Looks like this beginning parenthesized expression only
2840 # contains digits or their combinations. We assume such
2841 # to be sense descriptions if no sense has been selected,
2842 # or otherwise just ignore them.
2843 if not tr.get("sense"):
2844 tr["sense"] = par
2845 continue
2846 else:
2847 # See if we can find a parenthesized expression in the middle.
2848 # Romanizations are sometimes between word and gender marker,
2849 # e.g. wife/English/Tr/Yiddish.
2850 m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text)
2851 if m:
2852 par = m.group(1)
2853 text = text[: m.start()] + text[m.end() :]
2854 else:
2855 # No more parenthesized expressions - break out of the loop
2856 break
2858 # Some cleanup of artifacts that may result from skipping some templates
2859 # in earlier stages
2860 if par.startswith(": "): 2860 ↛ 2861line 2860 didn't jump to line 2861 because the condition on line 2860 was never true
2861 par = par[2:]
2862 if par.endswith(","): 2862 ↛ 2863line 2862 didn't jump to line 2863 because the condition on line 2862 was never true
2863 par = par[:-1]
2864 if re.match(r'^[“"]([^“”"]*)[“”"]$', par): 2864 ↛ 2865line 2864 didn't jump to line 2865 because the condition on line 2864 was never true
2865 par = par[1:-1]
2866 par = par.strip()
2868 # Check for special script pronunciation followed by romanization,
2869 # used in many Asian languages.
2870 lst = par.split(", ")
2871 if len(lst) == 2:
2872 a, r = lst
2873 if classify_desc(a) == "other":
2874 cls = classify_desc(r)
2875 # print("parse_translation_desc: r={} cls={}".format(r, cls))
2876 if cls == "romanization" or (
2877 cls == "english" and len(r.split()) == 1 and r[0].islower()
2878 ):
2879 if tr.get("alt") and tr.get("alt") != a: 2879 ↛ 2880line 2879 didn't jump to line 2880 because the condition on line 2879 was never true
2880 wxr.wtp.debug(
2881 'more than one value in "alt": {} vs. {}'.format(
2882 tr["alt"], a
2883 ),
2884 sortid="form_descriptions/1930",
2885 )
2886 tr["alt"] = a
2887 if tr.get("roman") and tr.get("roman") != r: 2887 ↛ 2888line 2887 didn't jump to line 2888 because the condition on line 2887 was never true
2888 wxr.wtp.debug(
2889 'more than one value in "roman": '
2890 "{} vs. {}".format(tr["roman"], r),
2891 sortid="form_descriptions/1936",
2892 )
2893 tr["roman"] = r
2894 continue
2896 # Check for certain comma-separated tags combined with English text
2897 # at the beginning or end of a comma-separated parenthesized list
2898 while len(lst) > 1:
2899 cls = classify_desc(lst[0])
2900 if cls == "tags": 2900 ↛ 2901line 2900 didn't jump to line 2901 because the condition on line 2900 was never true
2901 tagsets, topics = decode_tags(lst[0])
2902 for t in tagsets:
2903 data_extend(tr, "tags", t)
2904 data_extend(tr, "topics", topics)
2905 lst = lst[1:]
2906 continue
2907 cls = classify_desc(lst[-1])
2908 if cls == "tags":
2909 tagsets, topics = decode_tags(lst[-1])
2910 for t in tagsets:
2911 data_extend(tr, "tags", t)
2912 data_extend(tr, "topics", topics)
2913 lst = lst[:-1]
2914 continue
2915 break
2916 par = ", ".join(lst)
2918 if not par: 2918 ↛ 2919line 2918 didn't jump to line 2919 because the condition on line 2918 was never true
2919 continue
2920 if re.search(tr_ignored_parens_re, par): 2920 ↛ 2921line 2920 didn't jump to line 2921 because the condition on line 2920 was never true
2921 continue
2922 if par.startswith("numeral:"):
2923 par = par[8:].strip()
2925 # Classify the part in parenthesis and process accordingly
2926 cls = classify_desc(par)
2927 # print("parse_translation_desc classify: {!r} -> {}"
2928 # .format(par, cls))
2929 if par == text:
2930 pass
2931 if par == "f": 2931 ↛ 2932line 2931 didn't jump to line 2932 because the condition on line 2931 was never true
2932 data_append(tr, "tags", "feminine")
2933 elif par == "m": 2933 ↛ 2934line 2933 didn't jump to line 2934 because the condition on line 2933 was never true
2934 data_append(tr, "tags", "masculine")
2935 elif cls == "tags":
2936 tagsets, topics = decode_tags(par)
2937 for tags in tagsets:
2938 data_extend(tr, "tags", tags)
2939 data_extend(tr, "topics", topics)
2940 elif cls == "english":
2941 # If the text contains any of certain grammatical words, treat it
2942 # as a "note" instead of "english"
2943 if re.search(tr_note_re, par):
2944 if par.endswith(":"): 2944 ↛ 2945line 2944 didn't jump to line 2945 because the condition on line 2944 was never true
2945 par = par[:-1]
2946 if par not in ("see entry for forms",): 2946 ↛ 2822line 2946 didn't jump to line 2822 because the condition on line 2946 was always true
2947 if note: 2947 ↛ 2948line 2947 didn't jump to line 2948 because the condition on line 2947 was never true
2948 note = note + ";" + par
2949 else:
2950 note = par
2951 else:
2952 # There can be more than one parenthesized english item, see
2953 # e.g. Aunt/English/Translations/Tamil
2954 if tr.get("english"): 2954 ↛ 2955line 2954 didn't jump to line 2955 because the condition on line 2954 was never true
2955 tr["english"] += "; " + par
2956 else:
2957 tr["english"] = par
2958 elif cls == "romanization":
2959 # print("roman text={!r} text cls={}"
2960 # .format(text, classify_desc(text)))
2961 if classify_desc(text) in (
2962 "english",
2963 "romanization",
2964 ) and lang not in ("Egyptian",):
2965 if beginning:
2966 restore_beginning += "({}) ".format(par)
2967 else:
2968 restore_end = " ({})".format(par) + restore_end
2969 else:
2970 if tr.get("roman"): 2970 ↛ 2971line 2970 didn't jump to line 2971 because the condition on line 2970 was never true
2971 wxr.wtp.debug(
2972 'more than one value in "roman": {} vs. {}'.format(
2973 tr["roman"], par
2974 ),
2975 sortid="form_descriptions/2013",
2976 )
2977 tr["roman"] = par
2978 elif cls == "taxonomic": 2978 ↛ 2979line 2978 didn't jump to line 2979 because the condition on line 2978 was never true
2979 if tr.get("taxonomic"):
2980 wxr.wtp.debug(
2981 'more than one value in "taxonomic": {} vs. {}'.format(
2982 tr["taxonomic"], par
2983 ),
2984 sortid="form_descriptions/2019",
2985 )
2986 if re.match(r"×[A-Z]", par):
2987 data_append(tr, "tags", "extinct")
2988 par = par[1:]
2989 tr["taxonomic"] = par
2990 elif cls == "other": 2990 ↛ 3000line 2990 didn't jump to line 3000 because the condition on line 2990 was always true
2991 if tr.get("alt"): 2991 ↛ 2992line 2991 didn't jump to line 2992 because the condition on line 2991 was never true
2992 wxr.wtp.debug(
2993 'more than one value in "alt": {} vs. {}'.format(
2994 tr["alt"], par
2995 ),
2996 sortid="form_descriptions/2028",
2997 )
2998 tr["alt"] = par
2999 else:
3000 wxr.wtp.debug(
3001 "parse_translation_desc unimplemented cls {}: {}".format(
3002 cls, par
3003 ),
3004 sortid="form_descriptions/2033",
3005 )
3007 # Check for gender indications in suffix
3008 text, final_tags = parse_head_final_tags(wxr, lang, text)
3009 data_extend(tr, "tags", final_tags)
3011 # Restore those parts that we did not want to remove (they are often
3012 # optional words or words that are always used with the given translation)
3013 text = restore_beginning + text + restore_end
3015 if note:
3016 tr["note"] = note.strip()
3017 if text and text not in ignored_translations:
3018 tr["word"] = text.strip()
3020 # Sometimes gender seems to be at the end of "roman" field, see e.g.
3021 # fire/English/Noun/Translations/Egyptian (for "oxidation reaction")
3022 roman = tr.get("roman")
3023 if roman:
3024 if roman.endswith(" f"): 3024 ↛ 3025line 3024 didn't jump to line 3025 because the condition on line 3024 was never true
3025 data_append(tr, "tags", "feminine")
3026 tr["roman"] = roman[:-2].strip()
3027 elif roman.endswith(" m"): 3027 ↛ 3028line 3027 didn't jump to line 3028 because the condition on line 3027 was never true
3028 data_append(tr, "tags", "masculine")
3029 tr["roman"] = roman[:-2].strip()
3031 # If the word now has "english" field but no "roman" field, and
3032 # the word would be classified "other" (generally non-latin
3033 # characters), and the value in "english" is only one lowercase
3034 # word, move it to "roman". This happens semi-frequently when the
3035 # translation is transliterated the same as some English word.
3036 roman = tr.get("roman")
3037 english = tr.get("english")
3038 if english and not roman and "word" in tr:
3039 cls = classify_desc(tr["word"])
3040 if cls == "other" and " " not in english and english[0].islower():
3041 del tr["english"]
3042 tr["roman"] = english
3044 # If the entry now has both tr["roman"] and tr["word"] and they have
3045 # the same value, delete tr["roman"] (e.g., man/English/Translations
3046 # Evenki)
3047 if tr.get("word") and tr.get("roman") == tr.get("word"): 3047 ↛ 3048line 3047 didn't jump to line 3048 because the condition on line 3047 was never true
3048 del tr["roman"]
3051def parse_alt_or_inflection_of(
3052 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
3053) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
3054 """Tries to parse an inflection-of or alt-of description. If successful,
3055 this returns (tags, list of alt-of/inflection-of dicts). If the description cannot
3056 be parsed, this returns None. This may also return (tags, None) when the
3057 gloss describes a form (or some other tags were extracted from it), but
3058 there was no alt-of/form-of/synonym-of word."""
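# Illustrative sketch (assumed example): a gloss like
#     "genitive singular of talo"
# would typically yield something like
#     (["form-of", "genitive", "singular"], [{"word": "talo"}])
# while a gloss that is not a form description returns None; the exact tags
# depend on decode_tags() and the form_of/alt_of tag tables.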
3059 # print("parse_alt_or_inflection_of: {!r}".format(gloss))
3060 # Occasionally inflection_of/alt_of have "A(n) " etc. at the beginning.
3062 # Never interpret a gloss that is equal to the word itself as a tag
3063 # (e.g., instrumental/Romanian, instrumental/Spanish).
3064 if gloss.lower() == wxr.wtp.title.lower() or ( # type:ignore[union-attr]
3065 len(gloss) >= 5 and distw([gloss.lower()], wxr.wtp.title.lower()) < 0.2 # type:ignore[union-attr]
3066 ):
3067 return None
3069 # First try parsing it as-is
3070 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
3071 if parsed is not None:
3072 return parsed
3074 # Next try parsing it with the first character converted to lowercase if
3075 # it was previously uppercase.
3076 if gloss and gloss[0].isupper():
3077 gloss = gloss[0].lower() + gloss[1:]
3078 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
3079 if parsed is not None:
3080 return parsed
3082 return None
3085# These tags are not allowed in alt-or-inflection-of parsing
3086alt_infl_disallowed: set[str] = set(
3087 [
3088 "error-unknown-tag",
3089 "place", # Not in inflected forms and causes problems e.g. house/English
3090 ]
3091)
3094def parse_alt_or_inflection_of1(
3095 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
3096) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
3097 """Helper function for parse_alt_or_inflection_of. This handles a single
3098 capitalization."""
3099 if not gloss or not gloss.strip(): 3099 ↛ 3100line 3099 didn't jump to line 3100 because the condition on line 3099 was never true
3100 return None
3102 # Prevent some common errors where we would parse something we shouldn't
3103 if re.search(r"(?i)form of address ", gloss): 3103 ↛ 3104line 3103 didn't jump to line 3104 because the condition on line 3103 was never true
3104 return None
3106 gloss = re.sub(r"only used in [^,]+, ", "", gloss)
3108 # First try all formats ending with "of" (or other known last words that
3109 # can end a form description)
3110 matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss))
3111 m: Optional[re.Match]
3112 for m in reversed(matches):
3113 desc = gloss[: m.end()].strip()
3114 base = gloss[m.end() :].strip()
3115 tagsets, topics = decode_tags(desc, no_unknown_starts=True)
3116 if not topics and any(
3117 not (alt_infl_disallowed & set(ts)) for ts in tagsets
3118 ):
3119 # Successfully parsed, including "of" etc.
3120 tags: list[str] = []
3121 # If you have ("Western-Armenian", ..., "form-of") as your
3122 # tag set, it's most probable that it's something like
3123 # "Western Armenian form of խոսել (xosel)", which should
3124 # get "alt-of" instead of "form-of" (inflection).
3125 # խօսիլ/Armenian
3126 for ts_t in tagsets:
3127 if "form-of" in ts_t and any(
3128 valid_tags.get(tk) == "dialect" for tk in ts_t
3129 ):
3130 ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"}
3131 else:
3132 ts_s = set(ts_t)
3133 if not (alt_infl_disallowed & ts_s): 3133 ↛ 3126line 3133 didn't jump to line 3126 because the condition on line 3133 was always true
3134 tags.extend(ts_s)
3135 if (
3136 "alt-of" in tags
3137 or "form-of" in tags
3138 or "synonym-of" in tags
3139 or "compound-of" in tags
3140 ):
3141 break
3142 if m.group(1) == "of":
3143 # Try parsing without the final "of". This is commonly used in
3144 # various form-of expressions.
3145 desc = gloss[: m.start()]
3146 base = gloss[m.end() :]
3147 tagsets, topics = decode_tags(desc, no_unknown_starts=True)
3148 # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}"
3149 # .format(desc, base, tagsets, topics))
3150 if not topics and any(
3151 not (alt_infl_disallowed & set(t)) for t in tagsets
3152 ):
3153 tags = []
3154 for t in tagsets:
3155 if not (alt_infl_disallowed & set(t)): 3155 ↛ 3154line 3155 didn't jump to line 3154 because the condition on line 3155 was always true
3156 tags.extend(t)
3157 # It must have at least one tag from form_of_tags
3158 if set(tags) & form_of_tags:
3159 # Accept this as form-of
3160 tags.append("form-of")
3161 break
3162 if set(tags) & alt_of_tags:
3163 # Accept this as alt-of
3164 tags.append("alt-of")
3165 break
3167 else:
3168 # Did not find a form description based on last word; see if the
3169 # whole description is tags
3170 tagsets, topics = decode_tags(gloss, no_unknown_starts=True)
3171 if not topics and any(
3172 not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts)
3173 for ts in tagsets
3174 ):
3175 tags = []
3176 for ts in tagsets:
3177 if not (alt_infl_disallowed & set(ts)) and form_of_tags & set( 3177 ↛ 3176line 3177 didn't jump to line 3176 because the condition on line 3177 was always true
3178 ts
3179 ):
3180 tags.extend(ts)
3181 base = ""
3182 else:
3183 return None
3185 # kludge for Spanish (again): 'x of [word] combined with [clitic]'
3186 m = re.search(r"combined with \w+$", base)
3187 if m: 3187 ↛ 3188line 3187 didn't jump to line 3188 because the condition on line 3187 was never true
3188 tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True)
3189 if not topics:
3190 for ts in tagsets:
3191 tags.extend(ts)
3192 base = base[: m.start()]
3194 # It is fairly common for form_of glosses to end with something like
3195 # "ablative case" or "in instructive case". Parse that ending.
3196 base = base.strip()
3197 lst = base.split()
3198 # print("parse_alt_or_inflection_of: lst={}".format(lst))
3199 if len(lst) >= 3 and lst[-1] in ("case", "case."): 3199 ↛ 3200line 3199 didn't jump to line 3200 because the condition on line 3199 was never true
3200 node = valid_sequences.children.get(lst[-2])
3201 if node and node.end:
3202 for s in node.tags:
3203 tags.extend(s.split(" "))
3204 lst = lst[:-2]
3205 if lst[-1] == "in" and len(lst) > 1:
3206 lst = lst[:-1]
3208 # Eliminate empty and duplicate tags
3209 tags = list(sorted(set(t for t in tags if t)))
3211 # Clean up some extra stuff from the linked word, separating the text
3212 # into ``base`` (the linked word) and ``extra`` (additional information,
3213 # such as English translation or clarifying word sense information).
3214 orig_base = base
3215 base = re.sub(alt_of_form_of_clean_re, "", orig_base)
3216 base = re.sub(r" [(⟨][^()]*[)⟩]", "", base) # Remove all (...) groups
3217 extra = orig_base[len(base) :]
3218 extra = re.sub(r"^[- :;.,,—]+", "", extra)
3219 if extra.endswith(".") and extra.count(".") == 1:
3220 extra = extra[:-1].strip()
3221 m = re.match(r"^\(([^()]*)\)$", extra)
3222 if m: 3222 ↛ 3223line 3222 didn't jump to line 3223 because the condition on line 3222 was never true
3223 extra = m.group(1)
3224 else:
3225 # These weird brackets are used in "slash mark"
3226 m = re.match(r"^⟨([^()]*)⟩$", extra)
3227 if m: 3227 ↛ 3228line 3227 didn't jump to line 3228 because the condition on line 3227 was never true
3228 extra = m.group(1)
3229 m = re.match(r'^[“"]([^"“”]*)["”]$', extra)
3230 if m: 3230 ↛ 3231line 3230 didn't jump to line 3231 because the condition on line 3230 was never true
3231 extra = m.group(1)
3232 # Note: base might still contain comma-separated values and values
3233 # separated by "and"
3234 base = base.strip()
3235 if base.endswith(",") and len(base) > 2: 3235 ↛ 3236line 3235 didn't jump to line 3236 because the condition on line 3235 was never true
3236 base = base[:-1].strip()
3237 while (
3238 base.endswith(".")
3239 and not wxr.wtp.page_exists(base)
3240 and base not in gloss_template_args
3241 ):
3242 base = base[:-1].strip()
3243 if base.endswith('(\u201cconjecture")'): 3243 ↛ 3244line 3243 didn't jump to line 3244 because the condition on line 3243 was never true
3244 base = base[:-14].strip()
3245 tags.append("conjecture")
3246 while ( 3246 ↛ 3251line 3246 didn't jump to line 3251 because the condition on line 3246 was never true
3247 base.endswith(".")
3248 and not wxr.wtp.page_exists(base)
3249 and base not in gloss_template_args
3250 ):
3251 base = base[:-1].strip()
3252 if ( 3252 ↛ 3257line 3252 didn't jump to line 3257 because the condition on line 3252 was never true
3253 base.endswith(".")
3254 and base not in gloss_template_args
3255 and base[:-1] in gloss_template_args
3256 ):
3257 base = base[:-1]
3258 base = base.strip()
3259 if not base:
3260 return tags, None
3262 # Kludge: Spanish verb forms seem to have a dot added at the end.
3263 # Remove it; we know of no Spanish verbs ending with a dot.
3264 language = wxr.wtp.section
3265 pos = wxr.wtp.subsection
3266 # print("language={} pos={} base={}".format(language, pos, base))
3267 if ( 3267 ↛ 3273line 3267 didn't jump to line 3273 because the condition on line 3267 was never true
3268 base.endswith(".")
3269 and len(base) > 1
3270 and base[-2].isalpha()
3271 and (language == "Spanish" and pos == "Verb")
3272 ):
3273 base = base[:-1]
3275 # Split base to alternatives when multiple alternatives provided
3276 parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "])
3277 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "")
3278 if (
3279 len(parts) <= 1
3280 or base.startswith("/")
3281 or base.endswith("/")
3282 or "/" in titleword
3283 ):
3284 parts = [base]
3285 # Split base to alternatives when of form "a or b" and "a" and "b" are
3286 # similar (generally spelling variants of the same word or similar words)
3287 if len(parts) == 1:
3288 pp = base.split()
3289 if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4:
3290 parts = [pp[0], pp[2]]
3292 # Create form-of/alt-of entries based on the extracted data
3293 dt_lst: list[AltOf] = []
3294 for p in parts:
3295 # Check for some suspicious base forms
3296 m = re.search(r"[.,] |[{}()]", p)
3297 if m and not wxr.wtp.page_exists(p): 3297 ↛ 3298line 3297 didn't jump to line 3298 because the condition on line 3297 was never true
3298 wxr.wtp.debug(
3299 "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p),
3300 sortid="form_descriptions/2278",
3301 )
3302 if p.startswith("*") and len(p) >= 3 and p[1].isalpha(): 3302 ↛ 3303line 3302 didn't jump to line 3303 because the condition on line 3302 was never true
3303 p = p[1:]
3304 dt: AltOf = {"word": p}
3305 if extra:
3306 dt["extra"] = extra
3307 dt_lst.append(dt)
3308 # print("alt_or_infl_of returning tags={} lst={} base={!r}"
3309 # .format(tags, lst, base))
3310 return tags, dt_lst
3313@functools.lru_cache(maxsize=65536)
3314def classify_desc(
3315 desc: str,
3316 allow_unknown_tags=False,
3317 no_unknown_starts=False,
3318 accepted: Union[tuple[str, ...], frozenset[str]] = tuple(),
3319) -> str:
3320 """Determines whether the given description is most likely tags, english,
3321 a romanization, or something else. Returns one of: "tags", "english",
3322 "romanization", or "other". If ``allow_unknown_tags`` is True, then
3323 allow "tags" classification even when the only tags are those starting
3324 with a word in allowed_unknown_starts."""
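# Illustrative sketch (assumed examples): classify_desc("archaic") would
# normally return "tags", classify_desc("a small dog") "english",
# classify_desc("slovo") "romanization", and classify_desc("слово") "other";
# "taxonomic" is returned for known species names. Outcomes depend on the
# tag tables and word lists, so these are indicative only.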
3325 assert isinstance(desc, str)
3326 # Empty and whitespace-only strings are treated as "other"
3327 desc = desc.strip()
3328 if not desc:
3329 return "other"
3331 normalized_desc = unicodedata.normalize("NFKD", desc)
3333 # If it can be fully decoded as tags without errors, treat as tags
3334 tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts)
3335 for tagset in tagsets:
3336 assert isinstance(tagset, (list, tuple, set))
3337 if "error-unknown-tag" not in tagset and (
3338 topics or allow_unknown_tags or any(" " not in x for x in tagset)
3339 ):
3340 return "tags"
3342 # Check if it looks like the taxonomic name of a species
3343 if desc in known_species:
3344 return "taxonomic"
3345 desc1 = re.sub(r"^×([A-Z])", r"\1", desc)
3346 desc1 = re.sub(r"\s*×.*", "", desc1)
3347 lst = desc1.split()
3348 if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts:
3349 have_non_english = 1 if lst[0].lower() not in english_words else 0
3350 for x in lst[1:]:
3351 if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"):
3352 continue
3353 if x[0].isupper():
3354 break
3355 if x not in english_words:
3356 have_non_english += 1
3357 else:
3358 # Starts with known taxonomic term, does not contain uppercase
3359 # words (except allowed letters) and at least one word is not
3360 # English
3361 if have_non_english >= len(lst) - 1 and have_non_english > 0: 3361 ↛ 3367line 3361 didn't jump to line 3367 because the condition on line 3361 was always true
3362 return "taxonomic"
3364 # If all words are in our English dictionary, interpret as English.
3365 # [ -~] is regex black magic, "ALL CHARACTERS from space to tilde"
3366 # in ASCII. Took me a while to figure out.
3367 if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1:
3368 if desc in english_words and desc[0].isalpha():
3369 return "english" # Handles ones containing whitespace
3370 desc1 = re.sub(
3371 tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc
3372 )
3373 tokens = tokenizer.tokenize(desc1)
3374 if not tokens: 3374 ↛ 3375line 3374 didn't jump to line 3375 because the condition on line 3374 was never true
3375 return "other"
3376 lst_bool = list(
3377 x not in not_english_words
3378 and
3379 # not x.isdigit() and
3380 (
3381 x in english_words
3382 or x.lower() in english_words
3383 or x in known_firsts
3384 or x[0].isdigit()
3385 or x in accepted
3386 or
3387 # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or
3388 (
3389 x.endswith("s") and len(x) >= 4 and x[:-1] in english_words
3390 ) # Plural
3391 or (
3392 x.endswith("ies")
3393 and len(x) >= 5
3394 and x[:-3] + "y" in english_words
3395 ) # E.g. lily - lilies
3396 or (
3397 x.endswith("ing")
3398 and len(x) >= 5
3399 and x[:-3] in english_words
3400 ) # E.g. bring - bringing
3401 or (
3402 x.endswith("ing")
3403 and len(x) >= 5
3404 and x[:-3] + "e" in english_words
3405 ) # E.g., tone - toning
3406 or (
3407 x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words
3408 ) # E.g. hang - hanged
3409 or (
3410 x.endswith("ed")
3411 and len(x) >= 5
3412 and x[:-2] + "e" in english_words
3413 ) # E.g. atone - atoned
3414 or (x.endswith("'s") and x[:-2] in english_words)
3415 or (x.endswith("s'") and x[:-2] in english_words)
3416 or (
3417 x.endswith("ise")
3418 and len(x) >= 5
3419 and x[:-3] + "ize" in english_words
3420 )
3421 or (
3422 x.endswith("ised")
3423 and len(x) >= 6
3424 and x[:-4] + "ized" in english_words
3425 )
3426 or (
3427 x.endswith("ising")
3428 and len(x) >= 7
3429 and x[:-5] + "izing" in english_words
3430 )
3431 or (
3432 re.search(r"[-/]", x)
3433 and all(
3434 ((y in english_words and len(y) > 2) or not y)
3435 for y in re.split(r"[-/]", x)
3436 )
3437 )
3438 )
3439 for x in tokens
3440 )
3441 cnt = lst_bool.count(True)
3442 rejected_words = tuple(
3443 x for i, x in enumerate(tokens) if not lst_bool[i]
3444 )
3445 if (
3446 any(
3447 lst_bool[i] and x[0].isalpha() and len(x) > 1
3448 for i, x in enumerate(tokens)
3449 )
3450 and not desc.startswith("-")
3451 and not desc.endswith("-")
3452 and re.search(r"\w+", desc)
3453 and (
3454 cnt == len(lst_bool)
3455 or (
3456 any(
3457 lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens)
3458 )
3459 and cnt >= len(lst_bool) - 1
3460 )
3461 or cnt / len(lst_bool) >= 0.8
3462 or (
3463 all(x in potentially_english_words for x in rejected_words)
3464 and cnt / len(lst_bool) >= 0.50
3465 )
3466 )
3467 ):
3468 return "english"
3469 # Some translations have apparent pronunciation descriptions in /.../
3470 # which we'll put in the romanization field (even though they probably are
3471 # not exactly romanizations).
3472 if desc.startswith("/") and desc.endswith("/"):
3473 return "romanization"
3474 # If all characters are in classes that could occur in romanizations,
3475 # treat as romanization
3476 classes = list(
3477 unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK"
3478 for x in normalized_desc
3479 )
3480 classes1 = []
3481 num_latin = 0
3482 num_greek = 0
3483 # part = ""
3484 # for ch, cl in zip(normalized_desc, classes):
3485 # part += f"{ch}({cl})"
3486 # print(part)
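# Walk over the characters: punctuation commonly seen in transcriptions is
# remapped to "OK"; characters that are not upper- or lowercase letters keep
# their Unicode category; for letters, Latin and Greek are counted separately,
# and letters of non-Latin scripts (or characters without a Unicode name) are
# remapped to "NO" so the final check below rejects the string.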
3487 for ch, cl in zip(normalized_desc, classes):
3488 if ch in (
3489 "'", # ' in Arabic, / in IPA-like parenthesized forms
3490 ".", # e.g., "..." in translations
3491 ";",
3492 ":",
3493 "!",
3494 "‘",
3495 "’",
3496 '"',
3497 "“",
3498 "”",
3499 "/",
3500 "?",
3501 "…", # alternative to "..."
3502 "⁉", # 見る/Japanese automatic transcriptions...
3503 "?",
3504 "!",
3505 "⁻", # superscript -, used in some Cantonese roman, e.g. "we"
3506 "ʔ",
3507 "ʼ",
3508 "ʾ",
3509 "ʹ",
3510 ): # ʹ occurs e.g. in Russian translations of understand/English/verb
3511 classes1.append("OK")
3512 continue
3513 if cl not in ("Ll", "Lu"):
3514 classes1.append(cl)
3515 continue
3516 try:
3517 name = unicodedata.name(ch)
3518 first = name.split()[0]
3519 if first == "LATIN":
3520 num_latin += 1
3521 elif first == "GREEK":
3522 num_greek += 1
3523 elif first == "COMBINING": # Combining diacritic 3523 ↛ 3524line 3523 didn't jump to line 3524 because the condition on line 3523 was never true
3524 cl = "OK"
3525 elif re.match(non_latin_scripts_re, name): 3525 ↛ 3529 (line 3525 didn't jump to line 3529 because the condition on line 3525 was always true)
3526 cl = "NO" # Not acceptable in romanizations
3527 except ValueError:
3528 cl = "NO" # Not acceptable in romanizations
3529 classes1.append(cl)
3530 # print("classify_desc: {!r} classes1: {}".format(desc, classes1))
3531 # print(set(classes1) )
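# Treat the string as a romanization only if every character class can occur
# in a romanization, there are either no Greek letters or at least two more
# Latin than Greek letters, and the string is neither all punctuation-like
# "OK" characters nor all digits.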
3532 if all(
3533 x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK")
3534 for x in classes1
3535 ):
3536 if (
3537 (num_latin >= num_greek + 2 or num_greek == 0)
3538 and classes1.count("OK") < len(classes1)
3539 and classes1.count("Nd") < len(classes1)
3540 ):
3541 return "romanization"
3542 # Otherwise it is something else, such as hanji version of the word
3543 return "other"
3546def remove_text_in_parentheses(text: str) -> str:
3547 parentheses = 0
3548 new_text = ""
3549 for c in text:
3550 if c == "(":
3551 parentheses += 1
3552 elif c == ")":
3553 parentheses -= 1
3554 elif parentheses == 0:
3555 new_text += c
3556 return new_text
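# Worked examples for the helper above, traced by hand from the loop (the
# extra spaces are the characters that surrounded the removed spans):
#     remove_text_in_parentheses("word (obsolete) form")  ->  "word  form"
#     remove_text_in_parentheses("(a) or (b)")             ->  " or "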