Coverage for src/wiktextract/extractor/en/form_descriptions.py: 76%
1323 statements
coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1# Code for parsing linguistic form descriptions and tags for word senses
2# (both the word entry head - initial part and parenthesized parts -
3# and tags at the beginning of word senses)
4#
5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
7import functools
8import re
9import unicodedata
10from typing import (
11 Any,
12 Literal,
13 Optional,
14 Sequence,
15 Union,
16)
18import Levenshtein
19from nltk import TweetTokenizer # type:ignore[import-untyped]
21from ...datautils import data_append, data_extend, split_at_comma_semi
22from ...tags import (
23 alt_of_tags,
24 form_of_tags,
25 head_final_bantu_langs,
26 head_final_bantu_map,
27 head_final_numeric_langs,
28 head_final_other_langs,
29 head_final_other_map,
30 head_final_semitic_langs,
31 head_final_semitic_map,
32 uppercase_tags,
33 valid_tags,
34 xlat_descs_map,
35 xlat_head_map,
36 xlat_tags_map,
37)
38from ...topics import topic_generalize_map, valid_topics
39from ...wxr_context import WiktextractContext
40from .english_words import (
41 english_words,
42 not_english_words,
43 potentially_english_words,
44)
45from .form_descriptions_known_firsts import known_firsts
46from .taxondata import known_species
47from .type_utils import (
48 AltOf,
49 FormData,
50 LinkageData,
51 SenseData,
52 SoundData,
53 TranslationData,
54 WordData,
55)
57# Tokenizer for classify_desc()
58tokenizer = TweetTokenizer()
60# These are ignored as the value of a related form in form head.
61IGNORED_RELATED: set[str] = set(
62 [
63 "-",
64 "־",
65 "᠆",
66 "‐",
67 "‑",
68 "‒",
69 "–",
70 "—",
71 "―",
72 "−",
73 "⸺",
74 "⸻",
75 "﹘",
76 "﹣",
77 "-",
78 "?",
79 "(none)",
80 ]
81)
84# First words of unicodedata.name() that indicate scripts that cannot be
85# accepted in romanizations or english (i.e., should be considered "other"
86# in classify_desc()).
87non_latin_scripts: list[str] = [
88 "ADLAM",
89 "ARABIC",
90 "ARABIC-INDIC",
91 "ARMENIAN",
92 "BALINESE",
93 "BENGALI",
94 "BRAHMI",
95 "BRAILLE",
96 "CANADIAN",
97 "CHAKMA",
98 "CHAM",
99 "CHEROKEE",
100 "CJK",
101 "COPTIC",
102 "COUNTING ROD",
103 "CUNEIFORM",
104 "CYRILLIC",
105 "DOUBLE-STRUCK",
106 "EGYPTIAN",
107 "ETHIOPIC",
108 "EXTENDED ARABIC-INDIC",
109 "GEORGIAN",
110 "GLAGOLITIC",
111 "GOTHIC",
112 "GREEK",
113 "GUJARATI",
114 "GURMUKHI",
115 "HANGUL",
116 "HANIFI ROHINGYA",
117 "HEBREW",
118 "HIRAGANA",
119 "JAVANESE",
120 "KANNADA",
121 "KATAKANA",
122 "KAYAH LI",
123 "KHMER",
124 "KHUDAWADI",
125 "LAO",
126 "LEPCHA",
127 "LIMBU",
128 "MALAYALAM",
129 "MEETEI",
130 "MYANMAR",
131 "NEW TAI LUE",
132 "NKO",
133 "OL CHIKI",
134 "OLD PERSIAN",
135 "OLD SOUTH ARABIAN",
136 "ORIYA",
137 "OSMANYA",
138 "PHOENICIAN",
139 "SAURASHTRA",
140 "SHARADA",
141 "SINHALA",
142 "SUNDANESE",
143 "SYLOTI",
144 "TAI THAM",
145 "TAKRI",
146 "TAMIL",
147 "TELUGU",
148 "THAANA",
149 "THAI",
150 "TIBETAN",
151 "TIFINAGH",
152 "TIRHUTA",
153 "UGARITIC",
154 "WARANG CITI",
155 "YI",
156]
157non_latin_scripts_re = re.compile(
158 r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b"
159)
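# Illustrative sketch (added example, not part of the original module):
# unicodedata.name("α") is "GREEK SMALL LETTER ALPHA", whose first word
# "GREEK" is matched by non_latin_scripts_re, so classify_desc() would
# treat text containing that character as "other" rather than as a
# romanization or English text.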
161# Sanity check xlat_head_map values
162for k, v in xlat_head_map.items():
163 if v.startswith("?"):
164 v = v[1:]
165 for tag in v.split():
166 if tag not in valid_tags: 166 ↛ 167line 166 didn't jump to line 167 because the condition on line 166 was never true
167 print(
168 "WARNING: xlat_head_map[{}] contains unrecognized tag {}".format(
169 k, tag
170 )
171 )
173# Regexp for finding nested translations from translation items (these are
174# used in, e.g., year/English/Translations/Arabic). This is actually used
175# in page.py.
176nested_translations_re = re.compile(
177 r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format(
178 "|".join(
179 re.escape(x.removeprefix("?"))
180 for x in sorted(xlat_head_map.values(), key=len, reverse=True)
181 if x and not x.startswith("class-")
182 )
183 )
184)
186# Regexp that matches head tag specifiers. Used to match tags from end of
187# translations and linkages
188head_final_re_text = r"( -)?( ({}))+".format(
189 "|".join(
190 re.escape(x)
191 for x in
192 # The sort is to put longer ones first, preferring them in
193 # the regexp match
194 sorted(xlat_head_map.keys(), key=len, reverse=True)
195 )
196)
197head_final_re = re.compile(head_final_re_text + "$")
199# Regexp used to match head tag specifiers at end of a form for certain
200# Bantu languages (particularly Swahili and similar languages).
201head_final_bantu_re_text = r" ({})".format(
202 "|".join(re.escape(x) for x in head_final_bantu_map.keys())
203)
204head_final_bantu_re = re.compile(head_final_bantu_re_text + "$")
206# Regexp used to match head tag specifiers at end of a form for certain
207# Semitic languages (particularly Arabic and similar languages).
208head_final_semitic_re_text = r" ({})".format(
209 "|".join(re.escape(x) for x in head_final_semitic_map.keys())
210)
211head_final_semitic_re = re.compile(head_final_semitic_re_text + "$")
213# Regexp used to match head tag specifiers at end of a form for certain
214# other languages (e.g., Lithuanian, Finnish, French).
215head_final_other_re_text = r" ({})".format(
216 "|".join(re.escape(x) for x in head_final_other_map.keys())
217)
218head_final_other_re = re.compile(head_final_other_re_text + "$")
220# Regexp for splitting heads. See parse_word_head().
221head_split_re_text = (
222 "("
223 + head_final_re_text
224 + "|"
225 + head_final_bantu_re_text
226 + "|"
227 + head_final_semitic_re_text
228 + "|"
229 + head_final_other_re_text
230 + ")?( or |[,;]+)"
231)
232head_split_re = re.compile(head_split_re_text)
233head_split_re_parens = 0
234for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text):
235 head_split_re_parens += m.group(0).count("(")
237# Parenthesized parts that are ignored in translations
238tr_ignored_parens: set[str] = set(
239 [
240 "please verify",
241 "(please verify)",
242 "transliteration needed",
243 "(transliteration needed)",
244 "in words with back vowel harmony",
245 "(in words with back vowel harmony)",
246 "in words with front vowel harmony",
247 "(in words with front vowel harmony)",
248 "see below",
249 "see usage notes below",
250 ]
251)
252tr_ignored_parens_re = re.compile(
253 r"^("
254 + "|".join(re.escape(x) for x in tr_ignored_parens)
255 + ")$"
256 + r"|^(Can we clean up|Can we verify|for other meanings see "
257 r"lit\. )"
258)
260# Translations that are ignored
261ignored_translations: set[str] = set(
262 [
263 "[script needed]",
264 "please add this translation if you can",
265 ]
266)
268# Put english text into the "note" field in a translation if it contains one
269# of these words
270tr_note_re = re.compile(
271 r"(\b(article|definite|indefinite|superlative|comparative|pattern|"
272 r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|"
273 r"postposition|postp|action|actions|articles|"
274 r"adverb|adverbs|noun|nouns|verb|verbs|before|"
275 r"after|placed|prefix|suffix|used with|translated|"
276 r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|"
277 r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|"
278 r"conjugation|declension|class|category|plural|singular|positive|"
279 r"seldom used|formal|informal|familiar|unspoken|spoken|written|"
280 r"indicative|progressive|conditional|potential|"
281 r"accusative|adessive|inessive|superessive|elative|allative|"
282 r"dialect|dialects|object|subject|predicate|movies|recommended|language|"
283 r"locative|continuous|simple|continuousness|gerund|subjunctive|"
284 r"periphrastically|no equivalent|not used|not always used|"
285 r"used only with|not applicable|use the|signifying|wordplay|pronounced|"
286 r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|"
287 r"may be replaced|stricter sense|for nonhumans|"
288 r"sense:|used:|in full:|informally used|followed by|"
289 r"not restricted to|pertaining to|or optionally with|are optional|"
290 r"in conjunction with|in compounds|depending on the relationship|"
291 r"person addressed|one person|multiple persons|may be replaced with|"
292 r"optionally completed with|in the phrase|in response to|"
293 r"before a|before an|preceded by|verbs ending|very common|after a verb|"
294 r"with verb|with uncountable|with the objects|with stative|"
295 r"can be replaced by|often after|used before|used after|"
296 r"used in|clipping of|spoken|somewhat|capitalized|"
297 r"short form|shortening of|shortened form|initialism of|"
298 r"said to|rare:|rarer also|is rarer|negatively connoted|"
299 r"previously mentioned|uncountable noun|countable noun|"
300 r"countable nouns|uncountable nouns|"
301 r"with predicative|with -|with imperfect|with a negated|"
302 r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|"
303 r'"|'
304 r"general term|after a vowel|before a vowel|"
305 r"form|regular|irregular|alternative)"
306 r")($|[) ])|^("
307 # Following are only matched at the beginning of the string
308 r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:|e\.g\.,|cf\.|compare|such as|"
309 r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|"
310 r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|"
311 r"mainly|from|for|also|also:|acronym|"
312 r"\+|with) "
313)
314# \b does not work at the end???
316# Related forms matching this regexp will be considered suspicious if the
317# page title does not also match one of these.
318suspicious_related_re = re.compile(
319 r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)"
320 r"|[][:=<>&#*|]"
321 r"| \d+$"
322)
324# Word forms (head forms, translations, etc) that will be considered ok and
325# silently accepted even if they would otherwise trigger a suspicious
326# form warning.
327ok_suspicious_forms: set[str] = set(
328 [
329 "but en or", # "golden goal"/English/Tr/French
330 "cœur en or", # "heart of gold"/Eng/Tr/French
331 "en or", # golden/Eng/Tr/French
332 "men du", # jet/Etym2/Noun/Tr/Cornish
333 "parachute en or", # "golden parachute"/Eng/Tr/French
334 "vieil or", # "old gold"/Eng/Tr/French
335 # "all that glitters is not gold"/Eng/Tr/French
336 "tout ce qui brille n’est pas or",
337 "μη αποκλειστικό or", # inclusive or/Eng/Tr/Greek
338 "period or full stop",
339 ]
340)
343# Replacements to be done in classify_desc before tokenizing. This is a
344# workaround for shortcomings in TweetTokenizer.
345tokenizer_fixup_map = {
346 r"a.m.": "AM",
347 r"p.m.": "PM",
348}
349tokenizer_fixup_re = re.compile(
350 r"\b("
351 + "|".join(
352 re.escape(x)
353 for x in sorted(
354 tokenizer_fixup_map.keys(), key=lambda x: len(x), reverse=True
355 )
356 )
357 + r")"
358)
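# A minimal sketch of how the fixup could be applied before tokenizing
# (hedged example; the actual call site is classify_desc(), defined later
# in this module):
#     tokenizer_fixup_re.sub(
#         lambda m: tokenizer_fixup_map[m.group(1)], "3 p.m. sharp"
#     )
#     # -> "3 PM sharp"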
360# Unknown tags starting with these words will be silently ignored.
361ignored_unknown_starts: set[str] = set(
362 [
363 "originally",
364 "e.g.",
365 "c.f.",
366 "supplanted by",
367 "supplied by",
368 ]
369)
371ignored_unknown_starts_re = re.compile(
372 r"^("
373 + "|".join(
374 re.escape(x)
375 for x in sorted(ignored_unknown_starts, key=lambda x: -len(x))
376 )
377 + ") "
378)
380# If an unknown sequence starts with one of these, it will continue as an
381# unknown sequence until the end, unless it turns out to have a replacement.
382allowed_unknown_starts: set[str] = set(
383 [
384 "Relating",
385 "accompanied",
386 "added",
387 "after",
388 "answering",
389 "as",
390 "based",
391 "before",
392 "conjugated",
393 "conjunction",
394 "construed",
395 "especially",
396 "expression:",
397 "figurative:",
398 "followed",
399 "for",
400 "forms",
401 "from",
402 "governs",
403 "in",
404 "indicating",
405 "modifying",
406 "normally",
407 "not",
408 "of",
409 "preceding",
410 "prefixed",
411 "referring",
412 "relating",
413 "revived",
414 "said",
415 "since",
416 "takes",
417 "used",
418 "with",
419 "With",
420 "without",
421 ]
422)
423# Allow the ignored unknown starts without complaining
424allowed_unknown_starts.update(ignored_unknown_starts)
426# Full unknown tags that will be ignored in decode_tags()
427# XXX this is unused, ask Tatu where the contents are now
428ignored_unknown_tags: set[str] = set([])
430# Head endings that are mapped to tags
431head_end_map = {
432 " 1st conj.": "conjugation-1",
433 " 2nd conj.": "conjugation-2",
434 " 3rd conj.": "conjugation-3",
435 " 4th conj.": "conjugation-4",
436 " 5th conj.": "conjugation-5",
437 " 6th conj.": "conjugation-6",
438 " 7th conj.": "conjugation-7",
439}
440head_end_re = re.compile(
441 r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$"
442)
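# Illustrative sketch (hypothetical head string): for a head such as
# "rata 1st conj." (cf. rata/Romanian mentioned in parse_word_head() below),
#     m = re.search(head_end_re, "rata 1st conj.")
#     head_end_map[m.group(1).lower()]
#     # -> "conjugation-1"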
445# Dictionary of language-specific parenthesized head part starts that
446# either introduce new tags or modify previous tags. The value for each
447# language is a dictionary that maps the first word of the head part to
448# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous
449# tags or a space-separated string of tags to remove, and ``add_tags`` should
450# be a string of tags to add.
451lang_specific_head_map: dict[
452 str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]]
453] = {
454 "Danish": {
455 # prefix: (rem_tags space separate string/True, add_tags s-sep str)
456 "c": ("neuter", "common-gender"),
457 "n": ("common-gender", "neuter"),
458 "pl": ("singular neuter common-gender", "plural"),
459 "sg": ("plural neuter common-gender", "singular"),
460 },
461}
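# Illustrative reading of the Danish entry above (a sketch, not original code):
#     rem_tags, add_tags = lang_specific_head_map["Danish"]["pl"]
#     # rem_tags == "singular neuter common-gender"  (previous tags to remove)
#     # add_tags == "plural"                         (tag to add)
# i.e. a parenthesized head part beginning with "pl" strips any earlier
# singular/neuter/common-gender tags and adds "plural" instead.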
464# Regular expression used to strip additional stuff from the end of alt_of and
465# form_of.
466alt_of_form_of_clean_re = re.compile(
467 r"(?s)("
468 + "|".join(
469 [
470 r":",
471 r'[“"]',
472 r";",
473 r" \(",
474 r" - ",
475 r" ־ ",
476 r" ᠆ ",
477 r" ‐ ",
478 r" ‑ ",
479 r" ‒ ",
480 r" – ",
481 r" — ",
482 r" ― ",
483 r" − ",
484 r" ⸺ ",
485 r" ⸻ ",
486 r" ﹘ ",
487 r" ﹣ ",
488 r" - ",
489 r" \+ ",
490 r" \(with ",
491 r" with -ra/-re",
492 r"\. Used ",
493 r"\. Also ",
494 r"\. Since ",
495 r"\. A ",
496 r"\.\. A ",
497 r"\. An ",
498 r"\.\. An ",
499 r"\. an ",
500 r"\. The ",
501 r"\. Spanish ",
502 r"\. Language ",
503 r"\. former name of ",
504 r"\. AIM",
505 r"\. OT",
506 r"\. Not ",
507 r"\. Now ",
508 r"\. Nowadays ",
509 r"\. Early ",
510 r"\. ASEAN",
511 r"\. UN",
512 r"\. IMF",
513 r"\. WHO",
514 r"\. WIPO",
515 r"\. AC",
516 r"\. DC",
517 r"\. DNA",
518 r"\. RNA",
519 r"\. SOB",
520 r"\. IMO",
521 r"\. Behavior",
522 r"\. Income ",
523 r"\. More ",
524 r"\. Most ",
525 r"\. Only ",
526 r"\. Also ",
527 r"\. From ",
528 r"\. Of ",
529 r"\.\. Of ",
530 r"\. To ",
531 r"\. For ",
532 r"\. If ",
533 r"\. Praenominal ",
534 r"\. This ",
535 r"\. Replaced ",
536 r"\. CHCS is the ",
537 r"\. Equivalent ",
538 r"\. Initialism ",
539 r"\. Note ",
540 r"\. Alternative ",
541 r"\. Compare ",
542 r"\. Cf\. ",
543 r"\. Comparable ",
544 r"\. Involves ",
545 r"\. Sometimes ",
546 r"\. Commonly ",
547 r"\. Often ",
548 r"\. Typically ",
549 r"\. Possibly ",
550 r"\. Although ",
551 r"\. Rare ",
552 r"\. Instead ",
553 r"\. Integrated ",
554 r"\. Distinguished ",
555 r"\. Given ",
556 r"\. Found ",
557 r"\. Was ",
558 r"\. In ",
559 r"\. It ",
560 r"\.\. It ",
561 r"\. One ",
562 r"\. Any ",
563 r"\. They ",
564 r"\. Members ",
565 r"\. Each ",
566 r"\. Original ",
567 r"\. Especially ",
568 r"\. Usually ",
569 r"\. Known ",
570 r"\.\. Known ",
571 r"\. See ",
572 r"\. see ",
573 r"\. target was not ",
574 r"\. Popular ",
575 r"\. Pedantic ",
576 r"\. Positive ",
577 r"\. Society ",
578 r"\. Plan ",
579 r"\. Environmentally ",
580 r"\. Affording ",
581 r"\. Encompasses ",
582 r"\. Expresses ",
583 r"\. Indicates ",
584 r"\. Text ",
585 r"\. Large ",
586 r"\. Sub-sorting ",
587 r"\. Sax",
588 r"\. First-person ",
589 r"\. Second-person ",
590 r"\. Third-person ",
591 r"\. 1st ",
592 r"\. 2nd ",
593 r"\. 3rd ",
594 r"\. Term ",
595 r"\. Northeastern ",
596 r"\. Northwestern ",
597 r"\. Southeast ",
598 r"\. Egyptian ",
599 r"\. English ",
600 r"\. Cape Province was split into ",
601 r"\. Pañcat",
602 r"\. of the ",
603 r"\. is ",
604 r"\. after ",
605 r"\. or ",
606 r"\. chromed",
607 r"\. percussion",
608 r"\. with his ",
609 r"\. a\.k\.a\. ",
610 r"\. comparative form ",
611 r"\. singular ",
612 r"\. plural ",
613 r"\. present ",
614 r"\. his ",
615 r"\. her ",
616 r"\. equivalent ",
617 r"\. measuring ",
618 r"\. used in ",
619 r"\. cutely ",
620 r"\. Protects",
621 r'\. "',
622 r"\.^",
623 r"\. \+ ",
624 r"\., ",
625 r". — ",
626 r", a ",
627 r", an ",
628 r", the ",
629 r", obsolete ",
630 r", possessed", # 'd/English
631 r", imitating", # 1/English
632 r", derived from",
633 r", called ",
634 r", especially ",
635 r", slang for ",
636 r" corresponding to ",
637 r" equivalent to ",
638 r" popularized by ",
639 r" denoting ",
640 r" in its various senses\.",
641 r" used by ",
642 r" but not for ",
643 r" since ",
644 r" i\.e\. ",
645 r" i\. e\. ",
646 r" e\.g\. ",
647 r" eg\. ",
648 r" etc\. ",
649 r"\[http",
650 r" — used as ",
651 r" by K\. Forsyth ",
652 r" by J\. R\. Allen ",
653 r" by S\. Ferguson ",
654 r" by G\. Donaldson ",
655 r" May refer to ",
656 r" An area or region ",
657 ]
658 )
659 + r").*$"
660)
663class ValidNode:
664 """Node in the valid_sequences tree. Each node is part of a chain
665 or chains that form sequences built out of keys in key->tags
666 maps like xlat_tags, etc. The ValidNode's 'word' is the key
667 by which it is referred to in the root dict or a `children` dict,
668 `end` marks that the node is the end-terminus of a sequence (but
669 it can still continue if the sequence is shared by the start of
670 other sequences: "nominative$" and "nominative plural$" for example),
671 `tags` and `topics` are the dicts containing tag and topic strings
672 for terminal nodes (end==True)."""
674 __slots__ = (
675 "end",
676 "tags",
677 "topics",
678 "children",
679 )
681 def __init__(
682 self,
683 end=False,
684 tags: Optional[list[str]] = None,
685 topics: Optional[list[str]] = None,
686 children: Optional[dict[str, "ValidNode"]] = None,
687 ) -> None:
688 self.end = end
689 self.tags: list[str] = tags or []
690 self.topics: list[str] = topics or []
691 self.children: dict[str, "ValidNode"] = children or {}
694def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None:
695 """Helper function for building trees of valid tags/sequences during
696 initialization."""
697 assert isinstance(tree, ValidNode)
698 assert isinstance(desc, str)
699 assert v is None or isinstance(v, str)
700 node = tree
702 # Build the tree structure: each node has children nodes
703 # whose names are denoted by their dict key.
704 for w in desc.split(" "):
705 if w in node.children:
706 node = node.children[w]
707 else:
708 new_node = ValidNode()
709 node.children[w] = new_node
710 node = new_node
711 if not node.end:
712 node.end = True
713 if not v:
714 return None # Terminate early because there are no tags
716 tagslist = []
717 topicslist = []
718 for vv in v.split():
719 if vv in valid_tags:
720 tagslist.append(vv)
721 elif vv in valid_topics: 721 ↛ 724line 721 didn't jump to line 724 because the condition on line 721 was always true
722 topicslist.append(vv)
723 else:
724 print(
725 "WARNING: tag/topic {!r} maps to unknown {!r}".format(desc, vv)
726 )
727 topics = " ".join(topicslist)
728 tags = " ".join(tagslist)
729 # Changed to "_tags" and "_topics" to avoid possible key-collisions.
730 if topics:
731 node.topics.extend([topics])
732 if tags:
733 node.tags.extend([tags])
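# Illustrative sketch of how add_to_valid_tree() populates the valid_sequences
# tree defined below (assumes "nominative" and "plural" are both in valid_tags):
#     add_to_valid_tree(valid_sequences, "nominative plural", "nominative plural")
# conceptually yields
#     node = valid_sequences.children["nominative"].children["plural"]
#     node.end == True
#     node.tags == ["nominative plural"]  # tags stored as one space-joined string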
736def add_to_valid_tree1(
737 tree: ValidNode,
738 k: str,
739 v: Union[list[str], tuple[str, ...], str],
740 valid_values: Union[set[str], dict[str, Any]],
741) -> list[str]:
742 assert isinstance(tree, ValidNode)
743 assert isinstance(k, str)
744 assert v is None or isinstance(v, (list, tuple, str))
745 assert isinstance(valid_values, (set, dict))
746 if not v: 746 ↛ 747line 746 didn't jump to line 747 because the condition on line 746 was never true
747 add_to_valid_tree(valid_sequences, k, None)
748 return []
749 elif isinstance(v, str):
750 v = [v]
751 q = []
752 for vv in v:
753 assert isinstance(vv, str)
754 add_to_valid_tree(valid_sequences, k, vv)
755 vvs = vv.split()
756 for x in vvs:
757 q.append(x)
758 # return each individual tag
759 return q
762def add_to_valid_tree_mapping(
763 tree: ValidNode,
764 mapping: Union[dict[str, Union[list[str], str]], dict[str, str]],
765 valid_values: Union[set[str], dict[str, Any]],
766 recurse: bool,
767) -> None:
768 assert isinstance(tree, ValidNode)
769 assert isinstance(mapping, dict)
770 assert isinstance(valid_values, (set, dict))
771 assert recurse in (True, False)
772 for k, v in mapping.items():
773 assert isinstance(k, str)
774 assert isinstance(v, (list, str))
775 if isinstance(v, str):
776 q = add_to_valid_tree1(tree, k, [v], valid_values)
777 else:
778 q = add_to_valid_tree1(tree, k, v, valid_values)
779 if recurse:
780 visited = set()
781 while q:
782 v = q.pop()
783 if v in visited:
784 continue
785 visited.add(v)
786 if v not in mapping:
787 continue
788 vv = mapping[v]
789 qq = add_to_valid_tree1(tree, k, vv, valid_values)
790 q.extend(qq)
793# Tree of sequences considered to be tags (includes sequences that are
794# mapped to something that becomes one or more valid tags)
795valid_sequences = ValidNode()
796sequences_with_slashes: set[str] = set()
797for tag in valid_tags:
798 # The basic tags used in our tag system; some are a bit weird, but it is
799 # easier to implement this with 'false' positives than to filter out stuff
800 # that no one else uses.
801 if "/" in tag:
802 sequences_with_slashes.add(tag)
803 add_to_valid_tree(valid_sequences, tag, tag)
804for tag in uppercase_tags:
805 hyphenated = re.sub(r"\s+", "-", tag)
806 if "/" in tag:
807 sequences_with_slashes.add(tag)
808 add_to_valid_tree(valid_sequences, tag, hyphenated)
810# xlat_tags_map!
811add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False)
812for k in xlat_tags_map:
813 if "/" in k:
814 sequences_with_slashes.add(k)
815# Add topics to the same table, with all generalized topics also added
816for topic in valid_topics:
817 assert " " not in topic
818 if "/" in topic: 818 ↛ 819line 818 didn't jump to line 819 because the condition on line 818 was never true
819 sequences_with_slashes.add(topic)
820 add_to_valid_tree(valid_sequences, topic, topic)
821# Let each original topic value stand alone. These are not generally in
822# valid_topics. We add the original topics with spaces replaced by hyphens.
823for topic in topic_generalize_map.keys():
824 hyphenated = re.sub(r"\s+", "-", topic)
825 if "/" in topic: 825 ↛ 826line 825 didn't jump to line 826 because the condition on line 825 was never true
826 sequences_with_slashes.add(topic)
827 add_to_valid_tree(valid_sequences, topic, hyphenated)
828# Add canonicalized/generalized topic values
829add_to_valid_tree_mapping(
830 valid_sequences, topic_generalize_map, valid_topics, True
831)
833# Regex used to divide a decode candidate into parts that shouldn't
834# have their slashes turned into spaces
835slashes_re = re.compile(
836 r"(" + "|".join((re.escape(s) for s in sequences_with_slashes)) + r")"
837)
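# Illustrative sketch of the re.split() behaviour that decode_tags() relies on
# below: splitting on a pattern with a capturing group keeps the matched keys
# at odd indices, so slashes outside those keys can be replaced safely.
#     re.split(r"(b/c)", "a/x b/c d/e")
#     # -> ["a/x ", "b/c", " d/e"]
# (generic pattern used here for illustration; slashes_re is built from the
# keys collected into sequences_with_slashes above)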
839# Regexp used to find "words" from word heads and linguistic descriptions
840word_pattern = (
841 r"[^ ,;()\u200e]+|"
842 r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|"
843 r"[\u2800-\u28ff]|" # Braille characters
844 r"\(([^()]|\([^()]*\))*\)"
845)
847word_re_global = re.compile(word_pattern)
850def distw(titleparts: Sequence[str], word: str) -> float:
851 """Computes how distinct ``word`` is from the most similar word in
852 ``titleparts``. Returns 1 if words completely distinct, 0 if
853 identical, or otherwise something in between."""
854 assert isinstance(titleparts, (list, tuple))
855 assert isinstance(word, str)
856 w = min(
857 Levenshtein.distance(word, tw) / max(len(tw), len(word))
858 for tw in titleparts
859 )
860 return w
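# Illustrative values (a sketch; assumes the usual Levenshtein edit distance):
#     distw(["dog"], "dogs")  # -> 1 / 4 == 0.25  (fairly similar)
#     distw(["dog"], "cat")   # -> 3 / 3 == 1.0   (completely distinct)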
863def map_with(
864 ht: dict[str, str | list[str]] | dict[str, str],
865 lst: Sequence[str],
866) -> list[str]:
867 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or
868 more alternatives each, and returns a combined list of alternatives."""
869 assert isinstance(ht, dict)
870 assert isinstance(lst, (list, tuple))
871 ret = []
872 for x in lst:
873 assert isinstance(x, str)
874 x = x.strip()
875 x = ht.get(x, x)
876 if isinstance(x, str): 876 ↛ 879line 876 didn't jump to line 879 because the condition on line 876 was always true
877 if x: 877 ↛ 872line 877 didn't jump to line 872 because the condition on line 877 was always true
878 ret.append(x)
879 elif isinstance(x, (list, tuple)):
880 ret.extend(x)
881 else:
882 raise RuntimeError("map_with unexpected value: {!r}".format(x))
883 return ret
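# Illustrative sketch (hypothetical mapping): string values are appended as-is,
# list values are spliced in, and unmapped items pass through unchanged:
#     map_with({"m": ["masculine"], "f": "feminine"}, ["m", "f", "x"])
#     # -> ["masculine", "feminine", "x"]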
886TagList = list[str]
887PosPathStep = tuple[int, TagList, TagList]
890def check_unknown(
891 from_i: int,
892 to_i: int,
893 i: int,
894 wordlst: Sequence[str],
895 allow_any: bool,
896 no_unknown_starts: bool,
897) -> list[PosPathStep]:
898 """Check if the current section from_i->to_i is actually unknown
899 or if it needs some special handling. We already presupposed that
900 this is UNKNOWN; this is just called to see what *kind* of UNKNOWN."""
901 assert isinstance(to_i, int)
902 assert isinstance(from_i, int)
903 assert isinstance(i, int)
904 # Adds unknown tag if needed. Returns new last_i
905 # print("check_unknown to_i={} from_i={} i={}"
906 # .format(to_i, from_i, i))
907 if from_i >= to_i:
908 return []
909 words = wordlst[from_i:to_i]
910 tag = " ".join(words)
911 assert tag
912 # print(f"{tag=}")
913 if re.match(ignored_unknown_starts_re, tag):
914 # Tags with this start are to be ignored
915 return [(from_i, ["UNKNOWN"], [])]
916 if tag in ignored_unknown_tags: 916 ↛ 917line 916 didn't jump to line 917 because the condition on line 916 was never true
917 return [] # One of the tags listed as to be ignored
918 if tag in ("and", "or"):
919 return []
920 if (
921 not allow_any
922 and not words[0].startswith("~")
923 and (
924 no_unknown_starts
925 or words[0] not in allowed_unknown_starts
926 or len(words) <= 1
927 )
928 ):
929 # print("ERR allow_any={} words={}"
930 # .format(allow_any, words))
931 return [
932 (from_i, ["UNKNOWN"], ["error-unknown-tag"])
933 ] # Add ``tag`` here to include
934 else:
935 return [(from_i, ["UNKNOWN"], [tag])]
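# Illustrative sketch of check_unknown() (hypothetical word lists): with
# wordlst == ["foo", "bar"], check_unknown(0, 2, 2, wordlst, False, False)
# returns [(0, ["UNKNOWN"], ["error-unknown-tag"])] because "foo" is not in
# allowed_unknown_starts, whereas with wordlst == ["used", "bar"] the same
# call keeps the text verbatim: [(0, ["UNKNOWN"], ["used bar"])].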
938def add_new1(
939 node: ValidNode,
940 i: int,
941 start_i: int,
942 last_i: int,
943 new_paths: list[list[PosPathStep]],
944 new_nodes: list[tuple[ValidNode, int, int]],
945 pos_paths: list[list[list[PosPathStep]]],
946 wordlst: list[str],
947 allow_any: bool,
948 no_unknown_starts: bool,
949 max_last_i: int,
950) -> int:
951 assert isinstance(new_paths, list)
952 # print("add_new: start_i={} last_i={}".format(start_i, last_i))
953 # print("$ {} last_i={} start_i={}"
954 # .format(w, last_i, start_i))
955 max_last_i = max(max_last_i, last_i) # if last_i has grown
956 if (node, start_i, last_i) not in new_nodes:
957 new_nodes.append((node, start_i, last_i))
958 if node.end:
959 # We can see a terminal point in the search tree.
960 u = check_unknown(
961 last_i, start_i, i, wordlst, allow_any, no_unknown_starts
962 )
963 # Create new paths candidates based on different past possible
964 # paths; pos_path[last_i] contains possible paths, so add this
965 # new one at the beginning(?)
966 # The list comprehension inside the parens generates an iterable
967 # of lists, so this is .extend( [(last_i...)], [(last_i...)], ... )
968 # XXX: this is becoming impossible to annotate, nodes might
969 # need to become classed objects and not just dicts, or at least
970 # a TypedDict with a "children" node
971 new_paths.extend(
972 [(last_i, node.tags, node.topics)] + u + x
973 for x in pos_paths[last_i]
974 )
975 max_last_i = i + 1
976 return max_last_i
979@functools.lru_cache(maxsize=65536)
980def decode_tags(
981 src: str,
982 allow_any=False,
983 no_unknown_starts=False,
984) -> tuple[list[tuple[str, ...]], list[str]]:
985 tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts)
986 # print(f"decode_tags: {src=}, {tagsets=}")
988 # Insert retry-code here that modifies the text source
989 if (
990 any(s.startswith("error-") for tagset in tagsets for s in tagset)
991 # I hate Python's *nested* list comprehension syntax ^
992 or any(s.startswith("error-") for s in topics)
993 ):
994 new_tagsets: list[tuple[str, ...]] = []
995 new_topics: list[str] = []
997 if "/" in src:
998 # slashes_re contains valid key entries with slashes; we're going
999 # to skip them by splitting the string and skipping handling every
1000 # second entry, which contains the splitting group like "masculine/
1001 # feminine" style keys.
1002 split_parts = re.split(slashes_re, src)
1003 new_parts: list[str] = []
1004 if len(split_parts) > 1:
1005 for i, s in enumerate(split_parts):
1006 if i % 2 == 0:
1007 new_parts.append(s.replace("/", " "))
1008 else:
1009 new_parts.append(s)
1010 new_src = "".join(new_parts)
1011 else:
1012 new_src = src
1013 new_tagsets, new_topics = decode_tags1(
1014 new_src, allow_any, no_unknown_starts
1015 )
1016 elif " or " in src or " and " in src:
1017 # Annoying kludge.
1018 new_src = src.replace(" and ", " ")
1019 new_src = new_src.replace(" or ", " ")
1020 new_tagsets, new_topics = decode_tags1(
1021 new_src, allow_any, no_unknown_starts
1022 )
1023 # print(f"{new_tagsets=}")
1025 if new_tagsets or new_topics:
1026 old_errors = sum(
1027 1 for tagset in tagsets for s in tagset if s.startswith("error")
1028 )
1029 old_errors += sum(1 for s in topics if s.startswith("error"))
1030 new_errors = sum(
1031 1
1032 for new_tagset in new_tagsets
1033 for s in new_tagset
1034 if s.startswith("error")
1035 )
1036 new_errors += sum(1 for s in new_topics if s.startswith("error"))
1038 if new_errors <= old_errors: 1038 ↛ 1041line 1038 didn't jump to line 1041 because the condition on line 1038 was always true
1039 return new_tagsets, new_topics
1041 return tagsets, topics
1044def decode_tags1(
1045 src: str,
1046 allow_any=False,
1047 no_unknown_starts=False,
1048) -> tuple[list[tuple[str, ...]], list[str]]:
1049 """Decodes tags, doing some canonicalizations. This returns a list of
1050 lists of tags and a list of topics."""
1051 assert isinstance(src, str)
1053 # print("decode_tags: src={!r}".format(src))
1055 pos_paths: list[list[list[PosPathStep]]] = [[[]]]
1056 wordlst: list[str] = []
1057 max_last_i = 0 # pre-initialized here so that it can be used as a ref
1059 add_new = functools.partial(
1060 add_new1, # pre-set parameters and references for function
1061 pos_paths=pos_paths,
1062 wordlst=wordlst,
1063 allow_any=allow_any,
1064 no_unknown_starts=no_unknown_starts,
1065 max_last_i=max_last_i,
1066 )
1067 # First split the tags at commas and semicolons. Their significance is that
1068 # a multi-word sequence cannot continue across them.
1069 parts = split_at_comma_semi(src, extra=[";", ":"])
1071 for part in parts:
1072 max_last_i = len(wordlst) # "how far have we gone?"
1073 lst1 = part.split()
1074 if not lst1:
1075 continue
1076 wordlst.extend(lst1)
1077 cur_nodes: list[tuple[ValidNode, int, int]] = [] # Currently seen
1078 for w in lst1:
1079 i = len(pos_paths) - 1
1080 new_nodes: list[tuple[ValidNode, int, int]] = []
1081 # replacement nodes for next loop
1082 new_paths: list[list[PosPathStep]] = []
1083 # print("ITER i={} w={} max_last_i={} wordlst={}"
1084 # .format(i, w, max_last_i, wordlst))
1085 node: ValidNode
1086 start_i: int
1087 last_i: int
1088 for node, start_i, last_i in cur_nodes:
1089 # ValidNodes are part of a search tree that checks if a
1090 # phrase is found in xlat_tags_map and other text->tags dicts.
1091 if w in node.children:
1092 # the phrase continues down the tree
1093 # print("INC", w)
1094 max_last_i = add_new(
1095 node.children[w],
1096 i,
1097 start_i,
1098 last_i,
1099 new_paths,
1100 new_nodes,
1101 )
1102 if node.end:
1103 # we've hit an end point, the tags and topics have already
1104 # been gathered at some point, don't do anything with the
1105 # old stuff
1106 if w in valid_sequences.children:
1107 # This starts a *new* possible section
1108 max_last_i = add_new(
1109 valid_sequences.children[w], # root->
1110 i,
1111 i,
1112 i,
1113 new_paths,
1114 new_nodes,
1115 )
1116 if w not in node.children and not node.end:
1117 # print("w not in node and $: i={} last_i={} wordlst={}"
1118 # .format(i, last_i, wordlst))
1119 # If i == last_i == 0, for example (beginning)
1120 if (
1121 i == last_i
1122 or no_unknown_starts
1123 or wordlst[last_i] not in allowed_unknown_starts
1124 ):
1125 # print("NEW", w)
1126 if w in valid_sequences.children:
1127 # Start new sequences here
1128 max_last_i = add_new(
1129 valid_sequences.children[w],
1130 i,
1131 i,
1132 last_i,
1133 new_paths,
1134 new_nodes,
1135 )
1136 if not new_nodes:
1137 # This is run at the start when i == max_last_i == 0,
1138 # which is what populates the first node in new_nodes.
1139 # Some initial words cause the rest to be interpreted as unknown
1140 # print("not new nodes: i={} last_i={} wordlst={}"
1141 # .format(i, max_last_i, wordlst))
1142 if (
1143 i == max_last_i
1144 or no_unknown_starts
1145 or wordlst[max_last_i] not in allowed_unknown_starts
1146 ):
1147 # print("RECOVER w={} i={} max_last_i={} wordlst={}"
1148 # .format(w, i, max_last_i, wordlst))
1149 if w in valid_sequences.children:
1150 max_last_i = add_new(
1151 # new sequence from root
1152 valid_sequences.children[w],
1153 i,
1154 i,
1155 max_last_i,
1156 new_paths,
1157 new_nodes,
1158 )
1159 cur_nodes = new_nodes # Completely replace nodes!
1160 # 2023-08-18, fix to improve performance
1161 # Decode tags does a big search of the best-shortest matching
1162 # sequences of tags, but the original algorithm didn't have
1163 # any culling happen during operation, so in a case with
1164 # a lot of tags (for example, big blocks of text inserted
1165 # somewhere by mistake that is processed by decode_tags),
1166 # it would lead to exponential growth of new_paths contents.
1167 # This culling, using the same weighting algorithm code as
1168 # in the original, is just applied to new_paths before it is
1169 # added to pos_paths. Basically it's "take the 10 best paths".
1170 # This *can* cause bugs if it gets stuck in a local minimum
1171 # or something, but this whole process is one-dimensional
1172 # and not that complex, so hopefully it works out...
1173 pw = []
1174 path: list[PosPathStep]
1175 for path in new_paths:
1176 weight = len(path)
1177 if any(x[1] == ["UNKNOWN"] for x in path):
1178 weight += 100 # Penalize unknown paths
1179 pw.append((weight, path))
1180 new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]]
1181 pos_paths.append(new_paths)
1183 # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}"
1184 # .format(max_last_i, len(wordlst), len(pos_paths)))
1186 if cur_nodes:
1187 # print("END HAVE_NODES")
1188 for node, start_i, last_i in cur_nodes:
1189 if node.end:
1190 # print("$ END start_i={} last_i={}"
1191 # .format(start_i, last_i))
1192 for path in pos_paths[start_i]:
1193 pos_paths[-1].append(
1194 [(last_i, node.tags, node.topics)] + path
1195 )
1196 else:
1197 # print("UNK END start_i={} last_i={} wordlst={}"
1198 # .format(start_i, last_i, wordlst))
1199 u = check_unknown(
1200 last_i,
1201 len(wordlst),
1202 len(wordlst),
1203 wordlst,
1204 allow_any,
1205 no_unknown_starts,
1206 )
1207 if pos_paths[start_i]:
1208 for path in pos_paths[start_i]:
1209 pos_paths[-1].append(u + path)
1210 else:
1211 pos_paths[-1].append(u)
1212 else:
1213 # Check for a final unknown tag
1214 # print("NO END NODES max_last_i={}".format(max_last_i))
1215 paths = pos_paths[max_last_i] or [[]]
1216 u = check_unknown(
1217 max_last_i,
1218 len(wordlst),
1219 len(wordlst),
1220 wordlst,
1221 allow_any,
1222 no_unknown_starts,
1223 )
1224 if u:
1225 # print("end max_last_i={}".format(max_last_i))
1226 for path in list(paths): # Copy in case it is the last pos
1227 pos_paths[-1].append(u + path)
1229 # import json
1230 # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True))
1232 if not pos_paths[-1]:
1233 # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src))
1234 return [], []
1236 # Find the best path
1237 pw = []
1238 for path in pos_paths[-1]:
1239 weight = len(path)
1240 if any(x[1] == ["UNKNOWN"] for x in path):
1241 weight += 100 # Penalize unknown paths
1242 pw.append((weight, path))
1243 path = min(pw)[1]
1245 # Convert the best path to tagsets and topics
1246 tagsets: list[list[str]] = [[]]
1247 topics: list[str] = []
1248 for i, tagspec, topicspec in path:
1249 if len(tagsets or "") > 16:
1250 # ctx.error("Too many tagsets! This is probably exponential",
1251 # sortid="form_descriptions/20230818")
1252 return [("error-unknown-tag", "error-exponential-tagsets")], []
1253 if tagspec == ["UNKNOWN"]:
1254 new_tagsets = []
1255 for x in tagsets:
1256 new_tagsets.append(x + topicspec)
1257 tagsets = new_tagsets
1258 continue
1259 if tagspec:
1260 new_tagsets = []
1261 for x in tagsets:
1262 for t in tagspec:
1263 if t: 1263 ↛ 1270line 1263 didn't jump to line 1270 because the condition on line 1263 was always true
1264 new_tags = list(x)
1265 for tag in t.split():
1266 if tag not in new_tags:
1267 new_tags.append(tag)
1268 new_tagsets.append(new_tags)
1269 else:
1270 new_tagsets.append(x)
1271 tagsets = new_tagsets
1272 if topicspec:
1273 for t in topicspec:
1274 for topic in t.split():
1275 if topic not in topics:
1276 topics.append(topic)
1278 # print("unsorted tagsets:", tagsets)
1279 ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets))
1280 # topics = list(sorted(set(topics))) XXX tests expect not sorted
1281 # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics))
1282 # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST
1283 # of tags. Turning topics into a tuple breaks tests, turning the tuples
1284 # inside tagsets into lists breaks tests, I'm leaving them mismatched
1285 # for now. XXX
1286 return ret_tagsets, topics
1289def parse_head_final_tags(
1290 wxr: WiktextractContext, lang: str, form: str
1291) -> tuple[str, list[str]]:
1292 """Parses tags that are allowed at the end of a form head from the end
1293 of the form. This can also be used for parsing the final gender etc tags
1294 from translations and linkages."""
1295 assert isinstance(wxr, WiktextractContext)
1296 assert isinstance(lang, str) # Should be language that "form" is for
1297 assert isinstance(form, str)
1299 # print("parse_head_final_tags: lang={} form={!r}".format(lang, form))
1301 # Make sure there are no double spaces in the form as this code does not
1302 # handle them otherwise.
1303 form = re.sub(r"\s+", " ", form.strip())
1304 if not form:
1305 return form, []
1307 origform = form
1309 tags = []
1311 # If parsing for certain Bantu languages (e.g., Swahili), handle
1312 # some extra head-final tags first
1313 if lang in head_final_bantu_langs:
1314 m = re.search(head_final_bantu_re, form)
1315 if m is not None:
1316 tagkeys = m.group(1)
1317 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr] 1317 ↛ 1332line 1317 didn't jump to line 1332 because the condition on line 1317 was always true
1318 form = form[: m.start()]
1319 v = head_final_bantu_map[tagkeys]
1320 if v.startswith("?"): 1320 ↛ 1321line 1320 didn't jump to line 1321 because the condition on line 1320 was never true
1321 v = v[1:]
1322 wxr.wtp.debug(
1323 "suspicious suffix {!r} in language {}: {}".format(
1324 tagkeys, lang, origform
1325 ),
1326 sortid="form_descriptions/1028",
1327 )
1328 tags.extend(v.split())
1330 # If parsing for certain Semitic languages (e.g., Arabic), handle
1331 # some extra head-final tags first
1332 if lang in head_final_semitic_langs:
1333 m = re.search(head_final_semitic_re, form)
1334 if m is not None:
1335 tagkeys = m.group(1)
1336 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr] 1336 ↛ 1351line 1336 didn't jump to line 1351 because the condition on line 1336 was always true
1337 form = form[: m.start()]
1338 v = head_final_semitic_map[tagkeys]
1339 if v.startswith("?"): 1339 ↛ 1340line 1339 didn't jump to line 1340 because the condition on line 1339 was never true
1340 v = v[1:]
1341 wxr.wtp.debug(
1342 "suspicious suffix {!r} in language {}: {}".format(
1343 tagkeys, lang, origform
1344 ),
1345 sortid="form_descriptions/1043",
1346 )
1347 tags.extend(v.split())
1349 # If parsing for certain other languages (e.g., Lithuanian,
1350 # French, Finnish), handle some extra head-final tags first
1351 if lang in head_final_other_langs:
1352 m = re.search(head_final_other_re, form)
1353 if m is not None:
1354 tagkeys = m.group(1)
1355 if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr] 1355 ↛ 1360line 1355 didn't jump to line 1360 because the condition on line 1355 was always true
1356 form = form[: m.start()]
1357 tags.extend(head_final_other_map[tagkeys].split(" "))
1359 # Handle normal head-final tags
1360 m = re.search(head_final_re, form)
1361 if m is not None:
1362 tagkeys = m.group(3)
1363 # Only replace tags ending with numbers in languages that have
1364 # head-final numeric tags (e.g., Bantu classes); also, don't replace
1365 # tags if the main title ends with them (then presume they are part
1366 # of the word)
1367 # print("head_final_tags form={!r} tagkeys={!r} lang={}"
1368 # .format(form, tagkeys, lang))
1369 tagkeys_contains_digit = re.search(r"\d", tagkeys)
1370 if (
1371 (not tagkeys_contains_digit or lang in head_final_numeric_langs)
1372 and not wxr.wtp.title.endswith(" " + tagkeys) # type:ignore[union-attr]
1373 and
1374 # XXX the above test does not capture when the whole word is a
1375 # xlat_head_map key, so I added the below test to complement
1376 # it; does this break anything?
1377 not wxr.wtp.title == tagkeys
1378 ): # defunct/English,
1379 # "more defunct" -> "more" ["archaic"]
1380 if not tagkeys_contains_digit or lang in head_final_numeric_langs: 1380 ↛ 1394line 1380 didn't jump to line 1394 because the condition on line 1380 was always true
1381 form = form[: m.start()]
1382 v = xlat_head_map[tagkeys]
1383 if v.startswith("?"): 1383 ↛ 1384line 1383 didn't jump to line 1384 because the condition on line 1383 was never true
1384 v = v[1:]
1385 wxr.wtp.debug(
1386 "suspicious suffix {!r} in language {}: {}".format(
1387 tagkeys, lang, origform
1388 ),
1389 sortid="form_descriptions/1077",
1390 )
1391 tags.extend(v.split())
1393 # Generate warnings about words ending in " or" after processing
1394 if (
1395 (form.endswith(" or") and not origform.endswith(" or"))
1396 or re.search(
1397 r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|"
1398 r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)"
1399 r"($|/| (f|m|sg|pl|anim|inan))",
1400 form,
1401 )
1402 or form.endswith(" du")
1403 ):
1404 if form not in ok_suspicious_forms:
1405 wxr.wtp.debug(
1406 "suspicious unhandled suffix in {}: {!r}, originally {!r}".format(
1407 lang, form, origform
1408 ),
1409 sortid="form_descriptions/1089",
1410 )
1412 # print("parse_head_final_tags: form={!r} tags={}".format(form, tags))
1413 return form, tags
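# Illustrative sketch (hypothetical call; assumes xlat_head_map maps "m" to
# "masculine" and that the page title is plain "grande"):
#     parse_head_final_tags(wxr, "Spanish", "grande m")
#     # -> ("grande", ["masculine"])
# i.e. the trailing gender marker is stripped from the form and returned as
# a head-final tag.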
1416def quote_kept_parens(s: str) -> str:
1417 """Changes certain parenthesized expressions so that they won't be
1418 interpreted as parentheses. This is used for parts that are kept as
1419 part of the word, such as "rear admiral (upper half)"."""
1420 return re.sub(
1421 r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|"
1422 r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)",
1423 r"__lpar__\1__rpar__",
1424 s,
1425 )
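# Illustrative sketch: the parenthesized qualifier is protected so that
# remove_text_in_parentheses() in parse_word_head() below will not strip it:
#     quote_kept_parens("rear admiral (upper half)")
#     # -> "rear admiral __lpar__upper half__rpar__"
# unquote_kept_parens() further below restores the original parentheses.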
1428def quote_kept_ruby(
1429 wxr: WiktextractContext,
1430 ruby_tuples: list[
1431 tuple[
1432 str,
1433 str,
1434 ]
1435 ],
1436 s: str,
1437) -> str:
1438 if len(ruby_tuples) < 1:
1439 wxr.wtp.debug(
1440 "quote_kept_ruby called with no ruby",
1441 sortid="form_description/1114/20230517",
1442 )
1443 return s
1444 ks = []
1445 rs = []
1446 for k, r in ruby_tuples:
1447 ks.append(re.escape(k))
1448 rs.append(re.escape(r))
1449 if not (ks and rs):
1450 wxr.wtp.debug(
1451 f"empty column in ruby_tuples: {ruby_tuples}",
1452 sortid="form_description/1124/20230606",
1453 )
1454 return s
1455 newm = re.compile(
1456 r"({})\s*\(\s*({})\s*\)".format("|".join(ks), "|".join(rs))
1457 )
1458 rub_re = re.compile(
1459 r"({})".format(
1460 r"|".join(
1461 r"{}\(*{}\)*".format(
1462 re.escape(k),
1463 re.escape(r),
1464 )
1465 for k, r in ruby_tuples
1466 )
1467 )
1468 )
1470 def paren_replace(m: re.Match) -> str:
1471 return re.sub(newm, r"\1__lrub__\2__rrub__", m.group(0))
1473 return re.sub(rub_re, paren_replace, s)
1476def unquote_kept_parens(s: str) -> str:
1477 """Conerts the quoted parentheses back to normal parentheses."""
1478 return re.sub(r"__lpar__(.*?)__rpar__", r"(\1)", s)
1481def add_romanization(
1482 wxr: WiktextractContext,
1483 data: WordData,
1484 roman: str,
1485 text: str,
1486 is_reconstruction: bool,
1487 head_group: Optional[int],
1488 ruby: Sequence[tuple[str, str]],
1489) -> None:
1490 tags_lst = ["romanization"]
1491 m = re.match(r"([^:]+):(.+)", roman)
1492 # This function's purpose is to intercept broken romanizations,
1493 # like "Yale: hēnpyeng" style tags. Most romanization styles
1494 # are already present as tags, so we can use decode_tags to find
1495 # them.
1496 if m: 1496 ↛ 1497line 1496 didn't jump to line 1497 because the condition on line 1496 was never true
1497 tagsets, topics = decode_tags(m.group(1))
1498 if tagsets:
1499 for tags in tagsets:
1500 tags_lst.extend(tags)
1501 roman = m.group(2)
1502 add_related(
1503 wxr,
1504 data,
1505 tags_lst,
1506 [roman],
1507 text,
1508 True,
1509 is_reconstruction,
1510 head_group,
1511 ruby,
1512 )
1515def add_related(
1516 wxr: WiktextractContext,
1517 data: WordData,
1518 tags_lst: Union[list[str], tuple[str, ...]],
1519 related_list: list[str],
1520 origtext: str,
1521 add_all_canonicals: bool,
1522 is_reconstruction: bool,
1523 head_group: Optional[int],
1524 ruby_data: Optional[Sequence[tuple[str, str]]] = None,
1525) -> Optional[list[tuple[str, ...]]]:
1526 """Internal helper function for some post-processing entries for related
1527 forms (e.g., in word head). This returns a list of lists of tags to be
1528 added to following related forms or None (cf. walrus/English word head,
1529 parenthesized part starting with "both")."""
1530 assert isinstance(wxr, WiktextractContext)
1531 assert isinstance(tags_lst, (list, tuple))
1532 for x in tags_lst:
1533 assert isinstance(x, str)
1534 assert isinstance(related_list, (list, tuple))
1535 assert isinstance(origtext, str)
1536 assert add_all_canonicals in (True, False)
1537 assert isinstance(ruby_data, (list, tuple)) or ruby_data is None
1538 if ruby_data is None: 1538 ↛ 1539line 1538 didn't jump to line 1539 because the condition on line 1538 was never true
1539 ruby_data = []
1540 related = " ".join(related_list)
1541 # print("add_related: tags_lst={} related={}".format(tags_lst, related))
1542 if related == "[please provide]": 1542 ↛ 1543line 1542 didn't jump to line 1543 because the condition on line 1542 was never true
1543 return None
1544 if related in IGNORED_RELATED: 1544 ↛ 1545line 1544 didn't jump to line 1545 because the condition on line 1544 was never true
1545 return None
1546 if is_reconstruction and related.startswith("*") and len(related) > 1:
1547 related = related[1:]
1549 # Get title word, with any reconstruction prefix removed
1550 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title) # type:ignore[arg-type]
1552 def check_related(related: str) -> None:
1553 # Warn about some suspicious related forms
1554 m = re.search(suspicious_related_re, related)
1555 if (m and m.group(0) not in titleword) or (
1556 related in ("f", "m", "n", "c") and len(titleword) >= 3
1557 ):
1558 if "eumhun" in tags_lst: 1558 ↛ 1559line 1558 didn't jump to line 1559 because the condition on line 1558 was never true
1559 return
1560 if "cangjie-input" in tags_lst: 1560 ↛ 1561line 1560 didn't jump to line 1561 because the condition on line 1560 was never true
1561 return
1562 if "class" in tags_lst: 1562 ↛ 1563line 1562 didn't jump to line 1563 because the condition on line 1562 was never true
1563 return
1564 if wxr.wtp.section == "Korean" and re.search( 1564 ↛ 1568line 1564 didn't jump to line 1568 because the condition on line 1564 was never true
1565 r"^\s*\w*>\w*\s*$", related
1566 ):
1567 # ignore Korean "i>ni" / "라>나" values
1568 return
1569 if ( 1569 ↛ 1576line 1569 didn't jump to line 1576 because the condition on line 1569 was never true
1570 wxr.wtp.section == "Burmese"
1571 and "romanization" in tags_lst
1572 and re.search(r":", related)
1573 ):
1574 # ignore Burmese with ":", which is used in the Burmese
1575 # transliteration of "း", the high-tone visarga.
1576 return
1577 wxr.wtp.debug(
1578 "suspicious related form tags {}: {!r} in {!r}".format(
1579 tags_lst, related, origtext
1580 ),
1581 sortid="form_descriptions/1147",
1582 )
1584 following_tagsets = None # Tagsets to add to following related forms
1585 roman = None
1586 tagsets1: list[tuple[str, ...]] = [tuple()]
1587 topics1: list[str] = []
1589 m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related)
1590 if m:
1591 paren = m.group(1)
1592 related = related[m.end() :]
1593 m = re.match(r"^(all|both) (.*)", paren)
1594 if m: 1594 ↛ 1595line 1594 didn't jump to line 1595 because the condition on line 1594 was never true
1595 tagsets1, topics1 = decode_tags(m.group(2))
1596 following_tagsets = tagsets1
1597 else:
1598 tagsets1, topics1 = decode_tags(paren)
1599 else:
1600 m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related)
1601 if m:
1602 paren = m.group(1)
1603 if paren.startswith("U+"): 1603 ↛ 1604line 1603 didn't jump to line 1604 because the condition on line 1603 was never true
1604 related = related[: m.start()]
1605 else:
1606 cls = classify_desc(paren)
1607 if ( 1607 ↛ 1614line 1607 didn't jump to line 1614 because the condition on line 1607 was always true
1608 cls in ("romanization", "english")
1609 and classify_desc(related[: m.start()]) == "other"
1610 ):
1611 roman = paren
1612 related = related[: m.start()]
1613 else:
1614 related = related[: m.start()]
1615 tagsets1, topics1 = decode_tags(paren)
1616 if related and related.startswith("{{"): 1616 ↛ 1617line 1616 didn't jump to line 1617 because the condition on line 1616 was never true
1617 wxr.wtp.debug(
1618 "{{ in word head form - possible Wiktionary error: {!r}".format(
1619 related
1620 ),
1621 sortid="form_descriptions/1177",
1622 )
1623 return None # Likely Wiktionary coding error
1624 related = unquote_kept_parens(related)
1625 # Split related by "/" (e.g., the superlative in the head of grande/Spanish)
1626 # Do not split if / in word title, see π//Japanese
1627 if len(related) > 5 and "/" not in wxr.wtp.title: # type:ignore[operator]
1628 alts = split_at_comma_semi(related, separators=["/"])
1629 else:
1630 alts = [related]
1631 if ruby_data: 1631 ↛ 1633line 1631 didn't jump to line 1633 because the condition on line 1631 was never true
1632 # prepare some regex stuff in advance
1633 ks, rs = [], []
1634 for k, r in ruby_data:
1635 ks.append(re.escape(k))
1636 rs.append(re.escape(r))
1637 splitter = r"((?:{})__lrub__(?:{})__rrub__)".format(
1638 "|".join(ks), "|".join(rs)
1639 )
1640 for related in alts:
1641 ruby: list[tuple[str, str]] = []
1642 if ruby_data: 1642 ↛ 1643line 1642 didn't jump to line 1643 because the condition on line 1642 was never true
1643 new_related = []
1644 rub_split = re.split(splitter, related)
1645 for s in rub_split:
1646 m = re.match(r"(.+)__lrub__(.+)__rrub__", s)
1647 if m:
1648 # add ruby with (\1, \2)
1649 ruby.append((m.group(1), m.group(2)))
1650 new_related.append(m.group(1))
1651 else:
1652 new_related.append(s)
1653 related = "".join(new_related)
1654 tagsets2, topics2 = decode_tags(" ".join(tags_lst))
1655 for tags1 in tagsets1:
1656 assert isinstance(tags1, (list, tuple))
1657 for tags2 in tagsets2:
1658 assert isinstance(tags1, (list, tuple))
1659 dt: LinkageData = {"word": related}
1660 if roman:
1661 dt["roman"] = roman
1662 if ruby: 1662 ↛ 1663line 1662 didn't jump to line 1663 because the condition on line 1662 was never true
1663 dt["ruby"] = ruby
1664 if "alt-of" in tags2: 1664 ↛ 1665line 1664 didn't jump to line 1665 because the condition on line 1664 was never true
1665 check_related(related)
1666 data_extend(data, "tags", tags1)
1667 data_extend(data, "tags", tags2)
1668 data_extend(data, "topics", topics1)
1669 data_extend(data, "topics", topics2)
1670 data_append(data, "alt_of", dt)
1671 elif "form-of" in tags2: 1671 ↛ 1672line 1671 didn't jump to line 1672 because the condition on line 1671 was never true
1672 check_related(related)
1673 data_extend(data, "tags", tags1)
1674 data_extend(data, "tags", tags2)
1675 data_extend(data, "topics", topics1)
1676 data_extend(data, "topics", topics2)
1677 data_append(data, "form_of", dt)
1678 elif "compound-of" in tags2: 1678 ↛ 1679line 1678 didn't jump to line 1679 because the condition on line 1678 was never true
1679 check_related(related)
1680 data_extend(data, "tags", tags1)
1681 data_extend(data, "tags", tags2)
1682 data_extend(data, "topics", topics1)
1683 data_extend(data, "topics", topics2)
1684 data_append(data, "compound", related)
1685 else:
1686 lang = wxr.wtp.section or "LANG_MISSING"
1687 related, final_tags = parse_head_final_tags(
1688 wxr, lang, related
1689 )
1690 # print("add_related: related={!r} tags1={!r} tags2={!r} "
1691 # "final_tags={!r}"
1692 # .format(related, tags1, tags2, final_tags))
1693 tags = list(tags1) + list(tags2) + list(final_tags)
1694 check_related(related)
1695 form: FormData = {"form": related}
1696 if head_group:
1697 form["head_nr"] = head_group
1698 if roman:
1699 form["roman"] = roman
1700 if ruby: 1700 ↛ 1701line 1700 didn't jump to line 1701 because the condition on line 1700 was never true
1701 form["ruby"] = ruby
1702 data_extend(form, "topics", topics1)
1703 data_extend(form, "topics", topics2)
1704 if topics1 or topics2: 1704 ↛ 1705line 1704 didn't jump to line 1705 because the condition on line 1704 was never true
1705 wxr.wtp.debug(
1706 "word head form has topics: {}".format(form),
1707 sortid="form_descriptions/1233",
1708 )
1709 # Add tags from canonical form into the main entry
1710 if "canonical" in tags:
1711 if related in ("m", "f") and len(titleword) > 1: 1711 ↛ 1712line 1711 didn't jump to line 1712 because the condition on line 1711 was never true
1712 wxr.wtp.debug(
1713 "probably incorrect canonical form "
1714 "{!r} ignored (probably tag combination "
1715 "missing from xlat_head_map)".format(related),
1716 sortid="form_descriptions/1241",
1717 )
1718 continue
1719 if (
1720 related != titleword
1721 or add_all_canonicals
1722 or topics1
1723 or topics2
1724 or ruby
1725 ):
1726 data_extend(form, "tags", sorted(set(tags)))
1727 else:
1728 # We won't add canonical form here
1729 filtered_tags = list(
1730 x for x in tags if x != "canonical"
1731 )
1732 data_extend(data, "tags", filtered_tags)
1733 continue
1734 else:
1735 data_extend(form, "tags", sorted(set(tags)))
1736 # Only insert if the form is not already there
1737 for old in data.get("forms", ()):
1738 if form == old: 1738 ↛ 1739line 1738 didn't jump to line 1739 because the condition on line 1738 was never true
1739 break
1740 else:
1741 data_append(data, "forms", form)
1743 # If this form had pre-tags that started with "both" or "all", add those
1744 # tags also to following related forms that don't have their own tags
1745 # specified.
1746 return following_tagsets
1749# Issue #967: in English, word forms are sometimes skipped because they are
1750# taggable words and their distw() is too big, like "clipping" from "clip".
1751WORDS_WITH_FALSE_POSITIVE_TAGS: dict[str, list[str]] = {
1752 "clip": ["clipping"], # XXX remember to change me back to clipping after
1753 "English": ["English", "Englishes"],
1754 "common": ["common", "commoner"],
1755 # tests.
1756}
1758WORDS_WITH_FALSE_POSITIVE_FORMS: dict[str, list[str]] = {
1759 "unaccountability": ["countable", "uncountable"],
1760 "uncountability": ["countable", "uncountable"],
1761}
1763FALSE_POSITIVE_MISSING_FORMS: dict[str, list[str]] = {}
1765FORM_ASSOCIATED_TAG_WORDS: set[str] = {
1766 "participle",
1767 "past",
1768 "present",
1769 "singular",
1770 "plural",
1771 "first-person",
1772 "second-person",
1773 "third-person",
1774 "gerund",
1775}
1778def parse_word_head(
1779 wxr: WiktextractContext,
1780 pos: str,
1781 text: str,
1782 data: WordData,
1783 is_reconstruction: bool,
1784 head_group: Optional[int],
1785 ruby=None,
1786 links=None,
1787) -> None:
1788 """Parses the head line for a word for in a particular language and
1789 part-of-speech, extracting tags and related forms."""
1790 assert isinstance(wxr, WiktextractContext)
1791 assert isinstance(pos, str)
1792 assert isinstance(text, str)
1793 assert isinstance(data, dict)
1794 assert isinstance(ruby, (list, tuple)) or ruby is None
1795 if ruby is None:
1796 ruby = []
1797 assert is_reconstruction in (True, False)
1798 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text))
1799 # print(f"PARSE_WORD_HEAD: {data=}")
1800 if links is None:
1801 links = []
1803 if len(links) > 0:
1804 # if we have link data (that is, links with stuff like commas and
1805 # spaces), replace word_re with a modified local-scope pattern
1806 # print(f"links {list((c, ord(c)) for link in links for c in link)=}")
1807 word_re = re.compile(
1808 r"\b" # In case we have forms that are longer and contain links
1809 +
1810 # or words as a substring...
1811 r"\b|\b".join(
1812 sorted((re.escape(s) for s in links), key=lambda x: -len(x))
1813 )
1814 + r"\b|"
1815 + word_pattern
1816 )
1817 else:
1818 word_re = word_re_global
1820 if "Lua execution error" in text or "Lua timeout error" in text: 1820 ↛ 1821line 1820 didn't jump to line 1821 because the condition on line 1820 was never true
1821 return
1823 # In Aug 2021, some words had spurious Template:en at the end of head forms
1824 # due to a Wiktionary error.
1825 text = re.sub(r"\s+Template:[-a-zA-Z]+\s*$", "", text)
1827 # Fix words with "superlative:" or "comparative:" at end of head
1828 # e.g. grande/Spanish/Adj
1829 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text)
1831 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb
1832 m = re.search(r", non-past ([^)]+ \([^)]+\))", text)
1833 if m:
1834 add_related(
1835 wxr,
1836 data,
1837 ["non-past"],
1838 [m.group(1)],
1839 text,
1840 True,
1841 is_reconstruction,
1842 head_group,
1843 ruby,
1844 )
1845 text = text[: m.start()] + text[m.end() :]
1847 language = wxr.wtp.section
1848 titleword = re.sub(
1849 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE"
1850 )
1851 titleparts = list(
1852 m.group(0)
1853 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE")
1854 )
1855 if not titleparts: 1855 ↛ 1856line 1855 didn't jump to line 1856 because the condition on line 1855 was never true
1856 return
1858 # Remove " or" from the end to prevent weird canonical forms
1859 if text.endswith(" or"):
1860 for tp in titleparts:
1861 if text.endswith(tp): 1861 ↛ 1862line 1861 didn't jump to line 1862 because the condition on line 1861 was never true
1862 break
1863 else:
1864 text = text.removesuffix(" or").rstrip()
1866 # Handle the part of the head that is not in parentheses. However, certain
1867 # parenthesized parts are part of the word, and those must be handled
1868 # specially here.
1869 if ruby: 1869 ↛ 1870line 1869 didn't jump to line 1870 because the condition on line 1869 was never true
1870 text = quote_kept_ruby(wxr, ruby, text)
1871 base = text
1872 base = quote_kept_parens(base)
1873 base = remove_text_in_parentheses(base)
1874 base = base.replace("?", "") # Removes uncertain articles etc
1875 base = re.sub(r"\s+", " ", base)
1876 base = re.sub(r" ([,;])", r"\1", base)
1877 base = re.sub(r" • ", r" ", base)
1878 # Many languages use • as a punctuation mark separating the base
1879 # from the rest of the head. στάδιος/Ancient Greek, issue #176
1880 base = base.strip()
1882 # Check for certain endings in head (mostly for compatibility with weird
1883 # heads, e.g. rata/Romanian "1st conj." at end)
1884 m = re.search(head_end_re, base)
1885 tags: Union[tuple[str, ...], list[str]] = []
1886 if m: 1886 ↛ 1887line 1886 didn't jump to line 1887 because the condition on line 1886 was never true
1887 tags = head_end_map[m.group(1).lower()].split()
1888 data_extend(data, "tags", tags)
1889 base = base[: m.start()]
1891 # Special case: handle Hán Nôm readings for Vietnamese characters
1892 m = re.match(
1893 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base
1894 )
1895 if m: 1895 ↛ 1896line 1895 didn't jump to line 1896 because the condition on line 1895 was never true
1896 tag, readings = m.groups()
1897 tag = re.sub(r"\s+", "-", tag)
1898 for reading in split_at_comma_semi(readings, skipped=links):
1899 add_related(
1900 wxr,
1901 data,
1902 [tag],
1903 [reading],
1904 text,
1905 True,
1906 is_reconstruction,
1907 head_group,
1908 ruby,
1909 )
1910 return
1912 # Special case: Hebrew " [pattern: nnn]" ending
1913 m = re.search(r"\s+\[pattern: ([^]]+)\]", base)
1914 if m: 1914 ↛ 1915line 1914 didn't jump to line 1915 because the condition on line 1914 was never true
1915 add_related(
1916 wxr,
1917 data,
1918 ["class"],
1919 [m.group(1)],
1920 text,
1921 True,
1922 is_reconstruction,
1923 head_group,
1924 ruby,
1925 )
1926 base = base[: m.start()] + base[m.end() :]
1928 # Clean away some messy "Upload an image" template text used in
1929 # American Sign Language:
1930 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp
1931 m = re.search(r"Upload .+ gif image.", base)
1932 if m: 1932 ↛ 1933line 1932 didn't jump to line 1933 because the condition on line 1932 was never true
1933 base = base[: m.start()] + base[m.end() :]
1935 # Split the head into alternatives. This is a complicated task, as
1936 # we do not want to split on "or" or "," when immediately followed by more
1937 # head-final tags, but otherwise do want to split by them.
1938 # 20230907 added "or" to this to handle 'true or false', titles with 'or'
1939 if wxr.wtp.title and ("," in wxr.wtp.title or " or " in wxr.wtp.title):
1940 # A kludge to handle article titles/phrases with commas.
1941 # Preprocess splits to first capture the title, then handle
1942 # all the others as usual.
1943 presplits = re.split(r"({})".format(wxr.wtp.title), base)
1944 splits = []
1945 for psplit in presplits:
1946 if psplit == wxr.wtp.title:
1947 splits.append(psplit)
1948 else:
1949 splits.extend(re.split(head_split_re, psplit))
1950 else:
1951 # Do the normal split; previously the only behavior.
1952 splits = re.split(head_split_re, base)
1953 # print("SPLITS:", splits)
1954 alts: list[str] = []
1955 # print("parse_word_head: splits:", splits,
1956 # "head_split_re_parens:", head_split_re_parens)
1957 for i in range(
1958 0, len(splits) - head_split_re_parens, head_split_re_parens + 1
1959 ):
1960 v = splits[i]
1961 ending = splits[i + 1] or "" # XXX is this correct???
1962 # print("parse_word_head alts v={!r} ending={!r} alts={}"
1963 # .format(v, ending, alts))
1964 if alts and (v == "" and ending):
1965 assert ending[0] == " "
1966 alts[-1] += " or" + ending # endings starts with space
1967 elif v or ending: 1967 ↛ 1957line 1967 didn't jump to line 1957 because the condition on line 1967 was always true
1968 alts.append((v or "") + (ending or ""))
1969 last = splits[-1].strip()
1970 conn = "" if len(splits) < 3 else splits[-2]
1971 # print("parse_word_head alts last={!r} conn={!r} alts={}"
1972 # .format(last, conn, alts))
1973 if (
1974 alts
1975 and last
1976 and (
1977 last.split()[0] in xlat_head_map
1978 or (
1979 conn == " or "
1980 and (alts[-1] + " or " + last).strip() in xlat_head_map
1981 )
1982 )
1983 ):
1984 alts[-1] += " or " + last
1985 elif last:
1986 alts.append(last)
1988 # print("parse_word_head alts: {}".format(alts))
1989 # print(f"{base=}")
1991 # Process the head alternatives
1992 canonicals: list[tuple[list[str], list[str]]] = []
1993 mode: Optional[str] = None
1994 for alt_i, alt in enumerate(alts):
1995 alt = alt.strip()
1996 if alt.startswith("compound form:"): 1996 ↛ 1997line 1996 didn't jump to line 1997 because the condition on line 1996 was never true
1997 mode = "compound-form"
1998 alt = alt[14:].strip()
1999 if mode == "compound-form": 1999 ↛ 2000line 1999 didn't jump to line 2000 because the condition on line 1999 was never true
2000 add_related(
2001 wxr,
2002 data,
2003 ["in-compounds"],
2004 [alt],
2005 text,
2006 True,
2007 is_reconstruction,
2008 head_group,
2009 ruby,
2010 )
2011 continue
2012 # For non-first parts, see if it can be treated as tags-only
2013 if alt_i == 0:
2014 expanded_alts = [alt]
2015 else:
2016 expanded_alts = map_with(xlat_descs_map, [alt])
2017 # print("EXPANDED_ALTS:", expanded_alts)
2018 tagsets: Optional[list[tuple[str, ...]]]
2019 for alt in expanded_alts:
2020 baseparts = list(m.group(0) for m in word_re.finditer(alt))
2021 if alt_i > 0:
2022 tagsets, topics = decode_tags(" ".join(baseparts))
2023 if not any("error-unknown-tag" in x for x in tagsets):
2024 data_extend(data, "topics", topics)
2025 for tags1 in tagsets:
2026 data_extend(data, "tags", tags1)
2027 continue
2029 alt, tags = parse_head_final_tags(
2030 wxr, language or "MISSING_LANG", alt
2031 )
2032 tags = list(tags) # Make sure we don't modify anything cached
2033 tags.append("canonical")
2034 if alt_i == 0 and "," in wxr.wtp.title: # type:ignore[operator]
2035 # Kludge to handle article titles/phrases with commas.
2036 # basepart's regex strips commas, which leads to a
2037 # canonical form that is the title phrase without a comma.
2038 # basepart in add_related is almost immediately joined with
2039 # spaces anyhow. XXX not exactly sure why it's
2040 # canonicals.append((tags, baseparts)) and not (tags, [alt])
2041 baseparts = [alt]
2042 canonicals.append((tags, baseparts))
2043 for tags, baseparts in canonicals:
2044 add_related(
2045 wxr,
2046 data,
2047 tags,
2048 baseparts,
2049 text,
2050 len(canonicals) > 1,
2051 is_reconstruction,
2052 head_group,
2053 ruby,
2054 )
2056 # Handle parenthesized descriptors for the word form and links to
2057 # related words
2058 text = quote_kept_parens(text)
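# Collect top-level parenthesized groups: the first pattern matches groups
# preceded by whitespace or the start of the text, the second matches groups
# attached directly to a preceding non-space character (and followed by
# whitespace or the end of the text).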
2059 parens = list(
2060 m.group(2)
2061 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text)
2062 )
2063 parens.extend(
2064 m.group(1)
2065 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text)
2066 )
2067 have_romanization = False
2068 have_ruby = False
2069 hiragana = ""
2070 katakana = ""
2071 for paren in parens:
2072 paren = paren.strip()
2073 if not paren: 2073 ↛ 2074line 2073 didn't jump to line 2074 because the condition on line 2073 was never true
2074 continue
2075 if paren.startswith("see "):
2076 continue
2077 if paren.startswith("U+"): 2077 ↛ 2078line 2077 didn't jump to line 2078 because the condition on line 2077 was never true
2078 continue
2079 # In some rare cases, strip a word that inflects from the form
2080 # description, e.g. "look through rose-tinted glasses"/English.
2081 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren)
2083 # If it starts with hiragana or katakana, treat as such form. Note
2084 # that each hiragana/katakana character is in separate parentheses,
2085 # so we must concatenate them.
2086 try:
2087 un = unicodedata.name(paren[0]).split()[0]
2088 except ValueError:
2089 un = "INVALID"
2090 if un == "KATAKANA": 2090 ↛ 2091line 2090 didn't jump to line 2091 because the condition on line 2090 was never true
2091 katakana += paren
2092 have_ruby = True
2093 continue
2094 if un == "HIRAGANA": 2094 ↛ 2095line 2094 didn't jump to line 2095 because the condition on line 2094 was never true
2095 hiragana += paren
2096 have_ruby = True
2097 continue
2099 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes,"
2100 # in the middle of the parenthesized expression, e.g. 薄
2101 def strokes_repl(m: re.Match) -> str:
2102 strokes1, tags1, strokes2, tags2 = m.groups()
2103 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]:
2104 tags = tags.split(", ")
2105 tags = list(
2106 "Mainland China" if t == "Mainland" else t for t in tags
2107 )
2108 tags.append("strokes")
2109 add_related(
2110 wxr,
2111 data,
2112 tags,
2113 [strokes],
2114 text,
2115 True,
2116 is_reconstruction,
2117 head_group,
2118 ruby,
2119 )
2120 return ", "
2122 paren = re.sub(
2123 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ",
2124 strokes_repl,
2125 paren,
2126 )
2128 descriptors = map_with(xlat_descs_map, [paren])
2129 new_desc = []
2130 for desc in descriptors:
2131 new_desc.extend(
2132 map_with(
2133 xlat_tags_map,
2134 split_at_comma_semi(desc, extra=[", or "], skipped=links),
2135 )
2136 )
2137 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None
2138 following_tags = None # Added to prev_tags from previous parenthesized
2139 # part, e.g. walrus/English
2140 # "(both nonstandard, proscribed, uncommon)"
2141 for desc_i, desc in enumerate(new_desc):
2142 # print("HEAD DESC: {!r}".format(desc))
2144 # Abort on certain descriptors (assume remaining values are
2145 # examples or uninteresting, cf. gaan/Navajo, horior/Latin)
2146 if re.match(r"^(per |e\.g\.$)", desc): 2146 ↛ 2147line 2146 didn't jump to line 2147 because the condition on line 2146 was never true
2147 break
2149 # If it all consists of CJK characters, add it with the
2150 # CJK tag. This is used at least for some Vietnamese
2151 # words (e.g., ba/Vietnamese)
2152 try:
2153 if all(unicodedata.name(x).startswith("CJK ") for x in desc): 2153 ↛ 2154line 2153 didn't jump to line 2154 because the condition on line 2153 was never true
2154 add_related(
2155 wxr,
2156 data,
2157 ["CJK"],
2158 [desc],
2159 text,
2160 True,
2161 is_reconstruction,
2162 head_group,
2163 ruby,
2164 )
2165 continue
2166 except ValueError:
2167 pass
2169 # Handle some special cases
2170 splitdesc = desc.split()
2171 if ( 2171 ↛ 2180line 2171 didn't jump to line 2180 because the condition on line 2171 was never true
2172 len(splitdesc) >= 3
2173 and splitdesc[1] == "superlative"
2174 and classify_desc(splitdesc[0]) != "tags"
2175 and prev_tags
2176 ):
2177 # Handle the special case of second comparative after comma,
2178 # followed by superlative without comma. E.g.
2179 # mal/Portuguese/Adv
2180 for ts in prev_tags:
2181 add_related(
2182 wxr,
2183 data,
2184 ts,
2185 [splitdesc[0]],
2186 text,
2187 True,
2188 is_reconstruction,
2189 head_group,
2190 ruby,
2191 )
2192 desc = " ".join(splitdesc[1:])
2193 elif ( 2193 ↛ 2201line 2193 didn't jump to line 2201 because the condition on line 2193 was never true
2194 len(splitdesc) == 2
2195 and splitdesc[0] in ("also", "and")
2196 and prev_tags
2197 and classify_desc(splitdesc[1]) != "tags"
2198 ):
2199 # Sometimes alternative forms are prefixed with "also" or
2200 # "and"
2201 for ts in prev_tags:
2202 add_related(
2203 wxr,
2204 data,
2205 ts,
2206 [splitdesc[1]],
2207 text,
2208 True,
2209 is_reconstruction,
2210 head_group,
2211 ruby,
2212 )
2213 continue
2214 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2214 ↛ 2215line 2214 didn't jump to line 2215 because the condition on line 2214 was never true
2215 continue
2217 # If only one word, assume it is comma-separated alternative
2218 # to the previous one
2219 if " " not in desc:
2220 cls = classify_desc(desc)
2221 if cls != "tags":
2222 if prev_tags: 2222 ↛ 2224line 2222 didn't jump to line 2224 because the condition on line 2222 was never true
2223 # Assume comma-separated alternative to previous one
2224 for ts in prev_tags:
2225 add_related(
2226 wxr,
2227 data,
2228 ts,
2229 [desc],
2230 text,
2231 True,
2232 is_reconstruction,
2233 head_group,
2234 ruby,
2235 )
2236 continue
2237 elif distw(titleparts, desc) <= 0.5: 2237 ↛ 2240line 2237 didn't jump to line 2240 because the condition on line 2237 was never true
2238 # Similar to head word, assume a dialectal variation to
2239 # the base form. Cf. go/Alemannic German/Verb
2240 add_related(
2241 wxr,
2242 data,
2243 ["alternative"],
2244 [desc],
2245 text,
2246 True,
2247 is_reconstruction,
2248 head_group,
2249 ruby,
2250 )
2251 continue
2252 elif (
2253 cls in ("romanization", "english")
2254 and not have_romanization
2255 and classify_desc(titleword) == "other"
2256 and not (
2257 "categories" in data and desc in data["categories"]
2258 )
2259 ):
2260 # Assume it to be a romanization
2261 add_romanization(
2262 wxr,
2263 data,
2264 desc,
2265 text,
2266 is_reconstruction,
2267 head_group,
2268 ruby,
2269 )
2270 have_romanization = True
2271 continue
2273 m = re.match(r"^(\d+) strokes?$", desc)
2274 if m:
2275 # Special case, used to give #strokes for Han characters
2276 add_related(
2277 wxr,
2278 data,
2279 ["strokes"],
2280 [m.group(1)],
2281 text,
2282 True,
2283 is_reconstruction,
2284 head_group,
2285 ruby,
2286 )
2287 continue
2289 # See if it is radical+strokes
2290 m = re.match(
2291 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF"
2292 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)"
2293 r"( in (Japanese|Chinese|traditional Chinese|"
2294 r"simplified Chinese))?$",
2295 desc,
2296 )
2297 if m: 2297 ↛ 2300line 2297 didn't jump to line 2300 because the condition on line 2297 was never true
2298 # Special case, used to give radical + strokes for Han
2299 # characters
2300 radical_strokes = m.group(1)
2301 lang = m.group(3)
2302 t = ["radical+strokes"]
2303 if lang:
2304 t.extend(lang.split())
2305 add_related(
2306 wxr,
2307 data,
2308 t,
2309 [radical_strokes],
2310 text,
2311 True,
2312 is_reconstruction,
2313 head_group,
2314 ruby,
2315 )
2316 prev_tags = None
2317 following_tags = None
2318 continue
2320 # See if it indicates historical katakana orthography (←) or
2321 # otherwise just a katakana/hiragana form
2322 m = re.match(r"←\s*|kana\s+", desc)
2323 if m: 2323 ↛ 2324line 2323 didn't jump to line 2324 because the condition on line 2323 was never true
2324 if desc.startswith("←"):
2325 t1 = "historical "
2326 else:
2327 t1 = ""
2328 x = desc[m.end() :]
2329 if x.endswith("?"):
2330 x = x[:-1]
2331 # XXX should we add a tag indicating uncertainty?
2332 if x:
2333 name = unicodedata.name(x[0])
2334 if name.startswith("HIRAGANA "):
2335 desc = t1 + "hiragana " + x
2336 elif name.startswith("KATAKANA "):
2337 desc = t1 + "katakana " + x
2339 # See if it is "n strokes in Chinese" or similar
2340 m = re.match(
2341 r"(\d+) strokes in (Chinese|Japanese|"
2342 r"traditional Chinese|simplified Chinese)$",
2343 desc,
2344 )
2345 if m: 2345 ↛ 2347line 2345 didn't jump to line 2347 because the condition on line 2345 was never true
2346 # Special case, used to give just strokes for some Han chars
2347 strokes = m.group(1)
2348 lang = m.group(2)
2349 t = ["strokes"]
2350 t.extend(lang.split())
2351 add_related(
2352 wxr,
2353 data,
2354 t,
2355 [strokes],
2356 text,
2357 True,
2358 is_reconstruction,
2359 head_group,
2360 ruby,
2361 )
2362 prev_tags = None
2363 following_tags = None
2364 continue
2366 # American Sign Language has images (or requests for an image)
2367 # as heads, plus this ASL gloss after them.
2368 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text)
2369 if m2: 2369 ↛ 2370line 2369 didn't jump to line 2370 because the condition on line 2369 was never true
2370 add_related(
2371 wxr,
2372 data,
2373 ["ASL-gloss"],
2374 [m2.group(1)],
2375 text,
2376 True,
2377 is_reconstruction,
2378 head_group,
2379 ruby,
2380 )
2381 continue
2383 parts = list(m.group(0) for m in re.finditer(word_re, desc))
2384 if not parts: 2384 ↛ 2385line 2384 didn't jump to line 2385 because the condition on line 2384 was never true
2385 prev_tags = None
2386 following_tags = None
2387 continue
2389 # Check for certain language-specific header part starts that
2390 # modify the tags applied to the following form
2391 if len(parts) == 2 and language in lang_specific_head_map: 2391 ↛ 2392line 2391 didn't jump to line 2392 because the condition on line 2391 was never true
2392 ht = lang_specific_head_map[language]
2393 if parts[0] in ht:
2394 rem_tags, add_tags = ht[parts[0]]
2395 new_prev_tags1: list[list[str]] = []
2396 tags2: Union[tuple[str, ...], list[str]]
2397 for tags2 in prev_tags or [()]:
2398 if rem_tags is True: # Remove all old tags
2399 tsets = set()
2400 else:
2401 tsets = set(tags2) - set(rem_tags.split())
2402 tsets = tsets | set(add_tags.split())
2403 tags = list(sorted(tsets))
2404 add_related(
2405 wxr,
2406 data,
2407 tags,
2408 [parts[1]],
2409 text,
2410 True,
2411 is_reconstruction,
2412 head_group,
2413 ruby,
2414 )
2415 new_prev_tags1.append(tags)
2416 prev_tags = new_prev_tags1
2417 following_tags = None
2418 continue
2420 # Handle the special case of descriptors that are parenthesized,
2421 # e.g., (archaic or Scotland)
2422 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc)
2423 if m is not None and classify_desc(m.group(1)) == "tags": 2423 ↛ 2424line 2423 didn't jump to line 2424 because the condition on line 2423 was never true
2424 tagpart = m.group(1)
2425 related = [m.group(2)]
2426 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True)
2427 if topics:
2428 wxr.wtp.debug(
2429 "parenthized head part {!r} contains topics: {}".format(
2430 tagpart, topics
2431 ),
2432 sortid="form_descriptions/1647",
2433 )
2434 elif m is not None and re.match(r"in the sense ", m.group(1)): 2434 ↛ 2437line 2434 didn't jump to line 2437 because the condition on line 2434 was never true
2435 # Handle certain ignored cases
2436 # e.g. bord/Danish: in the sense "plank"
2437 related = [m.group(2)]
2438 tagsets = [()]
2439 else:
2440 # Normal parsing of the descriptor
2441 alt_related = None
2442 alt_tagsets = None
2443 tagsets = None
2444 for i in range(len(parts), 0, -1):
2445 related = parts[i:]
2446 tagparts = parts[:i]
2447 # print(" i={} related={} tagparts={}"
2448 # .format(i, related, tagparts))
2449 tagsets, topics = decode_tags(
2450 " ".join(tagparts), no_unknown_starts=True
2451 )
2452 # print("tagparts={!r} tagsets={} topics={} related={} "
2453 # "alt_related={} distw={:.2f}"
2454 # .format(tagparts, tagsets, topics, related,
2455 # alt_related,
2456 # distw(titleparts, parts[i - 1])))
2457 if (
2458 topics
2459 or not tagsets
2460 or any("error-unknown-tag" in x for x in tagsets)
2461 ):
2462 if alt_related is not None: 2462 ↛ 2464line 2462 didn't jump to line 2464 because the condition on line 2462 was never true
2463 # We already had a good division, so let's stop.
2464 break
2465 # Bad division, try deeper
2466 continue
2467 # print(f"{parts[i-1]=}, {parts=}")
2468 if (
2469 i > 1
2470 and len(parts[i - 1]) >= 4
2471 and (
2472 distw(titleparts, parts[i - 1]) <= 0.4
2473 or (
2474 wxr.wtp.section == "English"
2475 and wxr.wtp.title
2476 in WORDS_WITH_FALSE_POSITIVE_TAGS
2477 and parts[i - 1]
2478 in WORDS_WITH_FALSE_POSITIVE_TAGS[wxr.wtp.title]
2479 )
2480 )
2481 # Fixes 'unaccountability', wiktextract #1196
2482 and not (
2483 wxr.wtp.section == "English"
2484 and wxr.wtp.title in WORDS_WITH_FALSE_POSITIVE_FORMS
2485 and parts[i - 1]
2486 in WORDS_WITH_FALSE_POSITIVE_FORMS[wxr.wtp.title]
2487 )
2488 # Fixes wiktextract #983, where "participle"
2489 # was too close to "Martinize" and so this accepted
2490 # ["participle", "Martinize"] as matching; this
2491 # kludge prevents this from happening if titleparts
2492 # is shorter than what would be 'related'.
2493 # This breaks if we want to detect stuff that
2494 # actually gets an extra space-separated word when
2495 # 'inflected'.
2496 and (
2497 len(titleparts) >= len(parts[i - 1 :])
2498 or "or" in parts[i - 1 :]
2499 )
2500 ):
2501 # print(f"Reached; {parts=}, {parts[i-1]=}")
2502 alt_related = related
2503 alt_tagsets = tagsets
2504 continue
2505 alt_related = None
2506 alt_tagsets = None
2507 break
2508 else:
2509 if alt_related is None: 2509 ↛ 2541line 2509 didn't jump to line 2541 because the condition on line 2509 was always true
2510 # Check if the parenthesized part is likely a
2511 # romanization
2512 if ( 2512 ↛ 2520line 2512 didn't jump to line 2520 because the condition on line 2512 was never true
2513 (have_ruby or classify_desc(base) == "other")
2514 and classify_desc(paren) == "romanization"
2515 and not (
2516 "categories" in data
2517 and desc in data["categories"]
2518 )
2519 ):
2520 for r in split_at_comma_semi(
2521 paren, extra=[" or "], skipped=links
2522 ):
2523 add_romanization(
2524 wxr,
2525 data,
2526 r,
2527 text,
2528 is_reconstruction,
2529 head_group,
2530 ruby,
2531 )
2532 have_romanization = True
2533 continue
2534 tagsets = [("error-unrecognized-head-form",)]
2535 wxr.wtp.debug(
2536 "unrecognized head form: {}".format(desc),
2537 sortid="form_descriptions/1698",
2538 )
2539 continue
2541 if alt_related is not None: 2541 ↛ 2542line 2541 didn't jump to line 2542 because the condition on line 2541 was never true
2542 related = alt_related
2543 tagsets = alt_tagsets
2545 # print("FORM END: tagsets={} related={}".format(tagsets, related))
2546 # print("==================")
2548 if ( 2548 ↛ 2569line 2548 didn't jump to line 2569 because the condition on line 2548 was never true
2549 len(related) <= 0
2550 and wxr.wtp.section == "English"
2551 and tagsets is not None
2552 and len(tagsets) > 0
2553 and not any(
2554 s.startswith("error-") for tagset in tagsets for s in tagset
2555 )
2556 and any(
2557 s in FORM_ASSOCIATED_TAG_WORDS
2558 for tagset in tagsets
2559 for s in tagset
2560 )
2561 and (
2562 wxr.wtp.title not in FALSE_POSITIVE_MISSING_FORMS
2563 and not any(
2564 rel in FALSE_POSITIVE_MISSING_FORMS[wxr.wtp.title or ""]
2565 for rel in related
2566 )
2567 )
2568 ):
2569 wxr.wtp.debug(
2570 f"Form tags without form: {desc=}, {tagsets=}",
2571 sortid="form_description/20250107",
2572 )
2573 if not tagsets: 2573 ↛ 2574line 2573 didn't jump to line 2574 because the condition on line 2573 was never true
2574 continue
2576 # print(f"{alts=}, {related=}")
2578 assert isinstance(related, (list, tuple))
2579 related_str = " ".join(related)
2580 if "or" in titleparts:
2581 alts = [related_str]
2582 else:
2583 alts = split_at_comma_semi(
2584 related_str, separators=[r"\bor\b"], skipped=links
2585 )
2586 # print(f"{related_str=}, {alts=}")
2587 if not alts:
2588 alts = [""]
2589 for related_str in alts:
2590 if related_str:
2591 if prev_tags and (
2592 all(
2593 all(
2594 t in ["nonstandard", "dialectal"]
2595 or valid_tags[t] == "dialect"
2596 for t in tags
2597 )
2598 for ts in tagsets
2599 )
2600 or (
2601 any("participle" in ts for ts in prev_tags)
2602 and all(
2603 "attributive" in ts
2604 or any(valid_tags[t] == "gender" for t in ts)
2605 for ts in tagsets
2606 )
2607 )
2608 ):
2609 # Merged with previous tags. Don't update previous
2610 # tags here; cf. burn/English/Verb
2611 for tags_l in tagsets:
2612 for ts in prev_tags:
2613 tags_l1 = sorted(set(tags_l) | set(ts))
2614 add_related(
2615 wxr,
2616 data,
2617 tags_l1,
2618 [related_str],
2619 text,
2620 True,
2621 is_reconstruction,
2622 head_group,
2623 ruby,
2624 )
2625 else:
2626 # Not merged with previous tags
2627 for tags_l in tagsets:
2628 if following_tags is not None: 2628 ↛ 2629line 2628 didn't jump to line 2629 because the condition on line 2628 was never true
2629 for ts in following_tags:
2630 tags_l1 = list(
2631 sorted(set(tags_l) | set(ts))
2632 )
2633 add_related(
2634 wxr,
2635 data,
2636 tags_l1,
2637 [related_str],
2638 text,
2639 True,
2640 is_reconstruction,
2641 head_group,
2642 ruby,
2643 )
2644 else:
2645 ret = add_related(
2646 wxr,
2647 data,
2648 tags_l,
2649 [related_str],
2650 text,
2651 True,
2652 is_reconstruction,
2653 head_group,
2654 ruby,
2655 )
2656 if ret is not None: 2656 ↛ 2657line 2656 didn't jump to line 2657 because the condition on line 2656 was never true
2657 following_tags = ret
2658 prev_tags = tagsets
2659 else:
2660 if desc_i < len(new_desc) - 1 and all( 2660 ↛ 2667line 2660 didn't jump to line 2667 because the condition on line 2660 was never true
2661 "participle" in ts or "infinitive" in ts
2662 for ts in tagsets
2663 ):
2664 # Interpret it as a standalone form description
2665 # in the middle, probably followed by forms or
2666 # language-specific descriptors. cf. drikke/Danish
2667 new_prev_tags2 = []
2668 for ts1 in prev_tags or [()]:
2669 for ts2 in tagsets:
2670 ts = tuple(sorted(set(ts1) | set(ts2)))
2671 new_prev_tags2.append(ts)
2672 prev_tags = new_prev_tags2
2673 continue
2674 for tags in tagsets:
2675 data_extend(data, "tags", tags)
2676 prev_tags = tagsets
2677 following_tags = None
2679 # Finally, if we collected hiragana/katakana, add them now
2680 if hiragana: 2680 ↛ 2681line 2680 didn't jump to line 2681 because the condition on line 2680 was never true
2681 add_related(
2682 wxr,
2683 data,
2684 ["hiragana"],
2685 [hiragana],
2686 text,
2687 True,
2688 is_reconstruction,
2689 head_group,
2690 ruby,
2691 )
2692 if katakana: 2692 ↛ 2693line 2692 didn't jump to line 2693 because the condition on line 2692 was never true
2693 add_related(
2694 wxr,
2695 data,
2696 ["katakana"],
2697 [katakana],
2698 text,
2699 True,
2700 is_reconstruction,
2701 head_group,
2702 ruby,
2703 )
2705 # XXX check if this is actually relevant; tags in word root data
2706 # are extremely rare (not sure where they slip through).
2707 tags = data.get("tags", []) # type:ignore
2708 if len(tags) > 0:
2709 # wxr.wtp.debug(
2710 # f"Tags appear in word root data: {data['tags']=}", # type:ignore
2711 # sortid="form_descriptions/2620/20240606",
2712 # ) # Messes up tests.
2713 data["tags"] = sorted(set(tags)) # type:ignore
2716def parse_sense_qualifier(
2717 wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData]
2718) -> None:
2719 """Parses tags or topics for a sense or some other data. The values are
2720 added into the dictionary ``data``."""
2721 assert isinstance(wxr, WiktextractContext)
2722 assert isinstance(text, str)
2723 assert isinstance(data, dict)
2724 # print("parse_sense_qualifier:", text)
2725 if re.match(r"\([^()]+\)$", text): 2725 ↛ 2726line 2725 didn't jump to line 2726 because the condition on line 2725 was never true
2726 text = text[1:-1]
2727 if re.match(r'"[^"]+"$', text): 2727 ↛ 2728line 2727 didn't jump to line 2728 because the condition on line 2727 was never true
2728 text = text[1:-1]
2729 lst = map_with(xlat_descs_map, [text])
2730 sense_tags: list[str] = []
2731 for text in lst:
2732 for semi in split_at_comma_semi(text):
2733 if not semi: 2733 ↛ 2734line 2733 didn't jump to line 2734 because the condition on line 2733 was never true
2734 continue
2735 orig_semi = semi
2736 idx = semi.find(":")
2737 if idx >= 0: 2737 ↛ 2738line 2737 didn't jump to line 2738 because the condition on line 2737 was never true
2738 semi = semi[:idx]
2739 cls = classify_desc(semi, allow_unknown_tags=True)
2740 # print("parse_sense_qualifier: classify_desc: {} -> {}"
2741 # .format(semi, cls))
2742 if cls == "tags":
2743 tagsets, topics = decode_tags(semi)
2744 data_extend(data, "topics", topics)
2745 # XXX should think about how to handle distinct options better,
2746 # e.g., "singular and plural genitive"; that can't really be
2747 # done without changing the calling convention of this function.
2748 # Should split sense if more than one category of tags differs.
2749 for tags in tagsets:
2750 sense_tags.extend(tags)
2751 elif cls == "taxonomic": 2751 ↛ 2752line 2751 didn't jump to line 2752 because the condition on line 2751 was never true
2752 if re.match(r"×[A-Z]", semi):
2753 sense_tags.append("extinct")
2754 semi = semi[1:]
2755 data["taxonomic"] = semi
2756 elif cls == "english":
2757 if "qualifier" in data and data["qualifier"] != orig_semi: 2757 ↛ 2758line 2757 didn't jump to line 2758 because the condition on line 2757 was never true
2758 data["qualifier"] += "; " + orig_semi
2759 else:
2760 data["qualifier"] = orig_semi
2761 else:
2762 wxr.wtp.debug(
2763 "unrecognized sense qualifier: {}".format(text),
2764 sortid="form_descriptions/1831",
2765 )
2766 sense_tags = sorted(set(sense_tags))
2767 data_extend(data, "tags", sense_tags)
2770def parse_pronunciation_tags(
2771 wxr: WiktextractContext, text: str, data: SoundData
2772) -> None:
2773 assert isinstance(wxr, WiktextractContext)
2774 assert isinstance(text, str)
2775 assert isinstance(data, dict)
2776 text = text.strip()
2777 if not text: 2777 ↛ 2778line 2777 didn't jump to line 2778 because the condition on line 2777 was never true
2778 return
2779 cls = classify_desc(text)
2780 notes = []
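    # If the text decodes as tags, add single-word tags directly and collect
    # multi-word "tags" as free-form notes; otherwise the whole text becomes
    # a note.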
2781 if cls == "tags":
2782 tagsets, topics = decode_tags(text)
2783 data_extend(data, "topics", topics)
2784 for tagset in tagsets:
2785 for t in tagset:
2786 if " " in t: 2786 ↛ 2787line 2786 didn't jump to line 2787 because the condition on line 2786 was never true
2787 notes.append(t)
2788 else:
2789 data_append(data, "tags", t)
2790 else:
2791 notes.append(text)
2792 if notes:
2793 data["note"] = "; ".join(notes)
2796def parse_translation_desc(
2797 wxr: WiktextractContext, lang: str, text: str, tr: TranslationData
2798) -> None:
2799 assert isinstance(wxr, WiktextractContext)
2800 assert isinstance(lang, str) # The language of ``text``
2801 assert isinstance(text, str)
2802 assert isinstance(tr, dict)
2803 # print("parse_translation_desc:", text)
2805 # Process all parenthesized parts from the translation item
2806 note = None
2807 restore_beginning = ""
2808 restore_end = ""
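    # Repeatedly pull out one parenthesized expression per iteration: first
    # try the end of the text, then the beginning, then the middle, and stop
    # when none remain. Each extracted part is classified below and turned
    # into tags, a romanization, a note, or an "alt"/"taxonomic" value.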
2809 while True:
2810 beginning = False
2811 # See if we can find a parenthesized expression at the end
2812 m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text)
2813 if m:
2814 par = m.group(1)
2815 text = text[: m.start()]
2816 if par.startswith(("literally ", "lit.")):
2817 continue # Not useful for disambiguation in many idioms
2818 else:
2819 # See if we can find a parenthesized expression at the start
2820 m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text)
2821 if m:
2822 par = m.group(1)
2823 text = text[m.end() :]
2824 beginning = True
2825 if re.match(r"^(\d|\s|,| or | and )+$", par): 2825 ↛ 2830line 2825 didn't jump to line 2830 because the condition on line 2825 was never true
2826 # Looks like this beginning parenthesized expression only
2827 # contains digits or their combinations. We assume such
2828 # to be sense descriptions if no sense has been selected,
2829 # or otherwise just ignore them.
2830 if not tr.get("sense"):
2831 tr["sense"] = par
2832 continue
2833 else:
2834 # See if we can find a parenthesized expression in the middle.
2835 # Romanizations are sometimes between word and gender marker,
2836 # e.g. wife/English/Tr/Yiddish.
2837 m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text)
2838 if m:
2839 par = m.group(1)
2840 text = text[: m.start()] + text[m.end() :]
2841 else:
2842 # No more parenthesized expressions - break out of the loop
2843 break
2845 # Some cleanup of artifacts that may result from skipping some templates
2846 # in earlier stages
2847 if par.startswith(": "): 2847 ↛ 2848line 2847 didn't jump to line 2848 because the condition on line 2847 was never true
2848 par = par[2:]
2849 if par.endswith(","): 2849 ↛ 2850line 2849 didn't jump to line 2850 because the condition on line 2849 was never true
2850 par = par[:-1]
2851 if re.match(r'^[“"]([^“”"]*)[“”"]$', par): 2851 ↛ 2852line 2851 didn't jump to line 2852 because the condition on line 2851 was never true
2852 par = par[1:-1]
2853 par = par.strip()
2855 # Check for special script pronunciation followed by romanization,
2856 # used in many Asian languages.
2857 lst = par.split(", ")
2858 if len(lst) == 2:
2859 a, r = lst
2860 if classify_desc(a) == "other":
2861 cls = classify_desc(r)
2862 # print("parse_translation_desc: r={} cls={}".format(r, cls))
2863 if cls == "romanization" or (
2864 cls == "english" and len(r.split()) == 1 and r[0].islower()
2865 ):
2866 if tr.get("alt") and tr.get("alt") != a: 2866 ↛ 2867line 2866 didn't jump to line 2867 because the condition on line 2866 was never true
2867 wxr.wtp.debug(
2868 'more than one value in "alt": {} vs. {}'.format(
2869 tr["alt"], a
2870 ),
2871 sortid="form_descriptions/1930",
2872 )
2873 tr["alt"] = a
2874 if tr.get("roman") and tr.get("roman") != r: 2874 ↛ 2875line 2874 didn't jump to line 2875 because the condition on line 2874 was never true
2875 wxr.wtp.debug(
2876 'more than one value in "roman": '
2877 "{} vs. {}".format(tr["roman"], r),
2878 sortid="form_descriptions/1936",
2879 )
2880 tr["roman"] = r
2881 continue
2883 # Check for certain comma-separated tags combined with English text
2884 # at the beginning or end of a comma-separated parenthesized list
2885 while len(lst) > 1:
2886 cls = classify_desc(lst[0])
2887 if cls == "tags": 2887 ↛ 2888line 2887 didn't jump to line 2888 because the condition on line 2887 was never true
2888 tagsets, topics = decode_tags(lst[0])
2889 for t in tagsets:
2890 data_extend(tr, "tags", t)
2891 data_extend(tr, "topics", topics)
2892 lst = lst[1:]
2893 continue
2894 cls = classify_desc(lst[-1])
2895 if cls == "tags":
2896 tagsets, topics = decode_tags(lst[-1])
2897 for t in tagsets:
2898 data_extend(tr, "tags", t)
2899 data_extend(tr, "topics", topics)
2900 lst = lst[:-1]
2901 continue
2902 break
2903 par = ", ".join(lst)
2905 if not par: 2905 ↛ 2906line 2905 didn't jump to line 2906 because the condition on line 2905 was never true
2906 continue
2907 if re.search(tr_ignored_parens_re, par): 2907 ↛ 2908line 2907 didn't jump to line 2908 because the condition on line 2907 was never true
2908 continue
2909 if par.startswith("numeral:"):
2910 par = par[8:].strip()
2912 # Classify the part in parentheses and process accordingly
2913 cls = classify_desc(par)
2914 # print("parse_translation_desc classify: {!r} -> {}"
2915 # .format(par, cls))
2916 if par == text:
2917 pass
2918 if par == "f": 2918 ↛ 2919line 2918 didn't jump to line 2919 because the condition on line 2918 was never true
2919 data_append(tr, "tags", "feminine")
2920 elif par == "m": 2920 ↛ 2921line 2920 didn't jump to line 2921 because the condition on line 2920 was never true
2921 data_append(tr, "tags", "masculine")
2922 elif cls == "tags":
2923 tagsets, topics = decode_tags(par)
2924 for tags in tagsets:
2925 data_extend(tr, "tags", tags)
2926 data_extend(tr, "topics", topics)
2927 elif cls == "english":
2928 # If the text contains any of certain grammatical words, treat it
2929 # as a "note" instead of "english"
2930 if re.search(tr_note_re, par):
2931 if par.endswith(":"): 2931 ↛ 2932line 2931 didn't jump to line 2932 because the condition on line 2931 was never true
2932 par = par[:-1]
2933 if par not in ("see entry for forms",): 2933 ↛ 2809line 2933 didn't jump to line 2809 because the condition on line 2933 was always true
2934 if note: 2934 ↛ 2935line 2934 didn't jump to line 2935 because the condition on line 2934 was never true
2935 note = note + ";" + par
2936 else:
2937 note = par
2938 else:
2939 # There can be more than one parenthesized english item, see
2940 # e.g. Aunt/English/Translations/Tamil
2941 if "translation" in tr and "english" in tr:
2942 tr["english"] += "; " + par # DEPRECATED for "translation"
2943 tr["translation"] += "; " + par
2944 else:
2945 tr["english"] = par # DEPRECATED for "translation"
2946 tr["translation"] = par
2947 elif cls == "romanization":
2948 # print("roman text={!r} text cls={}"
2949 # .format(text, classify_desc(text)))
2950 if classify_desc(text) in (
2951 "english",
2952 "romanization",
2953 ) and lang not in ("Egyptian",):
2954 if beginning:
2955 restore_beginning += "({}) ".format(par)
2956 else:
2957 restore_end = " ({})".format(par) + restore_end
2958 else:
2959 if tr.get("roman"): 2959 ↛ 2960line 2959 didn't jump to line 2960 because the condition on line 2959 was never true
2960 wxr.wtp.debug(
2961 'more than one value in "roman": {} vs. {}'.format(
2962 tr["roman"], par
2963 ),
2964 sortid="form_descriptions/2013",
2965 )
2966 tr["roman"] = par
2967 elif cls == "taxonomic": 2967 ↛ 2968line 2967 didn't jump to line 2968 because the condition on line 2967 was never true
2968 if tr.get("taxonomic"):
2969 wxr.wtp.debug(
2970 'more than one value in "taxonomic": {} vs. {}'.format(
2971 tr["taxonomic"], par
2972 ),
2973 sortid="form_descriptions/2019",
2974 )
2975 if re.match(r"×[A-Z]", par):
2976 data_append(tr, "tags", "extinct")
2977 par = par[1:]
2978 tr["taxonomic"] = par
2979 elif cls == "other": 2979 ↛ 2989line 2979 didn't jump to line 2989 because the condition on line 2979 was always true
2980 if tr.get("alt"): 2980 ↛ 2981line 2980 didn't jump to line 2981 because the condition on line 2980 was never true
2981 wxr.wtp.debug(
2982 'more than one value in "alt": {} vs. {}'.format(
2983 tr["alt"], par
2984 ),
2985 sortid="form_descriptions/2028",
2986 )
2987 tr["alt"] = par
2988 else:
2989 wxr.wtp.debug(
2990 "parse_translation_desc unimplemented cls {}: {}".format(
2991 cls, par
2992 ),
2993 sortid="form_descriptions/2033",
2994 )
2996 # Check for gender indications in suffix
2997 text, final_tags = parse_head_final_tags(wxr, lang, text)
2998 data_extend(tr, "tags", final_tags)
3000 # Restore those parts that we did not want to remove (they are often
3001 # optional words or words that are always used with the given translation)
3002 text = restore_beginning + text + restore_end
3004 if note:
3005 tr["note"] = note.strip()
3006 if text and text not in ignored_translations:
3007 tr["word"] = text.strip()
3009 # Sometimes gender seems to be at the end of "roman" field, see e.g.
3010 # fire/English/Noun/Translations/Egyptian (for "oxidation reaction")
3011 roman = tr.get("roman")
3012 if roman:
3013 if roman.endswith(" f"): 3013 ↛ 3014line 3013 didn't jump to line 3014 because the condition on line 3013 was never true
3014 data_append(tr, "tags", "feminine")
3015 tr["roman"] = roman[:-2].strip()
3016 elif roman.endswith(" m"): 3016 ↛ 3017line 3016 didn't jump to line 3017 because the condition on line 3016 was never true
3017 data_append(tr, "tags", "masculine")
3018 tr["roman"] = roman[:-2].strip()
3020 # If the word now has "translation" field but no "roman" field, and
3021 # the word would be classified "other" (generally non-latin
3022 # characters), and the value in "translation" is only one lowercase
3023 # word, move it to "roman". This happens semi-frequently when the
3024 # translation is transliterated the same as some English word.
3025 roman = tr.get("roman")
3026 english = tr.get("translation")
3027 if english and not roman and "word" in tr:
3028 cls = classify_desc(tr["word"])
3029 if cls == "other" and " " not in english and english[0].islower():
3030 del tr["translation"]
3031 if "english" in tr: # DEPRECATED for "translation" 3031 ↛ 3033line 3031 didn't jump to line 3033 because the condition on line 3031 was always true
3032 del tr["english"]
3033 tr["roman"] = english
3035 # If the entry now has both tr["roman"] and tr["word"] and they have
3036 # the same value, delete tr["roman"] (e.g., man/English/Translations
3037 # Evenki)
3038 if tr.get("word") and tr.get("roman") == tr.get("word"): 3038 ↛ 3039line 3038 didn't jump to line 3039 because the condition on line 3038 was never true
3039 del tr["roman"]
3042def parse_alt_or_inflection_of(
3043 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
3044) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
3045 """Tries to parse an inflection-of or alt-of description. If successful,
3046 this returns (tags, alt-of/inflection-of-dict). If the description cannot
3047 be parsed, this returns None. This may also return (tags, None) when the
3048 gloss describes a form (or some other tags were extracted from it), but
3049 there was no alt-of/form-of/synonym-of word."""
3050 # print("parse_alt_or_inflection_of: {!r}".format(gloss))
3051 # Occasionally inflection_of/alt_of have "A(n) " etc. at the beginning.
3053 # Never interpret a gloss that is equal to the word itself as a tag
3054 # (e.g., instrumental/Romanian, instrumental/Spanish).
3055 if gloss.lower() == wxr.wtp.title.lower() or ( # type:ignore[union-attr]
3056 len(gloss) >= 5 and distw([gloss.lower()], wxr.wtp.title.lower()) < 0.2 # type:ignore[union-attr]
3057 ):
3058 return None
3060 # First try parsing it as-is
3061 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
3062 if parsed is not None:
3063 return parsed
3065 # Next try parsing it with the first character converted to lowercase if
3066 # it was previously uppercase.
3067 if gloss and gloss[0].isupper():
3068 gloss = gloss[0].lower() + gloss[1:]
3069 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
3070 if parsed is not None:
3071 return parsed
3073 return None
3076# These tags are not allowed in alt-or-inflection-of parsing
3077alt_infl_disallowed: set[str] = set(
3078 [
3079 "error-unknown-tag",
3080 "place", # Not in inflected forms and causes problems e.g. house/English
3081 ]
3082)
3085def parse_alt_or_inflection_of1(
3086 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
3087) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
3088 """Helper function for parse_alt_or_inflection_of. This handles a single
3089 capitalization."""
3090 if not gloss or not gloss.strip(): 3090 ↛ 3091line 3090 didn't jump to line 3091 because the condition on line 3090 was never true
3091 return None
3093 # Prevent some common errors where we would parse something we shouldn't
3094 if re.search(r"(?i)form of address ", gloss): 3094 ↛ 3095line 3094 didn't jump to line 3095 because the condition on line 3094 was never true
3095 return None
3097 gloss = re.sub(r"only used in [^,]+, ", "", gloss)
3099 # First try all formats ending with "of" (or other known last words that
3100 # can end a form description)
3101 matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss))
3102 m: Optional[re.Match]
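    # Iterate from the last "of"-like word backwards so that the longest
    # possible tag prefix (everything before that word) is tried first.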
3103 for m in reversed(matches):
3104 desc = gloss[: m.end()].strip()
3105 base = gloss[m.end() :].strip()
3106 tagsets, topics = decode_tags(desc, no_unknown_starts=True)
3107 if not topics and any(
3108 not (alt_infl_disallowed & set(ts)) for ts in tagsets
3109 ):
3110 # Successfully parsed, including "of" etc.
3111 tags: list[str] = []
3112 # If you have ("Western-Armenian", ..., "form-of") as your
3113 # tag set, it's most probable that it's something like
3114 # "Western Armenian form of խոսել (xosel)", which should
3115 # get "alt-of" instead of "form-of" (inflection).
3116 # խօսիլ/Armenian
3117 for ts_t in tagsets:
3118 if "form-of" in ts_t and any(
3119 valid_tags.get(tk) == "dialect" for tk in ts_t
3120 ):
3121 ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"}
3122 else:
3123 ts_s = set(ts_t)
3124 if not (alt_infl_disallowed & ts_s): 3124 ↛ 3117line 3124 didn't jump to line 3117 because the condition on line 3124 was always true
3125 tags.extend(ts_s)
3126 if (
3127 "alt-of" in tags
3128 or "form-of" in tags
3129 or "synonym-of" in tags
3130 or "compound-of" in tags
3131 ):
3132 break
3133 if m.group(1) == "of":
3134 # Try parsing without the final "of". This is commonly used in
3135 # various form-of expressions.
3136 desc = gloss[: m.start()]
3137 base = gloss[m.end() :]
3138 tagsets, topics = decode_tags(desc, no_unknown_starts=True)
3139 # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}"
3140 # .format(desc, base, tagsets, topics))
3141 if not topics and any(
3142 not (alt_infl_disallowed & set(t)) for t in tagsets
3143 ):
3144 tags = []
3145 for t in tagsets:
3146 if not (alt_infl_disallowed & set(t)): 3146 ↛ 3145line 3146 didn't jump to line 3145 because the condition on line 3146 was always true
3147 tags.extend(t)
3148 # It must have at least one tag from form_of_tags
3149 if set(tags) & form_of_tags:
3150 # Accept this as form-of
3151 tags.append("form-of")
3152 break
3153 if set(tags) & alt_of_tags:
3154 # Accept this as alt-of
3155 tags.append("alt-of")
3156 break
3158 else:
3159 # Did not find a form description based on last word; see if the
3160 # whole description is tags
3161 tagsets, topics = decode_tags(gloss, no_unknown_starts=True)
3162 if not topics and any(
3163 not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts)
3164 for ts in tagsets
3165 ):
3166 tags = []
3167 for ts in tagsets:
3168 if not (alt_infl_disallowed & set(ts)) and form_of_tags & set( 3168 ↛ 3167line 3168 didn't jump to line 3167 because the condition on line 3168 was always true
3169 ts
3170 ):
3171 tags.extend(ts)
3172 base = ""
3173 else:
3174 return None
3176 # kludge for Spanish (again): 'x of [word] combined with [clitic]'
3177 m = re.search(r"combined with \w+$", base)
3178 if m: 3178 ↛ 3179line 3178 didn't jump to line 3179 because the condition on line 3178 was never true
3179 tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True)
3180 if not topics:
3181 for ts in tagsets:
3182 tags.extend(ts)
3183 base = base[: m.start()]
3185 # It is fairly common for form_of glosses to end with something like
3186 # "ablative case" or "in instructive case". Parse that ending.
3187 base = base.strip()
3188 lst = base.split()
3189 # print("parse_alt_or_inflection_of: lst={}".format(lst))
3190 if len(lst) >= 3 and lst[-1] in ("case", "case."): 3190 ↛ 3191line 3190 didn't jump to line 3191 because the condition on line 3190 was never true
3191 node = valid_sequences.children.get(lst[-2])
3192 if node and node.end:
3193 for s in node.tags:
3194 tags.extend(s.split(" "))
3195 lst = lst[:-2]
3196 if lst[-1] == "in" and len(lst) > 1:
3197 lst = lst[:-1]
3199 # Eliminate empty and duplicate tags
3200 tags = sorted(set(t for t in tags if t))
3202 # Clean up some extra stuff from the linked word, separating the text
3203 # into ``base`` (the linked word) and ``extra`` (additional information,
3204 # such as English translation or clarifying word sense information).
3205 orig_base = base
3206 base = re.sub(alt_of_form_of_clean_re, "", orig_base)
3207 base = re.sub(r" [(⟨][^()]*[)⟩]", "", base) # Remove all (...) groups
3208 extra = orig_base[len(base) :]
3209 extra = re.sub(r"^[- :;.,,—]+", "", extra)
3210 if extra.endswith(".") and extra.count(".") == 1:
3211 extra = extra[:-1].strip()
3212 m = re.match(r"^\(([^()]*)\)$", extra)
3213 if m: 3213 ↛ 3214line 3213 didn't jump to line 3214 because the condition on line 3213 was never true
3214 extra = m.group(1)
3215 else:
3216 # These weird brackets are used in "slash mark"
3217 m = re.match(r"^⟨([^()]*)⟩$", extra)
3218 if m: 3218 ↛ 3219line 3218 didn't jump to line 3219 because the condition on line 3218 was never true
3219 extra = m.group(1)
3220 m = re.match(r'^[“"]([^"“”]*)["”]$', extra)
3221 if m: 3221 ↛ 3222line 3221 didn't jump to line 3222 because the condition on line 3221 was never true
3222 extra = m.group(1)
3223 # Note: base might still contain comma-separated values and values
3224 # separated by "and"
3225 base = base.strip()
3226 if base.endswith(",") and len(base) > 2: 3226 ↛ 3227line 3226 didn't jump to line 3227 because the condition on line 3226 was never true
3227 base = base[:-1].strip()
3228 while (
3229 base.endswith(".")
3230 and not wxr.wtp.page_exists(base)
3231 and base not in gloss_template_args
3232 ):
3233 base = base[:-1].strip()
3234 if base.endswith('(\u201cconjecture")'): 3234 ↛ 3235line 3234 didn't jump to line 3235 because the condition on line 3234 was never true
3235 base = base[:-14].strip()
3236 tags.append("conjecture")
3237 while ( 3237 ↛ 3242line 3237 didn't jump to line 3242 because the condition on line 3237 was never true
3238 base.endswith(".")
3239 and not wxr.wtp.page_exists(base)
3240 and base not in gloss_template_args
3241 ):
3242 base = base[:-1].strip()
3243 if ( 3243 ↛ 3248line 3243 didn't jump to line 3248 because the condition on line 3243 was never true
3244 base.endswith(".")
3245 and base not in gloss_template_args
3246 and base[:-1] in gloss_template_args
3247 ):
3248 base = base[:-1]
3249 base = base.strip()
3250 if not base:
3251 return tags, None
3253 # Kludge: Spanish verb forms seem to have a dot added at the end.
3254 # Remove it; we know of no Spanish verbs ending with a dot.
3255 language = wxr.wtp.section
3256 pos = wxr.wtp.subsection
3257 # print("language={} pos={} base={}".format(language, pos, base))
3258 if ( 3258 ↛ 3264line 3258 didn't jump to line 3264 because the condition on line 3258 was never true
3259 base.endswith(".")
3260 and len(base) > 1
3261 and base[-2].isalpha()
3262 and (language == "Spanish" and pos == "Verb")
3263 ):
3264 base = base[:-1]
3266 # Split base into alternatives when multiple alternatives are provided
3267 parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "])
3268 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "")
3269 if (
3270 len(parts) <= 1
3271 or base.startswith("/")
3272 or base.endswith("/")
3273 or "/" in titleword
3274 ):
3275 parts = [base]
3276 # Split base into alternatives when it is of the form "a or b" and "a" and
3277 # "b" are similar (generally spelling variants of the same word or similar words)
3278 if len(parts) == 1:
3279 pp = base.split()
3280 if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4:
3281 parts = [pp[0], pp[2]]
3283 # Create form-of/alt-of entries based on the extracted data
3284 dt_lst: list[AltOf] = []
3285 for p in parts:
3286 # Check for some suspicious base forms
3287 m = re.search(r"[.,] |[{}()]", p)
3288 if m and not wxr.wtp.page_exists(p): 3288 ↛ 3289line 3288 didn't jump to line 3289 because the condition on line 3288 was never true
3289 wxr.wtp.debug(
3290 "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p),
3291 sortid="form_descriptions/2278",
3292 )
3293 if p.startswith("*") and len(p) >= 3 and p[1].isalpha(): 3293 ↛ 3294line 3293 didn't jump to line 3294 because the condition on line 3293 was never true
3294 p = p[1:]
3295 dt: AltOf = {"word": p}
3296 if extra:
3297 dt["extra"] = extra
3298 dt_lst.append(dt)
3299 # print("alt_or_infl_of returning tags={} lst={} base={!r}"
3300 # .format(tags, lst, base))
3301 return tags, dt_lst
3304@functools.lru_cache(maxsize=65536)
3305def classify_desc(
3306 desc: str,
3307 allow_unknown_tags=False,
3308 no_unknown_starts=False,
3309 accepted: Union[tuple[str, ...], frozenset[str]] = tuple(),
3310) -> str:
3311 """Determines whether the given description is most likely tags, english,
3312 a romanization, or something else. Returns one of: "tags", "english",
3313 "romanization", or "other". If ``allow_unknown_tags`` is True, then
3314 allow "tags" classification even when the only tags are those starting
3315 with a word in allowed_unknown_starts."""
3316 assert isinstance(desc, str)
3317 # Empty and whitespace-only strings are treated as "other"
3318 desc = desc.strip()
3319 if not desc:
3320 return "other"
3322 normalized_desc = unicodedata.normalize("NFKD", desc)
3324 # If it can be fully decoded as tags without errors, treat as tags
3325 tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts)
3326 for tagset in tagsets:
3327 assert isinstance(tagset, (list, tuple, set))
3328 if "error-unknown-tag" not in tagset and (
3329 topics or allow_unknown_tags or any(" " not in x for x in tagset)
3330 ):
3331 return "tags"
3333 # Check if it looks like the taxonomic name of a species
3334 if desc in known_species:
3335 return "taxonomic"
3336 desc1 = re.sub(r"^×([A-Z])", r"\1", desc)
3337 desc1 = re.sub(r"\s*×.*", "", desc1)
3338 lst = desc1.split()
3339 if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts:
3340 have_non_english = 1 if lst[0].lower() not in english_words else 0
3341 for x in lst[1:]:
3342 if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"):
3343 continue
3344 if x[0].isupper():
3345 break
3346 if x not in english_words:
3347 have_non_english += 1
3348 else:
3349 # Starts with known taxonomic term, does not contain uppercase
3350 # words (except allowed letters) and at least one word is not
3351 # English
3352 if have_non_english >= len(lst) - 1 and have_non_english > 0: 3352 ↛ 3358line 3352 didn't jump to line 3358 because the condition on line 3352 was always true
3353 return "taxonomic"
3355 # If all words are in our English dictionary, interpret as English.
3356 # [ -~] is regex black magic, "ALL CHARACTERS from space to tilde"
3357 # in ASCII. Took me a while to figure out.
3358 if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1:
3359 if desc in english_words and desc[0].isalpha():
3360 return "english" # Handles ones containing whitespace
3361 desc1 = re.sub(
3362 tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc
3363 )
3364 tokens = tokenizer.tokenize(desc1)
3365 if not tokens: 3365 ↛ 3366line 3365 didn't jump to line 3366 because the condition on line 3365 was never true
3366 return "other"
3367 lst_bool = list(
3368 x not in not_english_words
3369 and
3370 # not x.isdigit() and
3371 (
3372 x in english_words
3373 or x.lower() in english_words
3374 or x in known_firsts
3375 or x[0].isdigit()
3376 or x in accepted
3377 or
3378 # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or
3379 (
3380 x.endswith("s") and len(x) >= 4 and x[:-1] in english_words
3381 ) # Plural
3382 or (
3383 x.endswith("ies")
3384 and len(x) >= 5
3385 and x[:-3] + "y" in english_words
3386 ) # E.g. lily - lilies
3387 or (
3388 x.endswith("ing")
3389 and len(x) >= 5
3390 and x[:-3] in english_words
3391 ) # E.g. bring - bringing
3392 or (
3393 x.endswith("ing")
3394 and len(x) >= 5
3395 and x[:-3] + "e" in english_words
3396 ) # E.g., tone - toning
3397 or (
3398 x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words
3399 ) # E.g. hang - hanged
3400 or (
3401 x.endswith("ed")
3402 and len(x) >= 5
3403 and x[:-2] + "e" in english_words
3404 ) # E.g. atone - atoned
3405 or (x.endswith("'s") and x[:-2] in english_words)
3406 or (x.endswith("s'") and x[:-2] in english_words)
3407 or (
3408 x.endswith("ise")
3409 and len(x) >= 5
3410 and x[:-3] + "ize" in english_words
3411 )
3412 or (
3413 x.endswith("ised")
3414 and len(x) >= 6
3415 and x[:-4] + "ized" in english_words
3416 )
3417 or (
3418 x.endswith("ising")
3419 and len(x) >= 7
3420 and x[:-5] + "izing" in english_words
3421 )
3422 or (
3423 re.search(r"[-/]", x)
3424 and all(
3425 ((y in english_words and len(y) > 2) or not y)
3426 for y in re.split(r"[-/]", x)
3427 )
3428 )
3429 )
3430 for x in tokens
3431 )
3432 cnt = lst_bool.count(True)
3433 rejected_words = tuple(
3434 x for i, x in enumerate(tokens) if not lst_bool[i]
3435 )
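# The description is accepted as English only if it contains at least one
# passing alphabetic token longer than one character, is not a bare affix
# (leading or trailing hyphen), contains at least one word character, and
# either every token passed the check above, all but one passed (with some
# passing token longer than three characters), at least 80% passed, or at
# least half passed and every rejected token is in potentially_english_words.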
3436 if (
3437 any(
3438 lst_bool[i] and x[0].isalpha() and len(x) > 1
3439 for i, x in enumerate(tokens)
3440 )
3441 and not desc.startswith("-")
3442 and not desc.endswith("-")
3443 and re.search(r"\w+", desc)
3444 and (
3445 cnt == len(lst_bool)
3446 or (
3447 any(
3448 lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens)
3449 )
3450 and cnt >= len(lst_bool) - 1
3451 )
3452 or cnt / len(lst_bool) >= 0.8
3453 or (
3454 all(x in potentially_english_words for x in rejected_words)
3455 and cnt / len(lst_bool) >= 0.50
3456 )
3457 )
3458 ):
3459 return "english"
3460 # Some translations have apparent pronunciation descriptions in /.../
3461 # which we'll put in the romanization field (even though they probably are
3462 # not exactly romanizations).
3463 if desc.startswith("/") and desc.endswith("/"):
3464 return "romanization"
3465 # If all characters are in classes that could occur in romanizations,
3466 # treat as romanization
3467 classes = list(
3468 unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK"
3469 for x in normalized_desc
3470 )
3471 classes1 = []
3472 num_latin = 0
3473 num_greek = 0
3474 # part = ""
3475 # for ch, cl in zip(normalized_desc, classes):
3476 # part += f"{ch}({cl})"
3477 # print(part)
3478 for ch, cl in zip(normalized_desc, classes):
3479 if ch in (
3480 "'", # ' in Arabic, / in IPA-like parenthesized forms
3481 ".", # e.g., "..." in translations
3482 ";",
3483 ":",
3484 "!",
3485 "‘",
3486 "’",
3487 '"',
3488 "“",
3489 "”",
3490 "/",
3491 "?",
3492 "…", # alternative to "..."
3493 "⁉", # 見る/Japanese automatic transcriptions...
3494 "?",
3495 "!",
3496 "⁻", # superscript minus, used in some Cantonese romanizations, e.g. in the entry "we"
3497 "ʔ",
3498 "ʼ",
3499 "ʾ",
3500 "ʹ",
3501 ): # ʹ e.g. in understand/English/verb Russian translation
3502 classes1.append("OK")
3503 continue
3504 if cl not in ("Ll", "Lu"):
3505 classes1.append(cl)
3506 continue
3507 try:
3508 name = unicodedata.name(ch)
3509 first = name.split()[0]
3510 if first == "LATIN":
3511 num_latin += 1
3512 elif first == "GREEK":
3513 num_greek += 1
3514 elif first == "COMBINING": # Combining diacritic 3514 ↛ 3515 line 3514 didn't jump to line 3515 because the condition on line 3514 was never true
3515 cl = "OK"
3516 elif re.match(non_latin_scripts_re, name): 3516 ↛ 3520 line 3516 didn't jump to line 3520 because the condition on line 3516 was always true
3517 cl = "NO" # Not acceptable in romanizations
3518 except ValueError:
3519 cl = "NO" # Not acceptable in romanizations
3520 classes1.append(cl)
3521 # print("classify_desc: {!r} classes1: {}".format(desc, classes1))
3522 # print(set(classes1) )
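# Classify as a romanization only when every character fell into an allowed
# category (letters, combining marks, spaces, digits, or accepted
# punctuation), Latin letters outnumber Greek letters by at least two (or
# there are no Greek letters at all), and the string does not consist solely
# of accepted punctuation nor solely of digits.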
3523 if all(
3524 x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK")
3525 for x in classes1
3526 ):
3527 if (
3528 (num_latin >= num_greek + 2 or num_greek == 0)
3529 and classes1.count("OK") < len(classes1)
3530 and classes1.count("Nd") < len(classes1)
3531 ):
3532 return "romanization"
3533 # Otherwise it is something else, such as a hanji version of the word
3534 return "other"
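# Illustrative sketch (not part of the original module): a small, never-called
# helper showing the kind of input each return value of classify_desc() is
# meant to cover. The example strings and the categories noted next to them
# are assumptions for illustration, not cases from the test suite, and the
# call assumes classify_desc()'s optional parameters keep their defaults.
def _classify_desc_examples() -> None:
    examples = [
        "masculine plural",  # decodes cleanly as tags -> likely "tags"
        "a kind of bread",   # ordinary English words -> likely "english"
        "Quercus robur",     # known species name -> likely "taxonomic"
        "/tɕa˥˩/",           # slash-delimited transcription -> "romanization"
        "日本語",             # non-Latin script -> "other"
    ]
    for example in examples:
        print(f"{example!r}: {classify_desc(example)}")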
3537def remove_text_in_parentheses(text: str) -> str:
3538 parentheses = 0
3539 new_text = ""
3540 for c in text:
3541 if c == "(":
3542 parentheses += 1
3543 elif c == ")":
3544 parentheses -= 1
3545 elif parentheses == 0:
3546 new_text += c
3547 return new_text
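# Worked example: the parenthesis counter above also handles nesting, so
# remove_text_in_parentheses("foo (bar (baz)) qux") returns "foo  qux"
# (the spaces on either side of the removed span are both kept).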