Coverage for src/wiktextract/extractor/en/form_descriptions.py: 78%
1324 statements
coverage.py v7.13.0, created at 2025-12-09 05:43 +0000
1# Code for parsing linguistic form descriptions and tags for word senses
2# (both the word entry head - initial part and parenthesized parts -
3# and tags at the beginning of word senses)
4#
5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
7import functools
8import re
9import unicodedata
10from typing import (
11 Any,
12 Literal,
13 Optional,
14 Sequence,
15 Union,
16)
18import Levenshtein
19from nltk import TweetTokenizer # type:ignore[import-untyped]
21from ...datautils import data_append, data_extend, split_at_comma_semi
22from ...tags import (
23 alt_of_tags,
24 form_of_tags,
25 head_final_bantu_langs,
26 head_final_bantu_map,
27 head_final_numeric_langs,
28 head_final_other_langs,
29 head_final_other_map,
30 head_final_semitic_langs,
31 head_final_semitic_map,
32 uppercase_tags,
33 valid_tags,
34 xlat_descs_map,
35 xlat_head_map,
36 xlat_tags_map,
37)
38from ...topics import topic_generalize_map, valid_topics
39from ...wxr_context import WiktextractContext
40from .english_words import (
41 english_words,
42 not_english_words,
43 potentially_english_words,
44)
45from .form_descriptions_known_firsts import known_firsts
46from .taxondata import known_species
47from .type_utils import (
48 AltOf,
49 FormData,
50 LinkageData,
51 SenseData,
52 SoundData,
53 TranslationData,
54 WordData,
55)
57# Tokenizer for classify_desc()
58tokenizer = TweetTokenizer()
60# These are ignored as the value of a related form in form head.
61IGNORED_RELATED: set[str] = set(
62 [
63 "-",
64 "־",
65 "᠆",
66 "‐",
67 "‑",
68 "‒",
69 "–",
70 "—",
71 "―",
72 "−",
73 "⸺",
74 "⸻",
75 "﹘",
76 "﹣",
77 "-",
78 "?",
79 "(none)",
80 ]
81)
84# First words of unicodedata.name() that indicate scripts that cannot be
85# accepted in romanizations or English (i.e., should be considered "other"
86# in classify_desc()).
87non_latin_scripts: list[str] = [
88 "ADLAM",
89 "ARABIC",
90 "ARABIC-INDIC",
91 "ARMENIAN",
92 "BALINESE",
93 "BENGALI",
94 "BRAHMI",
95 "BRAILLE",
96 "CANADIAN",
97 "CHAKMA",
98 "CHAM",
99 "CHEROKEE",
100 "CJK",
101 "COPTIC",
102 "COUNTING ROD",
103 "CUNEIFORM",
104 "CYRILLIC",
105 "DOUBLE-STRUCK",
106 "EGYPTIAN",
107 "ETHIOPIC",
108 "EXTENDED ARABIC-INDIC",
109 "GEORGIAN",
110 "GLAGOLITIC",
111 "GOTHIC",
112 "GREEK",
113 "GUJARATI",
114 "GURMUKHI",
115 "HANGUL",
116 "HANIFI ROHINGYA",
117 "HEBREW",
118 "HIRAGANA",
119 "JAVANESE",
120 "KANNADA",
121 "KATAKANA",
122 "KAYAH LI",
123 "KHMER",
124 "KHUDAWADI",
125 "LAO",
126 "LEPCHA",
127 "LIMBU",
128 "MALAYALAM",
129 "MEETEI",
130 "MYANMAR",
131 "NEW TAI LUE",
132 "NKO",
133 "OL CHIKI",
134 "OLD PERSIAN",
135 "OLD SOUTH ARABIAN",
136 "ORIYA",
137 "OSMANYA",
138 "PHOENICIAN",
139 "SAURASHTRA",
140 "SHARADA",
141 "SINHALA",
142 "SUNDANESE",
143 "SYLOTI",
144 "TAI THAM",
145 "TAKRI",
146 "TAMIL",
147 "TELUGU",
148 "THAANA",
149 "THAI",
150 "TIBETAN",
151 "TIFINAGH",
152 "TIRHUTA",
153 "UGARITIC",
154 "WARANG CITI",
155 "YI",
156]
157non_latin_scripts_re = re.compile(
158 r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b"
159)
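# Illustrative example (added to this annotated listing, not part of the
# original module): unicodedata.name() yields e.g. "HIRAGANA LETTER A" for
# "あ", and the first word of that name is what the regexp above checks, so
# that classify_desc() (defined further below) can treat such text as
# "other" rather than as a romanization or English.
def _non_latin_script_examples() -> None:
    assert unicodedata.name("あ") == "HIRAGANA LETTER A"
    assert non_latin_scripts_re.match(unicodedata.name("あ")) is not None
    # Latin letters do not match, so they may appear in romanizations.
    assert non_latin_scripts_re.match(unicodedata.name("x")) is None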
161# Sanity check xlat_head_map values
162for k, v in xlat_head_map.items():
163 if v.startswith("?"):
164 v = v[1:]
165 for tag in v.split():
166 if tag not in valid_tags:  # 166 ↛ 167: line 166 didn't jump to line 167 because the condition on line 166 was never true
167 print(
168 "WARNING: xlat_head_map[{}] contains unrecognized tag {}".format(
169 k, tag
170 )
171 )
173# Regexp for finding nested translations from translation items (these are
174# used in, e.g., year/English/Translations/Arabic). This is actually used
175# in page.py.
176nested_translations_re = re.compile(
177 r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format(
178 "|".join(
179 re.escape(x.removeprefix("?"))
180 for x in sorted(xlat_head_map.values(), key=len, reverse=True)
181 if x and not x.startswith("class-")
182 )
183 )
184)
186# Regexp that matches head tag specifiers. Used to match tags from end of
187# translations and linkages
188head_final_re_text = r"( -)?( ({}))+".format(
189 "|".join(
190 re.escape(x)
191 for x in
192 # The sort is to put longer ones first, preferring them in
193 # the regexp match
194 sorted(xlat_head_map.keys(), key=len, reverse=True)
195 )
196)
197head_final_re = re.compile(head_final_re_text + "$")
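# Illustrative sketch (added, not in the original module): head_final_re only
# matches at the very end of a form and each specifier must be preceded by a
# space. This assumes "f" is among xlat_head_map's keys (it is a standard
# gender abbreviation in the tags table).
def _head_final_examples() -> None:
    m = head_final_re.search("casa f")
    assert m is not None and m.group(0) == " f"
    # No space-separated specifier at the end, so no match at all.
    assert head_final_re.search("casa") is None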
199# Regexp used to match head tag specifiers at end of a form for certain
200# Bantu languages (particularly Swahili and similar languages).
201head_final_bantu_re_text = r" ({})".format(
202 "|".join(re.escape(x) for x in head_final_bantu_map.keys())
203)
204head_final_bantu_re = re.compile(head_final_bantu_re_text + "$")
206# Regexp used to match head tag specifiers at end of a form for certain
207# Semitic languages (particularly Arabic and similar languages).
208head_final_semitic_re_text = r" ({})".format(
209 "|".join(re.escape(x) for x in head_final_semitic_map.keys())
210)
211head_final_semitic_re = re.compile(head_final_semitic_re_text + "$")
213# Regexp used to match head tag specifiers at end of a form for certain
214# other languages (e.g., Lithuanian, Finnish, French).
215head_final_other_re_text = r" ({})".format(
216 "|".join(re.escape(x) for x in head_final_other_map.keys())
217)
218head_final_other_re = re.compile(head_final_other_re_text + "$")
220# Regexp for splitting heads. See parse_word_head().
221head_split_re_text = (
222 "("
223 + head_final_re_text
224 + "|"
225 + head_final_bantu_re_text
226 + "|"
227 + head_final_semitic_re_text
228 + "|"
229 + head_final_other_re_text
230 + ")?( or |[,;]+)"
231)
232head_split_re = re.compile(head_split_re_text)
233head_split_re_parens = 0
234for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text):
235 head_split_re_parens += m.group(0).count("(")
237# Parenthesized parts that are ignored in translations
238tr_ignored_parens: set[str] = set(
239 [
240 "please verify",
241 "(please verify)",
242 "transliteration needed",
243 "(transliteration needed)",
244 "in words with back vowel harmony",
245 "(in words with back vowel harmony)",
246 "in words with front vowel harmony",
247 "(in words with front vowel harmony)",
248 "see below",
249 "see usage notes below",
250 ]
251)
252tr_ignored_parens_re = re.compile(
253 r"^("
254 + "|".join(re.escape(x) for x in tr_ignored_parens)
255 + ")$"
256 + r"|^(Can we clean up|Can we verify|for other meanings see "
257 r"lit\. )"
258)
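# Illustrative example (added): the first branch of the regexp must match the
# whole parenthesized text, the second only needs to match its beginning.
def _tr_ignored_parens_examples() -> None:
    assert tr_ignored_parens_re.search("please verify") is not None
    assert tr_ignored_parens_re.search("Can we verify this one?") is not None
    assert tr_ignored_parens_re.search("formal") is None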
260# Translations that are ignored
261ignored_translations: set[str] = set(
262 [
263 "[script needed]",
264 "please add this translation if you can",
265 ]
266)
268# Put English text into the "note" field in a translation if it contains one
269# of these words
270tr_note_re = re.compile(
271 r"(\b(article|definite|indefinite|superlative|comparative|pattern|"
272 r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|"
273 r"postposition|postp|action|actions|articles|"
274 r"adverb|adverbs|noun|nouns|verb|verbs|before|"
275 r"after|placed|prefix|suffix|used with|translated|"
276 r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|"
277 r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|"
278 r"conjugation|declension|class|category|plural|singular|positive|"
279 r"seldom used|formal|informal|familiar|unspoken|spoken|written|"
280 r"indicative|progressive|conditional|potential|"
281 r"accusative|adessive|inessive|superessive|elative|allative|"
282 r"dialect|dialects|object|subject|predicate|movies|recommended|language|"
283 r"locative|continuous|simple|continuousness|gerund|subjunctive|"
284 r"periphrastically|no equivalent|not used|not always used|"
285 r"used only with|not applicable|use the|signifying|wordplay|pronounced|"
286 r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|"
287 r"may be replaced|stricter sense|for nonhumans|"
288 r"sense:|used:|in full:|informally used|followed by|"
289 r"not restricted to|pertaining to|or optionally with|are optional|"
290 r"in conjunction with|in compounds|depending on the relationship|"
291 r"person addressed|one person|multiple persons|may be replaced with|"
292 r"optionally completed with|in the phrase|in response to|"
293 r"before a|before an|preceded by|verbs ending|very common|after a verb|"
294 r"with verb|with uncountable|with the objects|with stative|"
295 r"can be replaced by|often after|used before|used after|"
296 r"used in|clipping of|spoken|somewhat|capitalized|"
297 r"short form|shortening of|shortened form|initialism of|"
298 r"said to|rare:|rarer also|is rarer|negatively connoted|"
299 r"previously mentioned|uncountable noun|countable noun|"
300 r"countable nouns|uncountable nouns|"
301 r"with predicative|with -|with imperfect|with a negated|"
302 r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|"
303 r'"|'
304 r"general term|after a vowel|before a vowel|"
305 r"form|regular|irregular|alternative)"
306 r")($|[) ])|^("
307 # Following are only matched at the beginning of the string
308 r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:|e\.g\.,|cf\.|compare|such as|"
309 r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|"
310 r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|"
311 r"mainly|from|for|also|also:|acronym|"
312 r"\+|with) "
313)
314# \b does not work at the end???
316# Related forms matching this regexp will be considered suspicious if the
317# page title does not also match one of these.
318suspicious_related_re = re.compile(
319 r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)"
320 r"|[][:=<>&#*|]"
321 r"| \d+$"
322)
324# Word forms (head forms, translations, etc) that will be considered ok and
325# silently accepted even if they would otherwise trigger a suspicious
326# form warning.
327ok_suspicious_forms: set[str] = set(
328 [
329 "but en or", # "golden goal"/English/Tr/French
330 "cœur en or", # "heart of gold"/Eng/Tr/French
331 "en or", # golden/Eng/Tr/French
332 "men du", # jet/Etym2/Noun/Tr/Cornish
333 "parachute en or", # "golden parachute"/Eng/Tr/French
334 "vieil or", # "old gold"/Eng/Tr/French
335 # "all that glitters is not gold"/Eng/Tr/French
336 "tout ce qui brille n’est pas or",
337 "μη αποκλειστικό or", # inclusive or/Eng/Tr/Greek
338 "period or full stop",
339 ]
340)
343# Replacements to be done in classify_desc before tokenizing. This is a
344# workaround for shortcomings in TweetTokenizer.
345tokenizer_fixup_map = {
346 r"a.m.": "AM",
347 r"p.m.": "PM",
348}
349tokenizer_fixup_re = re.compile(
350 r"\b("
351 + "|".join(
352 re.escape(x)
353 for x in sorted(
354 tokenizer_fixup_map.keys(), key=lambda x: len(x), reverse=True
355 )
356 )
357 + r")"
358)
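# Sketch (added) of how the fixup table is presumably applied before
# tokenizing; the actual call site is in classify_desc(), further below, and
# _apply_tokenizer_fixups is a hypothetical helper name used only here.
def _apply_tokenizer_fixups(text: str) -> str:
    return re.sub(
        tokenizer_fixup_re,
        lambda m: tokenizer_fixup_map[m.group(0)],
        text,
    )
# e.g. _apply_tokenizer_fixups("at 5 p.m. sharp") == "at 5 PM sharp"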
360# Unknown tags starting with these words will be silently ignored.
361ignored_unknown_starts: set[str] = set(
362 [
363 "originally",
364 "e.g.",
365 "c.f.",
366 "supplanted by",
367 "supplied by",
368 ]
369)
371ignored_unknown_starts_re = re.compile(
372 r"^("
373 + "|".join(
374 re.escape(x)
375 for x in sorted(ignored_unknown_starts, key=lambda x: -len(x))
376 )
377 + ") "
378)
380# If an unknown sequence starts with one of these, it will continue as an
381# unknown sequence until the end, unless it turns out to have a replacement.
382allowed_unknown_starts: set[str] = set(
383 [
384 "Relating",
385 "accompanied",
386 "added",
387 "after",
388 "answering",
389 "as",
390 "based",
391 "before",
392 "conjugated",
393 "conjunction",
394 "construed",
395 "especially",
396 "expression:",
397 "figurative:",
398 "followed",
399 "for",
400 "forms",
401 "from",
402 "governs",
403 "in",
404 "indicating",
405 "modifying",
406 "normally",
407 "not",
408 "of",
409 "preceding",
410 "prefixed",
411 "referring",
412 "relating",
413 "revived",
414 "said",
415 "since",
416 "takes",
417 "used",
418 "with",
419 "With",
420 "without",
421 ]
422)
423# Allow the ignored unknown starts without complaining
424allowed_unknown_starts.update(ignored_unknown_starts)
426# Full unknown tags that will be ignored in decode_tags()
428# XXX this is unused, ask Tatu where the contents are now
428ignored_unknown_tags: set[str] = set([])
430# Head endings that are mapped to tags
431head_end_map = {
432 " 1st conj.": "conjugation-1",
433 " 2nd conj.": "conjugation-2",
434 " 3rd conj.": "conjugation-3",
435 " 4th conj.": "conjugation-4",
436 " 5th conj.": "conjugation-5",
437 " 6th conj.": "conjugation-6",
438 " 7th conj.": "conjugation-7",
439}
440head_end_re = re.compile(
441 r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$"
442)
445# Dictionary of language-specific parenthesized head part starts that
446# either introduce new tags or modify previous tags. The value for each
447# language is a dictionary that maps the first word of the head part to
448# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous
449# tags or a space-separated string of tags to remove, and ``add_tags`` should
450# be a string of tags to add.
451lang_specific_head_map: dict[
452 str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]]
453] = {
454 "Danish": {
455 # prefix: (rem_tags space separate string/True, add_tags s-sep str)
456 "c": ("neuter", "common-gender"),
457 "n": ("common-gender", "neuter"),
458 "pl": ("singular neuter common-gender", "plural"),
459 "sg": ("plural neuter common-gender", "singular"),
460 },
461}
464# Regular expression used to strip additional stuff from the end of alt_of and
465# form_of.
466alt_of_form_of_clean_re = re.compile(
467 r"(?s)("
468 + "|".join(
469 [
470 r":",
471 r'[“"]',
472 r";",
473 r" \(",
474 r" - ",
475 r" ־ ",
476 r" ᠆ ",
477 r" ‐ ",
478 r" ‑ ",
479 r" ‒ ",
480 r" – ",
481 r" — ",
482 r" ― ",
483 r" − ",
484 r" ⸺ ",
485 r" ⸻ ",
486 r" ﹘ ",
487 r" ﹣ ",
488 r" - ",
489 r" \+ ",
490 r" \(with ",
491 r" with -ra/-re",
492 r"\. Used ",
493 r"\. Also ",
494 r"\. Since ",
495 r"\. A ",
496 r"\.\. A ",
497 r"\. An ",
498 r"\.\. An ",
499 r"\. an ",
500 r"\. The ",
501 r"\. Spanish ",
502 r"\. Language ",
503 r"\. former name of ",
504 r"\. AIM",
505 r"\. OT",
506 r"\. Not ",
507 r"\. Now ",
508 r"\. Nowadays ",
509 r"\. Early ",
510 r"\. ASEAN",
511 r"\. UN",
512 r"\. IMF",
513 r"\. WHO",
514 r"\. WIPO",
515 r"\. AC",
516 r"\. DC",
517 r"\. DNA",
518 r"\. RNA",
519 r"\. SOB",
520 r"\. IMO",
521 r"\. Behavior",
522 r"\. Income ",
523 r"\. More ",
524 r"\. Most ",
525 r"\. Only ",
526 r"\. Also ",
527 r"\. From ",
528 r"\. Of ",
529 r"\.\. Of ",
530 r"\. To ",
531 r"\. For ",
532 r"\. If ",
533 r"\. Praenominal ",
534 r"\. This ",
535 r"\. Replaced ",
536 r"\. CHCS is the ",
537 r"\. Equivalent ",
538 r"\. Initialism ",
539 r"\. Note ",
540 r"\. Alternative ",
541 r"\. Compare ",
542 r"\. Cf\. ",
543 r"\. Comparable ",
544 r"\. Involves ",
545 r"\. Sometimes ",
546 r"\. Commonly ",
547 r"\. Often ",
548 r"\. Typically ",
549 r"\. Possibly ",
550 r"\. Although ",
551 r"\. Rare ",
552 r"\. Instead ",
553 r"\. Integrated ",
554 r"\. Distinguished ",
555 r"\. Given ",
556 r"\. Found ",
557 r"\. Was ",
558 r"\. In ",
559 r"\. It ",
560 r"\.\. It ",
561 r"\. One ",
562 r"\. Any ",
563 r"\. They ",
564 r"\. Members ",
565 r"\. Each ",
566 r"\. Original ",
567 r"\. Especially ",
568 r"\. Usually ",
569 r"\. Known ",
570 r"\.\. Known ",
571 r"\. See ",
572 r"\. see ",
573 r"\. target was not ",
574 r"\. Popular ",
575 r"\. Pedantic ",
576 r"\. Positive ",
577 r"\. Society ",
578 r"\. Plan ",
579 r"\. Environmentally ",
580 r"\. Affording ",
581 r"\. Encompasses ",
582 r"\. Expresses ",
583 r"\. Indicates ",
584 r"\. Text ",
585 r"\. Large ",
586 r"\. Sub-sorting ",
587 r"\. Sax",
588 r"\. First-person ",
589 r"\. Second-person ",
590 r"\. Third-person ",
591 r"\. 1st ",
592 r"\. 2nd ",
593 r"\. 3rd ",
594 r"\. Term ",
595 r"\. Northeastern ",
596 r"\. Northwestern ",
597 r"\. Southeast ",
598 r"\. Egyptian ",
599 r"\. English ",
600 r"\. Cape Province was split into ",
601 r"\. Pañcat",
602 r"\. of the ",
603 r"\. is ",
604 r"\. after ",
605 r"\. or ",
606 r"\. chromed",
607 r"\. percussion",
608 r"\. with his ",
609 r"\. a\.k\.a\. ",
610 r"\. comparative form ",
611 r"\. singular ",
612 r"\. plural ",
613 r"\. present ",
614 r"\. his ",
615 r"\. her ",
616 r"\. equivalent ",
617 r"\. measuring ",
618 r"\. used in ",
619 r"\. cutely ",
620 r"\. Protects",
621 r'\. "',
622 r"\.^",
623 r"\. \+ ",
624 r"\., ",
625 r". — ",
626 r", a ",
627 r", an ",
628 r", the ",
629 r", obsolete ",
630 r", possessed", # 'd/English
631 r", imitating", # 1/English
632 r", derived from",
633 r", called ",
634 r", especially ",
635 r", slang for ",
636 r" corresponding to ",
637 r" equivalent to ",
638 r" popularized by ",
639 r" denoting ",
640 r" in its various senses\.",
641 r" used by ",
642 r" but not for ",
643 r" since ",
644 r" i\.e\. ",
645 r" i\. e\. ",
646 r" e\.g\. ",
647 r" eg\. ",
648 r" etc\. ",
649 r"\[http",
650 r" — used as ",
651 r" by K\. Forsyth ",
652 r" by J\. R\. Allen ",
653 r" by S\. Ferguson ",
654 r" by G\. Donaldson ",
655 r" May refer to ",
656 r" An area or region ",
657 ]
658 )
659 + r").*$"
660)
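# Illustrative example (added): substituting with this pattern drops
# everything from the first matched separator to the end of the string.
def _alt_of_clean_examples() -> None:
    cleaned = alt_of_form_of_clean_re.sub(
        "", "colour: alternative form of color"
    )
    assert cleaned == "colour"
    assert alt_of_form_of_clean_re.sub("", "foo. Also used as a noun") == "foo"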
663class ValidNode:
664 """Node in the valid_sequences tree. Each node is part of a chain
665 or chains that form sequences built out of keys in key->tags
666 maps like xlat_tags, etc. The ValidNode's 'word' is the key
667 by which it is referred to in the root dict or a `children` dict,
668 `end` marks that the node is the end-terminus of a sequence (but
669 it can still continue if the sequence is shared by the start of
670 other sequences: "nominative$" and "nominative plural$" for example),
671 `tags` and `topics` are the dicts containing tag and topic strings
672 for terminal nodes (end==True)."""
674 __slots__ = (
675 "end",
676 "tags",
677 "topics",
678 "children",
679 )
681 def __init__(
682 self,
683 end=False,
684 tags: Optional[list[str]] = None,
685 topics: Optional[list[str]] = None,
686 children: Optional[dict[str, "ValidNode"]] = None,
687 ) -> None:
688 self.end = end
689 self.tags: list[str] = tags or []
690 self.topics: list[str] = topics or []
691 self.children: dict[str, "ValidNode"] = children or {}
694def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None:
695 """Helper function for building trees of valid tags/sequences during
696 initialization."""
697 assert isinstance(tree, ValidNode)
698 assert isinstance(desc, str)
699 assert v is None or isinstance(v, str)
700 node = tree
702 # Build the tree structure: each node has children nodes
703 # whose names are denoted by their dict key.
704 for w in desc.split(" "):
705 if w in node.children:
706 node = node.children[w]
707 else:
708 new_node = ValidNode()
709 node.children[w] = new_node
710 node = new_node
711 if not node.end:
712 node.end = True
713 if not v:
714 return None # Terminate early because there are no tags
716 tagslist = []
717 topicslist = []
718 for vv in v.split():
719 if vv in valid_tags:
720 tagslist.append(vv)
721 elif vv in valid_topics:  # 721 ↛ 724: line 721 didn't jump to line 724 because the condition on line 721 was always true
722 topicslist.append(vv)
723 else:
724 print(
725 "WARNING: tag/topic {!r} maps to unknown {!r}".format(desc, vv)
726 )
727 topics = " ".join(topicslist)
728 tags = " ".join(tagslist)
729 # Changed to "_tags" and "_topics" to avoid possible key-collisions.
730 if topics:
731 node.topics.extend([topics])
732 if tags:
733 node.tags.extend([tags])
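# Illustrative example (added, not in the original module): building a tiny
# tree with two overlapping sequences, as described in the ValidNode
# docstring ("nominative" also starts "nominative plural"). Note that each
# call stores its tags as one space-joined string.
def _valid_tree_example() -> None:
    root = ValidNode()
    add_to_valid_tree(root, "nominative", "nominative")
    add_to_valid_tree(root, "nominative plural", "nominative plural")
    nom = root.children["nominative"]
    assert nom.end and nom.tags == ["nominative"]
    nom_pl = nom.children["plural"]
    assert nom_pl.end and nom_pl.tags == ["nominative plural"]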
736def add_to_valid_tree1(
737 tree: ValidNode,
738 k: str,
739 v: Union[list[str], tuple[str, ...], str],
740 valid_values: Union[set[str], dict[str, Any]],
741) -> list[str]:
742 assert isinstance(tree, ValidNode)
743 assert isinstance(k, str)
744 assert v is None or isinstance(v, (list, tuple, str))
745 assert isinstance(valid_values, (set, dict))
746 if not v:  # 746 ↛ 747: line 746 didn't jump to line 747 because the condition on line 746 was never true
747 add_to_valid_tree(valid_sequences, k, None)
748 return []
749 elif isinstance(v, str):
750 v = [v]
751 q = []
752 for vv in v:
753 assert isinstance(vv, str)
754 add_to_valid_tree(valid_sequences, k, vv)
755 vvs = vv.split()
756 for x in vvs:
757 q.append(x)
758 # return each individual tag
759 return q
762def add_to_valid_tree_mapping(
763 tree: ValidNode,
764 mapping: Union[dict[str, Union[list[str], str]], dict[str, str]],
765 valid_values: Union[set[str], dict[str, Any]],
766 recurse: bool,
767) -> None:
768 assert isinstance(tree, ValidNode)
769 assert isinstance(mapping, dict)
770 assert isinstance(valid_values, (set, dict))
771 assert recurse in (True, False)
772 for k, v in mapping.items():
773 assert isinstance(k, str)
774 assert isinstance(v, (list, str))
775 if isinstance(v, str):
776 q = add_to_valid_tree1(tree, k, [v], valid_values)
777 else:
778 q = add_to_valid_tree1(tree, k, v, valid_values)
779 if recurse:
780 visited = set()
781 while q:
782 v = q.pop()
783 if v in visited:
784 continue
785 visited.add(v)
786 if v not in mapping:
787 continue
788 vv = mapping[v]
789 qq = add_to_valid_tree1(tree, k, vv, valid_values)
790 q.extend(qq)
793# Tree of sequences considered to be tags (includes sequences that are
794# mapped to something that becomes one or more valid tags)
795valid_sequences = ValidNode()
796sequences_with_slashes: set[str] = set()
797for tag in valid_tags:
798 # The basic tags used in our tag system; some are a bit weird, but it is
799 # easier to implement this with 'false' positives than to filter out
800 # stuff that no one else uses.
801 if "/" in tag:
802 sequences_with_slashes.add(tag)
803 add_to_valid_tree(valid_sequences, tag, tag)
804for tag in uppercase_tags:
805 hyphenated = re.sub(r"\s+", "-", tag)
806 if "/" in tag:
807 sequences_with_slashes.add(tag)
808 add_to_valid_tree(valid_sequences, tag, hyphenated)
810# xlat_tags_map!
811add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False)
812for k in xlat_tags_map:
813 if "/" in k:
814 sequences_with_slashes.add(k)
815# Add topics to the same table, with all generalized topics also added
816for topic in valid_topics:
817 assert " " not in topic
818 if "/" in topic: 818 ↛ 819line 818 didn't jump to line 819 because the condition on line 818 was never true
819 sequences_with_slashes.add(topic)
820 add_to_valid_tree(valid_sequences, topic, topic)
821# Let each original topic value stand alone. These are not generally on
822# valid_topics. We add the original topics with spaces replaced by hyphens.
823for topic in topic_generalize_map.keys():
824 hyphenated = re.sub(r"\s+", "-", topic)
825 if "/" in topic: 825 ↛ 826line 825 didn't jump to line 826 because the condition on line 825 was never true
826 sequences_with_slashes.add(topic)
827 add_to_valid_tree(valid_sequences, topic, hyphenated)
828# Add canonicalized/generalized topic values
829add_to_valid_tree_mapping(
830 valid_sequences, topic_generalize_map, valid_topics, True
831)
833# Regex used to divide a decode candidate into parts that shouldn't
834# have their slashes turned into spaces
835slashes_re = re.compile(
836 r"(" + "|".join((re.escape(s) for s in sequences_with_slashes)) + r")"
837)
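# Illustrative example (added) of the re.split() behaviour that decode_tags()
# relies on below: with a capturing group in the pattern, the matched keys
# land at odd indices and are kept verbatim, while the other chunks get their
# slashes replaced by spaces. The pattern here is a stand-in, not slashes_re.
def _slash_split_example() -> None:
    demo_re = re.compile(r"(masculine/feminine)")
    src = "genitive masculine/feminine singular/plural"
    parts = re.split(demo_re, src)
    assert parts == ["genitive ", "masculine/feminine", " singular/plural"]
    cleaned = "".join(
        p if i % 2 else p.replace("/", " ") for i, p in enumerate(parts)
    )
    assert cleaned == "genitive masculine/feminine singular plural"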
839# Regexp used to find "words" from word heads and linguistic descriptions
840word_pattern = (
841 r"[^ ,;()\u200e]+|"
842 r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|"
843 r"[\u2800-\u28ff]|" # Braille characters
844 r"\(([^()]|\([^()]*\))*\)"
845)
847word_re_global = re.compile(word_pattern)
850def distw(titleparts: Sequence[str], word: str) -> float:
851 """Computes how distinct ``word`` is from the most similar word in
852 ``titleparts``. Returns 1 if the words are completely distinct, 0 if
853 they are identical, or otherwise something in between."""
854 assert isinstance(titleparts, (list, tuple))
855 assert isinstance(word, str)
856 w = min(
857 Levenshtein.distance(word, tw) / max(len(tw), len(word))
858 for tw in titleparts
859 )
860 return w
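# Worked examples (added for illustration): the return value is the
# Levenshtein distance divided by the length of the longer word, taken
# against the closest word in titleparts.
def _distw_examples() -> None:
    assert distw(["cat"], "cat") == 0.0    # identical
    assert distw(["cat"], "dog") == 1.0    # 3 edits / 3 characters
    assert distw(["cat"], "cats") == 0.25  # 1 edit / 4 characters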
863def map_with(
864 ht: dict[str, str | list[str]] | dict[str, str],
865 lst: Sequence[str],
866) -> list[str]:
867 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or
868 more alternatives each, and returns a combined list of alternatives."""
869 assert isinstance(ht, dict)
870 assert isinstance(lst, (list, tuple))
871 ret = []
872 for x in lst:
873 assert isinstance(x, str)
874 x = x.strip()
875 x = ht.get(x, x)
876 if isinstance(x, str):  # 876 ↛ 879: line 876 didn't jump to line 879 because the condition on line 876 was always true
877 if x:  # 877 ↛ 872: line 877 didn't jump to line 872 because the condition on line 877 was always true
878 ret.append(x)
879 elif isinstance(x, (list, tuple)):
880 ret.extend(x)
881 else:
882 raise RuntimeError("map_with unexpected value: {!r}".format(x))
883 return ret
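# Illustrative example (added): entries found in the table are expanded,
# everything else passes through unchanged.
def _map_with_example() -> None:
    assert map_with({"m": ["masculine"]}, ["m", "dual"]) == ["masculine", "dual"]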
886TagList = list[str]
887PosPathStep = tuple[int, TagList, TagList]
890def check_unknown(
891 from_i: int,
892 to_i: int,
893 i: int,
894 wordlst: Sequence[str],
895 allow_any: bool,
896 no_unknown_starts: bool,
897) -> list[PosPathStep]:
898 """Check if the current section from_i->to_i is actually unknown
899 or if it needs some special handling. We already presupposed that
900 this is UNKNOWN; this is just called to see what *kind* of UNKNOWN."""
901 assert isinstance(to_i, int)
902 assert isinstance(from_i, int)
903 assert isinstance(i, int)
904 # Adds unknown tag if needed. Returns new last_i
905 # print("check_unknown to_i={} from_i={} i={}"
906 # .format(to_i, from_i, i))
907 if from_i >= to_i:
908 return []
909 words = wordlst[from_i:to_i]
910 tag = " ".join(words)
911 assert tag
912 # print(f"{tag=}")
913 if re.match(ignored_unknown_starts_re, tag):
914 # Tags with this start are to be ignored
915 return [(from_i, ["UNKNOWN"], [])]
916 if tag in ignored_unknown_tags:  # 916 ↛ 917: line 916 didn't jump to line 917 because the condition on line 916 was never true
917 return [] # One of the tags listed as to be ignored
918 if tag in ("and", "or"):
919 return []
920 if (
921 not allow_any
922 and not words[0].startswith("~")
923 and (
924 no_unknown_starts
925 or words[0] not in allowed_unknown_starts
926 or len(words) <= 1
927 )
928 ):
929 # print("ERR allow_any={} words={}"
930 # .format(allow_any, words))
931 return [
932 (from_i, ["UNKNOWN"], ["error-unknown-tag"])
933 ] # Add ``tag`` here to include
934 else:
935 return [(from_i, ["UNKNOWN"], [tag])]
938def add_new1(
939 node: ValidNode,
940 i: int,
941 start_i: int,
942 last_i: int,
943 new_paths: list[list[PosPathStep]],
944 new_nodes: list[tuple[ValidNode, int, int]],
945 pos_paths: list[list[list[PosPathStep]]],
946 wordlst: list[str],
947 allow_any: bool,
948 no_unknown_starts: bool,
949 max_last_i: int,
950) -> int:
951 assert isinstance(new_paths, list)
952 # print("add_new: start_i={} last_i={}".format(start_i, last_i))
953 # print("$ {} last_i={} start_i={}"
954 # .format(w, last_i, start_i))
955 max_last_i = max(max_last_i, last_i) # if last_i has grown
956 if (node, start_i, last_i) not in new_nodes:
957 new_nodes.append((node, start_i, last_i))
958 if node.end:
959 # We can see a terminal point in the search tree.
960 u = check_unknown(
961 last_i, start_i, i, wordlst, allow_any, no_unknown_starts
962 )
963 # Create new paths candidates based on different past possible
964 # paths; pos_path[last_i] contains possible paths, so add this
965 # new one at the beginning(?)
966 # The list comprehension inside the parens generates an iterable
967 # of lists, so this is .extend( [(last_i...)], [(last_i...)], ... )
968 # XXX: this is becoming impossible to annotate, nodes might
969 # need to become classed objects and not just dicts, or at least
970 # a TypedDict with a "children" node
971 new_paths.extend(
972 [(last_i, node.tags, node.topics)] + u + x
973 for x in pos_paths[last_i]
974 )
975 max_last_i = i + 1
976 return max_last_i
979@functools.lru_cache(maxsize=65536)
980def decode_tags(
981 src: str,
982 allow_any=False,
983 no_unknown_starts=False,
984) -> tuple[list[tuple[str, ...]], list[str]]:
985 tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts)
986 # print(f"decode_tags: {src=}, {tagsets=}")
988 # Insert retry-code here that modifies the text source
989 if (
990 any(s.startswith("error-") for tagset in tagsets for s in tagset)
991 # I hate Python's *nested* list comprehension syntax ^
992 or any(s.startswith("error-") for s in topics)
993 ):
994 new_tagsets: list[tuple[str, ...]] = []
995 new_topics: list[str] = []
997 if "/" in src:
998 # slashes_re contains valid key entries with slashes; we're going
999 # to skip them by splitting the string and skipping handling every
1000 # second entry, which contains the splitting group like "masculine/
1001 # feminine" style keys.
1002 split_parts = re.split(slashes_re, src)
1003 new_parts: list[str] = []
1004 if len(split_parts) > 1:
1005 for i, s in enumerate(split_parts):
1006 if i % 2 == 0:
1007 new_parts.append(s.replace("/", " "))
1008 else:
1009 new_parts.append(s)
1010 new_src = "".join(new_parts)
1011 else:
1012 new_src = src
1013 new_tagsets, new_topics = decode_tags1(
1014 new_src, allow_any, no_unknown_starts
1015 )
1016 elif " or " in src or " and " in src:
1017 # Annoying kludge.
1018 new_src = src.replace(" and ", " ")
1019 new_src = new_src.replace(" or ", " ")
1020 new_tagsets, new_topics = decode_tags1(
1021 new_src, allow_any, no_unknown_starts
1022 )
1023 # print(f"{new_tagsets=}")
1025 if new_tagsets or new_topics:
1026 old_errors = sum(
1027 1 for tagset in tagsets for s in tagset if s.startswith("error")
1028 )
1029 old_errors += sum(1 for s in topics if s.startswith("error"))
1030 new_errors = sum(
1031 1
1032 for new_tagset in new_tagsets
1033 for s in new_tagset
1034 if s.startswith("error")
1035 )
1036 new_errors += sum(1 for s in new_topics if s.startswith("error"))
1038 if new_errors <= old_errors:  # 1038 ↛ 1041: line 1038 didn't jump to line 1041 because the condition on line 1038 was always true
1039 return new_tagsets, new_topics
1041 return tagsets, topics
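# Illustrative examples (added) of what the decoder is expected to return for
# plain tag text: a list of sorted tag tuples plus a (possibly empty) list of
# topics. Both inputs below consist only of entries in valid_tags.
def _decode_tags_examples() -> None:
    assert decode_tags("archaic") == ([("archaic",)], [])
    tagsets, topics = decode_tags("feminine plural")
    assert tagsets == [("feminine", "plural")] and topics == []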
1044def decode_tags1(
1045 src: str,
1046 allow_any=False,
1047 no_unknown_starts=False,
1048) -> tuple[list[tuple[str, ...]], list[str]]:
1049 """Decodes tags, doing some canonicalizations. This returns a list of
1050 lists of tags and a list of topics."""
1051 assert isinstance(src, str)
1053 # print("decode_tags: src={!r}".format(src))
1055 pos_paths: list[list[list[PosPathStep]]] = [[[]]]
1056 wordlst: list[str] = []
1057 max_last_i = 0 # pre-initialized here so that it can be used as a ref
1059 add_new = functools.partial(
1060 add_new1, # pre-set parameters and references for function
1061 pos_paths=pos_paths,
1062 wordlst=wordlst,
1063 allow_any=allow_any,
1064 no_unknown_starts=no_unknown_starts,
1065 max_last_i=max_last_i,
1066 )
1067 # First split the tags at commas and semicolons. Their significance is that
1068 # a multi-word sequence cannot continue across them.
1069 parts = split_at_comma_semi(src, extra=[";", ":"])
1071 for part in parts:
1072 max_last_i = len(wordlst) # "how far have we gone?"
1073 lst1 = part.split()
1074 if not lst1:
1075 continue
1076 wordlst.extend(lst1)
1077 cur_nodes: list[tuple[ValidNode, int, int]] = [] # Currently seen
1078 for w in lst1:
1079 i = len(pos_paths) - 1
1080 new_nodes: list[tuple[ValidNode, int, int]] = []
1081 # replacement nodes for next loop
1082 new_paths: list[list[PosPathStep]] = []
1083 # print("ITER i={} w={} max_last_i={} wordlst={}"
1084 # .format(i, w, max_last_i, wordlst))
1085 node: ValidNode
1086 start_i: int
1087 last_i: int
1088 for node, start_i, last_i in cur_nodes:
1089 # ValidNodes are part of a search tree that checks if a
1090 # phrase is found in xlat_tags_map and other text->tags dicts.
1091 if w in node.children:
1092 # the phrase continues down the tree
1093 # print("INC", w)
1094 max_last_i = add_new(
1095 node.children[w],
1096 i,
1097 start_i,
1098 last_i,
1099 new_paths,
1100 new_nodes,
1101 )
1102 if node.end:
1103 # we've hit an end point, the tags and topics have already
1104 # been gathered at some point, don't do anything with the
1105 # old stuff
1106 if w in valid_sequences.children:
1107 # This starts a *new* possible section
1108 max_last_i = add_new(
1109 valid_sequences.children[w], # root->
1110 i,
1111 i,
1112 i,
1113 new_paths,
1114 new_nodes,
1115 )
1116 if w not in node.children and not node.end:
1117 # print("w not in node and $: i={} last_i={} wordlst={}"
1118 # .format(i, last_i, wordlst))
1119 # If i == last_i == 0, for example (beginning)
1120 if (
1121 i == last_i
1122 or no_unknown_starts
1123 or wordlst[last_i] not in allowed_unknown_starts
1124 ):
1125 # print("NEW", w)
1126 if w in valid_sequences.children:
1127 # Start new sequences here
1128 max_last_i = add_new(
1129 valid_sequences.children[w],
1130 i,
1131 i,
1132 last_i,
1133 new_paths,
1134 new_nodes,
1135 )
1136 if not new_nodes:
1137 # This is run at the start when i == max_last_i == 0,
1138 # which is what populates the first node in new_nodes.
1139 # Some initial words cause the rest to be interpreted as unknown
1140 # print("not new nodes: i={} last_i={} wordlst={}"
1141 # .format(i, max_last_i, wordlst))
1142 if (
1143 i == max_last_i
1144 or no_unknown_starts
1145 or wordlst[max_last_i] not in allowed_unknown_starts
1146 ):
1147 # print("RECOVER w={} i={} max_last_i={} wordlst={}"
1148 # .format(w, i, max_last_i, wordlst))
1149 if w in valid_sequences.children:
1150 max_last_i = add_new(
1151 # new sequence from root
1152 valid_sequences.children[w],
1153 i,
1154 i,
1155 max_last_i,
1156 new_paths,
1157 new_nodes,
1158 )
1159 cur_nodes = new_nodes # Completely replace nodes!
1160 # 2023-08-18, fix to improve performance
1161 # Decode tags does a big search of the best-shortest matching
1162 # sequences of tags, but the original algorithm didn't have
1163 # any culling happen during operation, so in a case with
1164 # a lot of tags (for example, big blocks of text inserted
1165 # somewhere by mistake that is processed by decode_tags),
1166 # it would lead to exponential growth of new_paths contents.
1167 # This culling, using the same weighting algorithm code as
1168 # in the original is just applied to new_paths before it is
1169 # added to pos_paths. Basically it's "take the 10 best paths".
1170 # This *can* cause bugs if it gets stuck in a local minimum
1171 # or something, but this whole process is one-dimensional
1172 # and not that complex, so hopefully it works out...
1173 pw = []
1174 path: list[PosPathStep]
1175 for path in new_paths:
1176 weight = len(path)
1177 if any(x[1] == ["UNKNOWN"] for x in path):
1178 weight += 100 # Penalize unknown paths
1179 pw.append((weight, path))
1180 new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]]
1181 pos_paths.append(new_paths)
1183 # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}"
1184 # .format(max_last_i, len(wordlst), len(pos_paths)))
1186 if cur_nodes:
1187 # print("END HAVE_NODES")
1188 for node, start_i, last_i in cur_nodes:
1189 if node.end:
1190 # print("$ END start_i={} last_i={}"
1191 # .format(start_i, last_i))
1192 for path in pos_paths[start_i]:
1193 pos_paths[-1].append(
1194 [(last_i, node.tags, node.topics)] + path
1195 )
1196 else:
1197 # print("UNK END start_i={} last_i={} wordlst={}"
1198 # .format(start_i, last_i, wordlst))
1199 u = check_unknown(
1200 last_i,
1201 len(wordlst),
1202 len(wordlst),
1203 wordlst,
1204 allow_any,
1205 no_unknown_starts,
1206 )
1207 if pos_paths[start_i]:
1208 for path in pos_paths[start_i]:
1209 pos_paths[-1].append(u + path)
1210 else:
1211 pos_paths[-1].append(u)
1212 else:
1213 # Check for a final unknown tag
1214 # print("NO END NODES max_last_i={}".format(max_last_i))
1215 paths = pos_paths[max_last_i] or [[]]
1216 u = check_unknown(
1217 max_last_i,
1218 len(wordlst),
1219 len(wordlst),
1220 wordlst,
1221 allow_any,
1222 no_unknown_starts,
1223 )
1224 if u:
1225 # print("end max_last_i={}".format(max_last_i))
1226 for path in list(paths): # Copy in case it is the last pos
1227 pos_paths[-1].append(u + path)
1229 # import json
1230 # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True))
1232 if not pos_paths[-1]:
1233 # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src))
1234 return [], []
1236 # Find the best path
1237 pw = []
1238 for path in pos_paths[-1]:
1239 weight = len(path)
1240 if any(x[1] == ["UNKNOWN"] for x in path):
1241 weight += 100 # Penalize unknown paths
1242 pw.append((weight, path))
1243 path = min(pw)[1]
1245 # Convert the best path to tagsets and topics
1246 tagsets: list[list[str]] = [[]]
1247 topics: list[str] = []
1248 for i, tagspec, topicspec in path:
1249 if len(tagsets or "") > 16:
1250 # ctx.error("Too many tagsets! This is probably exponential",
1251 # sortid="form_descriptions/20230818")
1252 return [("error-unknown-tag", "error-exponential-tagsets")], []
1253 if tagspec == ["UNKNOWN"]:
1254 new_tagsets = []
1255 for x in tagsets:
1256 new_tagsets.append(x + topicspec)
1257 tagsets = new_tagsets
1258 continue
1259 if tagspec:
1260 new_tagsets = []
1261 for x in tagsets:
1262 for t in tagspec:
1263 if t:  # 1263 ↛ 1270: line 1263 didn't jump to line 1270 because the condition on line 1263 was always true
1264 new_tags = list(x)
1265 for tag in t.split():
1266 if tag not in new_tags:
1267 new_tags.append(tag)
1268 new_tagsets.append(new_tags)
1269 else:
1270 new_tagsets.append(x)
1271 tagsets = new_tagsets
1272 if topicspec:
1273 for t in topicspec:
1274 for topic in t.split():
1275 if topic not in topics:
1276 topics.append(topic)
1278 # print("unsorted tagsets:", tagsets)
1279 ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets))
1280 # topics = list(sorted(set(topics))) XXX tests expect not sorted
1281 # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics))
1282 # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST
1283 # of tags. Turning topics into a tuple breaks tests, turning the tuples
1284 # inside tagsets into lists breaks tests, I'm leaving them mismatched
1285 # for now. XXX
1286 return ret_tagsets, topics
1289def parse_head_final_tags(
1290 wxr: WiktextractContext, lang: str, form: str
1291) -> tuple[str, list[str]]:
1292 """Parses tags that are allowed at the end of a form head from the end
1293 of the form. This can also be used for parsing the final gender etc tags
1294 from translations and linkages."""
1295 assert isinstance(wxr, WiktextractContext)
1296 assert isinstance(lang, str) # Should be language that "form" is for
1297 assert isinstance(form, str)
1299 # print("parse_head_final_tags: lang={} form={!r}".format(lang, form))
1301 # Make sure there are no double spaces in the form as this code does not
1302 # handle them otherwise.
1303 form = re.sub(r"\s+", " ", form.strip())
1304 if not form:
1305 return form, []
1307 origform = form
1309 tags = []
1311 # If parsing for certain Bantu languages (e.g., Swahili), handle
1312 # some extra head-final tags first
1313 if lang in head_final_bantu_langs:
1314 m = re.search(head_final_bantu_re, form)
1315 if m is not None:
1316 tagkeys = m.group(1)
1317 if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]  # 1317 ↛ 1332: line 1317 didn't jump to line 1332 because the condition on line 1317 was always true
1318 form = form[: m.start()]
1319 v = head_final_bantu_map[tagkeys]
1320 if v.startswith("?"):  # 1320 ↛ 1321: line 1320 didn't jump to line 1321 because the condition on line 1320 was never true
1321 v = v[1:]
1322 wxr.wtp.debug(
1323 "suspicious suffix {!r} in language {}: {}".format(
1324 tagkeys, lang, origform
1325 ),
1326 sortid="form_descriptions/1028",
1327 )
1328 tags.extend(v.split())
1330 # If parsing for certain Semitic languages (e.g., Arabic), handle
1331 # some extra head-final tags first
1332 if lang in head_final_semitic_langs:
1333 m = re.search(head_final_semitic_re, form)
1334 if m is not None:
1335 tagkeys = m.group(1)
1336 if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]  # 1336 ↛ 1351: line 1336 didn't jump to line 1351 because the condition on line 1336 was always true
1337 form = form[: m.start()]
1338 v = head_final_semitic_map[tagkeys]
1339 if v.startswith("?"):  # 1339 ↛ 1340: line 1339 didn't jump to line 1340 because the condition on line 1339 was never true
1340 v = v[1:]
1341 wxr.wtp.debug(
1342 "suspicious suffix {!r} in language {}: {}".format(
1343 tagkeys, lang, origform
1344 ),
1345 sortid="form_descriptions/1043",
1346 )
1347 tags.extend(v.split())
1349 # If parsing for certain other languages (e.g., Lithuanian,
1350 # French, Finnish), handle some extra head-final tags first
1351 if lang in head_final_other_langs:
1352 m = re.search(head_final_other_re, form)
1353 if m is not None:
1354 tagkeys = m.group(1)
1355 if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]  # 1355 ↛ 1360: line 1355 didn't jump to line 1360 because the condition on line 1355 was always true
1356 form = form[: m.start()]
1357 tags.extend(head_final_other_map[tagkeys].split(" "))
1359 # Handle normal head-final tags
1360 m = re.search(head_final_re, form)
1361 if m is not None:
1362 tagkeys = m.group(3)
1363 # Only replace tags ending with numbers in languages that have
1364 # head-final numeric tags (e.g., Bantu classes); also, don't replace
1365 # tags if the main title ends with them (then presume they are part
1366 # of the word)
1367 # print("head_final_tags form={!r} tagkeys={!r} lang={}"
1368 # .format(form, tagkeys, lang))
1369 tagkeys_contains_digit = re.search(r"\d", tagkeys)
1370 if (
1371 (not tagkeys_contains_digit or lang in head_final_numeric_langs)
1372 and not wxr.wtp.title.endswith(" " + tagkeys) # type:ignore[union-attr]
1373 and
1374 # XXX the above test does not capture when the whole word is a
1375 # xlat_head_map key, so I added the below test to complement
1376 # it; does this break anything?
1377 not wxr.wtp.title == tagkeys
1378 ): # defunct/English,
1379 # "more defunct" -> "more" ["archaic"]
1380 if not tagkeys_contains_digit or lang in head_final_numeric_langs:  # 1380 ↛ 1394: line 1380 didn't jump to line 1394 because the condition on line 1380 was always true
1381 form = form[: m.start()]
1382 v = xlat_head_map[tagkeys]
1383 if v.startswith("?"):  # 1383 ↛ 1384: line 1383 didn't jump to line 1384 because the condition on line 1383 was never true
1384 v = v[1:]
1385 wxr.wtp.debug(
1386 "suspicious suffix {!r} in language {}: {}".format(
1387 tagkeys, lang, origform
1388 ),
1389 sortid="form_descriptions/1077",
1390 )
1391 tags.extend(v.split())
1393 # Generate warnings about words ending in " or" after processing
1394 if (
1395 (form.endswith(" or") and not origform.endswith(" or"))
1396 or re.search(
1397 r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|"
1398 r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)"
1399 r"($|/| (f|m|sg|pl|anim|inan))",
1400 form,
1401 )
1402 or form.endswith(" du")
1403 ):
1404 if form not in ok_suspicious_forms:
1405 wxr.wtp.debug(
1406 "suspicious unhandled suffix in {}: {!r}, originally {!r}".format(
1407 lang, form, origform
1408 ),
1409 sortid="form_descriptions/1089",
1410 )
1412 # print("parse_head_final_tags: form={!r} tags={}".format(form, tags))
1413 return form, tags
1416def quote_kept_parens(s: str) -> str:
1417 """Changes certain parenthesized expressions so that they won't be
1418 interpreted as parentheses. This is used for parts that are kept as
1419 part of the word, such as "rear admiral (upper half)"."""
1420 return re.sub(
1421 r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|"
1422 r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)",
1423 r"__lpar__\1__rpar__",
1424 s,
1425 )
1428def quote_kept_ruby(
1429 wxr: WiktextractContext,
1430 ruby_tuples: list[
1431 tuple[
1432 str,
1433 str,
1434 ]
1435 ],
1436 s: str,
1437) -> str:
1438 if len(ruby_tuples) < 1:  # 1438 ↛ 1439: line 1438 didn't jump to line 1439 because the condition on line 1438 was never true
1439 wxr.wtp.debug(
1440 "quote_kept_ruby called with no ruby",
1441 sortid="form_description/1114/20230517",
1442 )
1443 return s
1444 ks = []
1445 rs = []
1446 for k, r in ruby_tuples:
1447 ks.append(re.escape(k))
1448 rs.append(re.escape(r))
1449 if not (ks and rs):  # 1449 ↛ 1450: line 1449 didn't jump to line 1450 because the condition on line 1449 was never true
1450 wxr.wtp.debug(
1451 f"empty column in ruby_tuples: {ruby_tuples}",
1452 sortid="form_description/1124/20230606",
1453 )
1454 return s
1455 newm = re.compile(
1456 r"({})\s*\(\s*({})\s*\)".format("|".join(ks), "|".join(rs))
1457 )
1458 rub_re = re.compile(
1459 r"({})".format(
1460 r"|".join(
1461 r"{}\(*{}\)*".format(
1462 re.escape(k),
1463 re.escape(r),
1464 )
1465 for k, r in ruby_tuples
1466 )
1467 )
1468 )
1470 def paren_replace(m: re.Match) -> str:
1471 return re.sub(newm, r"\1__lrub__\2__rrub__", m.group(0))
1473 return re.sub(rub_re, paren_replace, s)
1476def unquote_kept_parens(s: str) -> str:
1477 """Conerts the quoted parentheses back to normal parentheses."""
1478 return re.sub(r"__lpar__(.*?)__rpar__", r"(\1)", s)
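# Illustrative round trip (added): quote_kept_parens() protects the listed
# parenthesized parts and unquote_kept_parens() restores them.
def _kept_parens_examples() -> None:
    quoted = quote_kept_parens("rear admiral (upper half)")
    assert quoted == "rear admiral __lpar__upper half__rpar__"
    assert unquote_kept_parens(quoted) == "rear admiral (upper half)"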
1481def add_romanization(
1482 wxr: WiktextractContext,
1483 data: WordData,
1484 roman: str,
1485 text: str,
1486 is_reconstruction: bool,
1487 head_group: Optional[int],
1488 ruby: Sequence[tuple[str, str]],
1489) -> None:
1490 tags_lst = ["romanization"]
1491 m = re.match(r"([^:]+):(.+)", roman)
1492 # This function's purpose is to intercept broken romanizations,
1493 # like "Yale: hēnpyeng" style tags. Most romanization styles
1494 # are already present as tags, so we can use decode_tags to find
1495 # them.
1496 if m:  # 1496 ↛ 1497: line 1496 didn't jump to line 1497 because the condition on line 1496 was never true
1497 tagsets, topics = decode_tags(m.group(1))
1498 if tagsets:
1499 for tags in tagsets:
1500 tags_lst.extend(tags)
1501 roman = m.group(2)
1502 add_related(
1503 wxr,
1504 data,
1505 tags_lst,
1506 [roman],
1507 text,
1508 True,
1509 is_reconstruction,
1510 head_group,
1511 ruby,
1512 )
1515def add_related(
1516 wxr: WiktextractContext,
1517 data: WordData,
1518 tags_lst: Union[list[str], tuple[str, ...]],
1519 related_list: list[str],
1520 origtext: str,
1521 add_all_canonicals: bool,
1522 is_reconstruction: bool,
1523 head_group: Optional[int],
1524 ruby_data: Optional[Sequence[tuple[str, str]]] = None,
1525) -> Optional[list[tuple[str, ...]]]:
1526 """Internal helper function for some post-processing entries for related
1527 forms (e.g., in word head). This returns a list of lists of tags to be
1528 added to following related forms or None (cf. walrus/English word head,
1529 parenthesized part starting with "both")."""
1530 assert isinstance(wxr, WiktextractContext)
1531 assert isinstance(tags_lst, (list, tuple))
1532 for x in tags_lst:
1533 assert isinstance(x, str)
1534 assert isinstance(related_list, (list, tuple))
1535 assert isinstance(origtext, str)
1536 assert add_all_canonicals in (True, False)
1537 assert isinstance(ruby_data, (list, tuple)) or ruby_data is None
1538 if ruby_data is None:  # 1538 ↛ 1539: line 1538 didn't jump to line 1539 because the condition on line 1538 was never true
1539 ruby_data = []
1540 related = " ".join(related_list)
1541 # print("add_related: tags_lst={} related={}".format(tags_lst, related))
1542 if related == "[please provide]":  # 1542 ↛ 1543: line 1542 didn't jump to line 1543 because the condition on line 1542 was never true
1543 return None
1544 if related in IGNORED_RELATED:  # 1544 ↛ 1545: line 1544 didn't jump to line 1545 because the condition on line 1544 was never true
1545 return None
1546 if is_reconstruction and related.startswith("*") and len(related) > 1:
1547 related = related[1:]
1549 # Get title word, with any reconstruction prefix removed
1550 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title) # type:ignore[arg-type]
1552 def check_related(related: str) -> None:
1553 # Warn about some suspicious related forms
1554 m = re.search(suspicious_related_re, related)
1555 if (m and m.group(0) not in titleword) or (
1556 related in ("f", "m", "n", "c") and len(titleword) >= 3
1557 ):
1558 if "eumhun" in tags_lst: 1558 ↛ 1559line 1558 didn't jump to line 1559 because the condition on line 1558 was never true
1559 return
1560 if "cangjie-input" in tags_lst: 1560 ↛ 1561line 1560 didn't jump to line 1561 because the condition on line 1560 was never true
1561 return
1562 if "class" in tags_lst: 1562 ↛ 1563line 1562 didn't jump to line 1563 because the condition on line 1562 was never true
1563 return
1564 if wxr.wtp.section == "Korean" and re.search(  # 1564 ↛ 1568: line 1564 didn't jump to line 1568 because the condition on line 1564 was never true
1565 r"^\s*\w*>\w*\s*$", related
1566 ):
1567 # ignore Korean "i>ni" / "라>나" values
1568 return
1569 if (  # 1569 ↛ 1576: line 1569 didn't jump to line 1576 because the condition on line 1569 was never true
1570 wxr.wtp.section == "Burmese"
1571 and "romanization" in tags_lst
1572 and re.search(r":", related)
1573 ):
1574 # ignore Burmese with ":", that is used in Burmese
1575 # transliteration of "း", the high-tone visarga.
1576 return
1577 wxr.wtp.debug(
1578 "suspicious related form tags {}: {!r} in {!r}".format(
1579 tags_lst, related, origtext
1580 ),
1581 sortid="form_descriptions/1147",
1582 )
1584 following_tagsets = None # Tagsets to add to following related forms
1585 roman = None
1586 tagsets1: list[tuple[str, ...]] = [tuple()]
1587 topics1: list[str] = []
1589 m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related)
1590 if m:
1591 paren = m.group(1)
1592 related = related[m.end() :]
1593 m = re.match(r"^(all|both) (.*)", paren)
1594 if m:  # 1594 ↛ 1595: line 1594 didn't jump to line 1595 because the condition on line 1594 was never true
1595 tagsets1, topics1 = decode_tags(m.group(2))
1596 following_tagsets = tagsets1
1597 else:
1598 tagsets1, topics1 = decode_tags(paren)
1599 else:
1600 m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related)
1601 if m:
1602 paren = m.group(1)
1603 if paren.startswith("U+"):  # 1603 ↛ 1604: line 1603 didn't jump to line 1604 because the condition on line 1603 was never true
1604 related = related[: m.start()]
1605 else:
1606 cls = classify_desc(paren)
1607 if (  # 1607 ↛ 1614: line 1607 didn't jump to line 1614 because the condition on line 1607 was always true
1608 cls in ("romanization", "english")
1609 and classify_desc(related[: m.start()]) == "other"
1610 ):
1611 roman = paren
1612 related = related[: m.start()]
1613 else:
1614 related = related[: m.start()]
1615 tagsets1, topics1 = decode_tags(paren)
1616 if related and related.startswith("{{"):  # 1616 ↛ 1617: line 1616 didn't jump to line 1617 because the condition on line 1616 was never true
1617 wxr.wtp.debug(
1618 "{{ in word head form - possible Wiktionary error: {!r}".format(
1619 related
1620 ),
1621 sortid="form_descriptions/1177",
1622 )
1623 return None # Likely Wiktionary coding error
1624 related = unquote_kept_parens(related)
1625 # Split related by "/" (e.g., the superlative forms in the head of grande/Spanish)
1626 # Do not split if / in word title, see π//Japanese
1627 if len(related) > 5 and "/" not in wxr.wtp.title: # type:ignore[operator]
1628 alts = split_at_comma_semi(related, separators=["/"])
1629 else:
1630 alts = [related]
1631 if ruby_data:
1632 # prepare some regex stuff in advance
1633 ks, rs = [], []
1634 for k, r in ruby_data:
1635 ks.append(re.escape(k))
1636 rs.append(re.escape(r))
1637 splitter = r"((?:{})__lrub__(?:{})__rrub__)".format(
1638 "|".join(ks), "|".join(rs)
1639 )
1640 for related in alts:
1641 ruby: list[tuple[str, str]] = []
1642 if ruby_data:
1643 new_related = []
1644 rub_split = re.split(splitter, related)
1645 for s in rub_split:
1646 m = re.match(r"(.+)__lrub__(.+)__rrub__", s)
1647 if m:
1648 # add ruby with (\1, \2)
1649 ruby.append((m.group(1), m.group(2)))
1650 new_related.append(m.group(1))
1651 else:
1652 new_related.append(s)
1653 related = "".join(new_related)
1654 tagsets2, topics2 = decode_tags(" ".join(tags_lst))
1655 for tags1 in tagsets1:
1656 assert isinstance(tags1, (list, tuple))
1657 for tags2 in tagsets2:
1658 assert isinstance(tags1, (list, tuple))
1659 dt: LinkageData = {"word": related}
1660 if roman:
1661 dt["roman"] = roman
1662 if ruby:
1663 dt["ruby"] = ruby
1664 if "alt-of" in tags2: 1664 ↛ 1665line 1664 didn't jump to line 1665 because the condition on line 1664 was never true
1665 check_related(related)
1666 data_extend(data, "tags", tags1)
1667 data_extend(data, "tags", tags2)
1668 data_extend(data, "topics", topics1)
1669 data_extend(data, "topics", topics2)
1670 data_append(data, "alt_of", dt)
1671 elif "form-of" in tags2: 1671 ↛ 1672line 1671 didn't jump to line 1672 because the condition on line 1671 was never true
1672 check_related(related)
1673 data_extend(data, "tags", tags1)
1674 data_extend(data, "tags", tags2)
1675 data_extend(data, "topics", topics1)
1676 data_extend(data, "topics", topics2)
1677 data_append(data, "form_of", dt)
1678 elif "compound-of" in tags2: 1678 ↛ 1679line 1678 didn't jump to line 1679 because the condition on line 1678 was never true
1679 check_related(related)
1680 data_extend(data, "tags", tags1)
1681 data_extend(data, "tags", tags2)
1682 data_extend(data, "topics", topics1)
1683 data_extend(data, "topics", topics2)
1684 data_append(data, "compound", related)
1685 else:
1686 lang = wxr.wtp.section or "LANG_MISSING"
1687 related, final_tags = parse_head_final_tags(
1688 wxr, lang, related
1689 )
1690 # print("add_related: related={!r} tags1={!r} tags2={!r} "
1691 # "final_tags={!r}"
1692 # .format(related, tags1, tags2, final_tags))
1693 tags = list(tags1) + list(tags2) + list(final_tags)
1694 check_related(related)
1695 form: FormData = {"form": related}
1696 if head_group:
1697 form["head_nr"] = head_group
1698 if roman:
1699 form["roman"] = roman
1700 if ruby:
1701 form["ruby"] = ruby
1702 data_extend(form, "topics", topics1)
1703 data_extend(form, "topics", topics2)
1704 if topics1 or topics2:  # 1704 ↛ 1705: line 1704 didn't jump to line 1705 because the condition on line 1704 was never true
1705 wxr.wtp.debug(
1706 "word head form has topics: {}".format(form),
1707 sortid="form_descriptions/1233",
1708 )
1709 # Add tags from canonical form into the main entry
1710 if "canonical" in tags:
1711 if related in ("m", "f") and len(titleword) > 1:  # 1711 ↛ 1712: line 1711 didn't jump to line 1712 because the condition on line 1711 was never true
1712 wxr.wtp.debug(
1713 "probably incorrect canonical form "
1714 "{!r} ignored (probably tag combination "
1715 "missing from xlat_head_map)".format(related),
1716 sortid="form_descriptions/1241",
1717 )
1718 continue
1719 if (
1720 related != titleword
1721 or add_all_canonicals
1722 or topics1
1723 or topics2
1724 or ruby
1725 ):
1726 data_extend(form, "tags", sorted(set(tags)))
1727 else:
1728 # We won't add canonical form here
1729 filtered_tags = list(
1730 x for x in tags if x != "canonical"
1731 )
1732 data_extend(data, "tags", filtered_tags)
1733 continue
1734 else:
1735 data_extend(form, "tags", sorted(set(tags)))
1736 # Only insert if the form is not already there
1737 for old in data.get("forms", ()):
1738 if form == old:  # 1738 ↛ 1739: line 1738 didn't jump to line 1739 because the condition on line 1738 was never true
1739 break
1740 else:
1741 data_append(data, "forms", form)
1743 # If this form had pre-tags that started with "both" or "all", add those
1744 # tags also to following related forms that don't have their own tags
1745 # specified.
1746 return following_tagsets
1749# Issue #967: in English entries, word forms are sometimes skipped because
1750# they are taggable words and their distw() is too big, e.g. "clipping" from "clip"
1751WORDS_WITH_FALSE_POSITIVE_TAGS: dict[str, list[str]] = {
1752 "clip": ["clipping"], # XXX remember to change me back to clipping after
1753 "English": ["English", "Englishes"],
1754 "common": ["common", "commoner"],
1755 # tests.
1756}
1758WORDS_WITH_FALSE_POSITIVE_FORMS: dict[str, list[str]] = {
1759 "unaccountability": ["countable", "uncountable"],
1760 "uncountability": ["countable", "uncountable"],
1761}
1763FALSE_POSITIVE_MISSING_FORMS: dict[str, list[str]] = {}
1765FORM_ASSOCIATED_TAG_WORDS: set[str] = {
1766 "participle",
1767 "past",
1768 "present",
1769 "singular",
1770 "plural",
1771 "first-person",
1772 "second-person",
1773 "third-person",
1774 "gerund",
1775}
1778def parse_word_head(
1779 wxr: WiktextractContext,
1780 pos: str,
1781 text: str,
1782 data: WordData,
1783 is_reconstruction: bool,
1784 head_group: Optional[int],
1785 ruby=None,
1786 links=None,
1787) -> None:
1788 """Parses the head line for a word for in a particular language and
1789 part-of-speech, extracting tags and related forms."""
1790 assert isinstance(wxr, WiktextractContext)
1791 assert isinstance(pos, str)
1792 assert isinstance(text, str)
1793 assert isinstance(data, dict)
1794 assert isinstance(ruby, (list, tuple)) or ruby is None
1795 if ruby is None:
1796 ruby = []
1797 assert is_reconstruction in (True, False)
1798 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text))
1799 # print(f"PARSE_WORD_HEAD: {data=}")
1800 if links is None:
1801 links = []
1803 if len(links) > 0:
1804 # if we have link data (that is, links with stuff like commas and
1805        # spaces), replace word_re with a modified local-scope pattern
1806 # print(f"links {list((c, ord(c)) for link in links for c in link)=}")
1807 word_re = re.compile(
1808 r"\b" # In case we have forms that are longer and contain links
1809 +
1810 # or words as a substring...
1811 r"\b|\b".join(
1812 sorted((re.escape(s) for s in links), key=lambda x: -len(x))
1813 )
1814 + r"\b|"
1815 + word_pattern
1816 )
1817 else:
1818 word_re = word_re_global
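    # Illustrative sketch (hypothetical, not from the original source): with
    # links such as ["ice cream", "cone"], the locally built pattern above is
    # roughly
    #     r"\bice\ cream\b|\bcone\b|" + word_pattern
    # i.e. the escaped link texts are tried longest-first before falling back
    # to the generic word pattern.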
1820 if "Lua execution error" in text or "Lua timeout error" in text: 1820 ↛ 1821line 1820 didn't jump to line 1821 because the condition on line 1820 was never true
1821 return
1823 # Fix words with "superlative:" or "comparative:" at end of head
1824 # e.g. grande/Spanish/Adj
1825 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text)
1827 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb
1828 m = re.search(r", non-past ([^)]+ \([^)]+\))", text)
1829 if m:
1830 add_related(
1831 wxr,
1832 data,
1833 ["non-past"],
1834 [m.group(1)],
1835 text,
1836 True,
1837 is_reconstruction,
1838 head_group,
1839 ruby,
1840 )
1841 text = text[: m.start()] + text[m.end() :]
1843 language = wxr.wtp.section
1844 titleword = re.sub(
1845 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE"
1846 )
1847 titleparts = list(
1848 m.group(0)
1849 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE")
1850 )
1851 if not titleparts: 1851 ↛ 1852line 1851 didn't jump to line 1852 because the condition on line 1851 was never true
1852 return
1854 # Remove " or" from the end to prevent weird canonical forms
1855 if text.endswith(" or"):
1856 for tp in titleparts:
1857 if text.endswith(tp): 1857 ↛ 1858line 1857 didn't jump to line 1858 because the condition on line 1857 was never true
1858 break
1859 else:
1860 text = text.removesuffix(" or").rstrip()
1862 # Handle the part of the head that is not in parentheses. However, certain
1863    # parenthesized parts are part of the word, and those must be handled
1864 # specially here.
1865 if ruby:
1866 text = quote_kept_ruby(wxr, ruby, text)
1867 base = text
1868 base = quote_kept_parens(base)
1869 base = remove_text_in_parentheses(base)
1870 base = base.replace("?", "") # Removes uncertain articles etc
1871 base = re.sub(r"\s+", " ", base)
1872 base = re.sub(r" ([,;])", r"\1", base)
1873 base = re.sub(r" • ", r" ", base)
1874 # Many languages use • as a punctuation mark separating the base
1875 # from the rest of the head. στάδιος/Ancient Greek, issue #176
1876 base = base.strip()
1877 # print(f"{base=}")
1879 # Check for certain endings in head (mostly for compatibility with weird
1880 # heads, e.g. rata/Romanian "1st conj." at end)
1881 m = re.search(head_end_re, base)
1882 tags: Union[tuple[str, ...], list[str]] = []
1883 if m: 1883 ↛ 1884line 1883 didn't jump to line 1884 because the condition on line 1883 was never true
1884 tags = head_end_map[m.group(1).lower()].split()
1885 data_extend(data, "tags", tags)
1886 base = base[: m.start()]
1888 # Special case: handle Hán Nôm readings for Vietnamese characters
1889 m = re.match(
1890 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base
1891 )
1892 if m: 1892 ↛ 1893line 1892 didn't jump to line 1893 because the condition on line 1892 was never true
1893 tag, readings = m.groups()
1894 tag = re.sub(r"\s+", "-", tag)
1895 for reading in split_at_comma_semi(readings, skipped=links):
1896 add_related(
1897 wxr,
1898 data,
1899 [tag],
1900 [reading],
1901 text,
1902 True,
1903 is_reconstruction,
1904 head_group,
1905 ruby,
1906 )
1907 return
1909 # Special case: Hebrew " [pattern: nnn]" ending
1910 m = re.search(r"\s+\[pattern: ([^]]+)\]", base)
1911 if m: 1911 ↛ 1912line 1911 didn't jump to line 1912 because the condition on line 1911 was never true
1912 add_related(
1913 wxr,
1914 data,
1915 ["class"],
1916 [m.group(1)],
1917 text,
1918 True,
1919 is_reconstruction,
1920 head_group,
1921 ruby,
1922 )
1923 base = base[: m.start()] + base[m.end() :]
1925 # Clean away some messy "Upload an image" template text used in
1926 # American Sign Language:
1927 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp
1928 m = re.search(r"Upload .+ gif image.", base)
1929 if m: 1929 ↛ 1930line 1929 didn't jump to line 1930 because the condition on line 1929 was never true
1930 base = base[: m.start()] + base[m.end() :]
1932 # Split the head into alternatives. This is a complicated task, as
1933    # we do not want to split on "or" or "," when immediately followed by more
1934 # head-final tags, but otherwise do want to split by them.
1935 # 20230907 added "or" to this to handle 'true or false', titles with 'or'
1936 if wxr.wtp.title and ("," in wxr.wtp.title or " or " in wxr.wtp.title):
1937 # A kludge to handle article titles/phrases with commas.
1938 # Preprocess splits to first capture the title, then handle
1939 # all the others as usual.
1940 presplits = re.split(r"({})".format(wxr.wtp.title), base)
1941 splits = []
1942 for psplit in presplits:
1943 if psplit == wxr.wtp.title:
1944 splits.append(psplit)
1945 else:
1946 splits.extend(re.split(head_split_re, psplit))
1947 else:
1948 # Do the normal split; previous only-behavior.
1949 splits = re.split(head_split_re, base)
1950 # print("SPLITS:", splits)
1951 alts: list[str] = []
1952 # print("parse_word_head: splits:", splits,
1953 # "head_split_re_parens:", head_split_re_parens)
1954 for i in range(
1955 0, len(splits) - head_split_re_parens, head_split_re_parens + 1
1956 ):
1957 v = splits[i]
1958 ending = splits[i + 1] or "" # XXX is this correct???
1959 # print("parse_word_head alts v={!r} ending={!r} alts={}"
1960 # .format(v, ending, alts))
1961 if alts and (v == "" and ending):
1962 assert ending[0] == " "
1963 alts[-1] += " or" + ending # endings starts with space
1964 elif v or ending: 1964 ↛ 1954line 1964 didn't jump to line 1954 because the condition on line 1964 was always true
1965 alts.append((v or "") + (ending or ""))
1966 last = splits[-1].strip()
1967 conn = "" if len(splits) < 3 else splits[-2]
1968 # print("parse_word_head alts last={!r} conn={!r} alts={}"
1969 # .format(last, conn, alts))
1970 if (
1971 alts
1972 and last
1973 and (
1974 last.split()[0] in xlat_head_map
1975 or (
1976 conn == " or "
1977 and (alts[-1] + " or " + last).strip() in xlat_head_map
1978 )
1979 )
1980 ):
1981 alts[-1] += " or " + last
1982 elif last:
1983 alts.append(last)
1985 # print("parse_word_head alts: {}".format(alts))
1986 # print(f"{base=}")
1988 # Process the head alternatives
1989 canonicals: list[tuple[list[str], list[str]]] = []
1990 mode: Optional[str] = None
1991 for alt_i, alt in enumerate(alts):
1992 alt = alt.strip()
1993 if alt.startswith("compound form:"): 1993 ↛ 1994line 1993 didn't jump to line 1994 because the condition on line 1993 was never true
1994 mode = "compound-form"
1995 alt = alt[14:].strip()
1996 if ((dash_i := alt.find(" -")) > 0) and (
1997 dash_i > (wxr.wtp.title or "").find(" -")
1998 ):
1999 # test_en_head / test_suffixes_at_end_of_form1
2000 # Some heads have suffixes that end up attached to the form
2001 # like in https://en.wiktionary.org/wiki/%E6%A5%BD%E3%81%97%E3%81%84
2002 alt = alt[:dash_i]
2003 if mode == "compound-form": 2003 ↛ 2004line 2003 didn't jump to line 2004 because the condition on line 2003 was never true
2004 add_related(
2005 wxr,
2006 data,
2007 ["in-compounds"],
2008 [alt],
2009 text,
2010 True,
2011 is_reconstruction,
2012 head_group,
2013 ruby,
2014 )
2015 continue
2016 # For non-first parts, see if it can be treated as tags-only
2017 if alt_i == 0:
2018 expanded_alts = [alt]
2019 else:
2020 expanded_alts = map_with(xlat_descs_map, [alt])
2021 # print("EXPANDED_ALTS:", expanded_alts)
2022 tagsets: Optional[list[tuple[str, ...]]]
2023 for alt in expanded_alts:
2024 baseparts = list(m.group(0) for m in word_re.finditer(alt))
2025 if alt_i > 0:
2026 tagsets, topics = decode_tags(" ".join(baseparts))
2027 if not any("error-unknown-tag" in x for x in tagsets):
2028 data_extend(data, "topics", topics)
2029 for tags1 in tagsets:
2030 data_extend(data, "tags", tags1)
2031 continue
2033 alt, tags = parse_head_final_tags(
2034 wxr, language or "MISSING_LANG", alt
2035 )
2036 tags = list(tags) # Make sure we don't modify anything cached
2037 tags.append("canonical")
2038 if alt_i == 0 and "," in wxr.wtp.title: # type:ignore[operator]
2039 # Kludge to handle article titles/phrases with commas.
2040 # basepart's regex strips commas, which leads to a
2041 # canonical form that is the title phrase without a comma.
2042 # basepart in add_related is almost immediately joined with
2043 # spaces anyhow. XXX not exactly sure why it's
2044 # canonicals.append((tags, baseparts)) and not (tags, [alt])
2045 baseparts = [alt]
2046 canonicals.append((tags, baseparts))
2047 for tags, baseparts in canonicals:
2048 add_related(
2049 wxr,
2050 data,
2051 tags,
2052 baseparts,
2053 text,
2054 len(canonicals) > 1,
2055 is_reconstruction,
2056 head_group,
2057 ruby,
2058 )
2060 # Handle parenthesized descriptors for the word form and links to
2061 # related words
2062 text = quote_kept_parens(text)
2063 parens = list(
2064 m.group(2)
2065 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text)
2066 )
2067 parens.extend(
2068 m.group(1)
2069 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text)
2070 )
2071 have_romanization = False
2072 have_ruby = False
2073 hiragana = ""
2074 katakana = ""
2075 for paren in parens:
2076 paren = paren.strip()
2077 if not paren: 2077 ↛ 2078line 2077 didn't jump to line 2078 because the condition on line 2077 was never true
2078 continue
2079 if paren.startswith("see "):
2080 continue
2081 if paren.startswith("U+"): 2081 ↛ 2082line 2081 didn't jump to line 2082 because the condition on line 2081 was never true
2082 continue
2083        # In some rare cases, strip the word that inflects from the form
2084 # description, e.g. "look through rose-tinted glasses"/English.
2085 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren)
2087 # If it starts with hiragana or katakana, treat as such form. Note
2088 # that each hiragana/katakana character is in separate parentheses,
2089 # so we must concatenate them.
2090 try:
2091 un = unicodedata.name(paren[0]).split()[0]
2092 except ValueError:
2093 un = "INVALID"
2094 if un == "KATAKANA": 2094 ↛ 2095line 2094 didn't jump to line 2095 because the condition on line 2094 was never true
2095 katakana += paren
2096 have_ruby = True
2097 continue
2098 if un == "HIRAGANA": 2098 ↛ 2099line 2098 didn't jump to line 2099 because the condition on line 2098 was never true
2099 hiragana += paren
2100 have_ruby = True
2101 continue
2103 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes,"
2104 # in the middle of the parenthesized expression, e.g. 薄
2105 def strokes_repl(m: re.Match) -> str:
2106 strokes1, tags1, strokes2, tags2 = m.groups()
2107 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]:
2108 tags = tags.split(", ")
2109 tags = list(
2110 "Mainland China" if t == "Mainland" else t for t in tags
2111 )
2112 tags.append("strokes")
2113 add_related(
2114 wxr,
2115 data,
2116 tags,
2117 [strokes],
2118 text,
2119 True,
2120 is_reconstruction,
2121 head_group,
2122 ruby,
2123 )
2124 return ", "
2126 paren = re.sub(
2127 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ",
2128 strokes_repl,
2129 paren,
2130 )
2132 descriptors = map_with(xlat_descs_map, [paren])
2133 new_desc = []
2134 for desc in descriptors:
2135 new_desc.extend(
2136 map_with(
2137 xlat_tags_map,
2138 split_at_comma_semi(desc, extra=[", or "], skipped=links),
2139 )
2140 )
2141 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None
2142 following_tags = None # Added to prev_tags from previous parenthesized
2143 # part, e.g. walrus/English
2144 # "(both nonstandard, proscribed, uncommon)"
2145 for desc_i, desc in enumerate(new_desc):
2146 # print("HEAD DESC: {!r}".format(desc))
2148 # Abort on certain descriptors (assume remaining values are
2149 # examples or uninteresting, cf. gaan/Navajo, horior/Latin)
2150 if re.match(r"^(per |e\.g\.$)", desc): 2150 ↛ 2151line 2150 didn't jump to line 2151 because the condition on line 2150 was never true
2151 break
2153 # If it all consists of CJK characters, add it with the
2154 # CJK tag. This is used at least for some Vietnamese
2155 # words (e.g., ba/Vietnamese)
2156 try:
2157 if all(unicodedata.name(x).startswith("CJK ") for x in desc): 2157 ↛ 2158line 2157 didn't jump to line 2158 because the condition on line 2157 was never true
2158 add_related(
2159 wxr,
2160 data,
2161 ["CJK"],
2162 [desc],
2163 text,
2164 True,
2165 is_reconstruction,
2166 head_group,
2167 ruby,
2168 )
2169 continue
2170 except ValueError:
2171 pass
2173 # Handle some special cases
2174 splitdesc = desc.split()
2175 if ( 2175 ↛ 2184line 2175 didn't jump to line 2184 because the condition on line 2175 was never true
2176 len(splitdesc) >= 3
2177 and splitdesc[1] == "superlative"
2178 and classify_desc(splitdesc[0]) != "tags"
2179 and prev_tags
2180 ):
2181 # Handle the special case of second comparative after comma,
2182 # followed by superlative without comma. E.g.
2183 # mal/Portuguese/Adv
2184 for ts in prev_tags:
2185 add_related(
2186 wxr,
2187 data,
2188 ts,
2189 [splitdesc[0]],
2190 text,
2191 True,
2192 is_reconstruction,
2193 head_group,
2194 ruby,
2195 )
2196 desc = " ".join(splitdesc[1:])
2197 elif ( 2197 ↛ 2205line 2197 didn't jump to line 2205 because the condition on line 2197 was never true
2198 len(splitdesc) == 2
2199 and splitdesc[0] in ("also", "and")
2200 and prev_tags
2201 and classify_desc(splitdesc[1]) != "tags"
2202 ):
2203 # Sometimes alternative forms are prefixed with "also" or
2204 # "and"
2205 for ts in prev_tags:
2206 add_related(
2207 wxr,
2208 data,
2209 ts,
2210 [splitdesc[1]],
2211 text,
2212 True,
2213 is_reconstruction,
2214 head_group,
2215 ruby,
2216 )
2217 continue
2218 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2218 ↛ 2219line 2218 didn't jump to line 2219 because the condition on line 2218 was never true
2219 continue
2221            # If only one word, assume it is a comma-separated alternative
2222 # to the previous one
2223 if " " not in desc:
2224 cls = classify_desc(desc)
2225 if cls != "tags":
2226 if prev_tags: 2226 ↛ 2228line 2226 didn't jump to line 2228 because the condition on line 2226 was never true
2227 # Assume comma-separated alternative to previous one
2228 for ts in prev_tags:
2229 add_related(
2230 wxr,
2231 data,
2232 ts,
2233 [desc],
2234 text,
2235 True,
2236 is_reconstruction,
2237 head_group,
2238 ruby,
2239 )
2240 continue
2241 elif distw(titleparts, desc) <= 0.5: 2241 ↛ 2244line 2241 didn't jump to line 2244 because the condition on line 2241 was never true
2242 # Similar to head word, assume a dialectal variation to
2243 # the base form. Cf. go/Alemannic German/Verb
2244 add_related(
2245 wxr,
2246 data,
2247 ["alternative"],
2248 [desc],
2249 text,
2250 True,
2251 is_reconstruction,
2252 head_group,
2253 ruby,
2254 )
2255 continue
2256 elif (
2257 cls in ("romanization", "english")
2258 and not have_romanization
2259 and classify_desc(titleword) == "other"
2260 and not (
2261 "categories" in data and desc in data["categories"]
2262 )
2263 ):
2264 # Assume it to be a romanization
2265 add_romanization(
2266 wxr,
2267 data,
2268 desc,
2269 text,
2270 is_reconstruction,
2271 head_group,
2272 ruby,
2273 )
2274 have_romanization = True
2275 continue
2277 m = re.match(r"^(\d+) strokes?$", desc)
2278 if m:
2279 # Special case, used to give #strokes for Han characters
2280 add_related(
2281 wxr,
2282 data,
2283 ["strokes"],
2284 [m.group(1)],
2285 text,
2286 True,
2287 is_reconstruction,
2288 head_group,
2289 ruby,
2290 )
2291 continue
2293 # See if it is radical+strokes
2294 m = re.match(
2295 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF"
2296 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)"
2297 r"( in (Japanese|Chinese|traditional Chinese|"
2298 r"simplified Chinese))?$",
2299 desc,
2300 )
2301 if m: 2301 ↛ 2304line 2301 didn't jump to line 2304 because the condition on line 2301 was never true
2302 # Special case, used to give radical + strokes for Han
2303 # characters
2304 radical_strokes = m.group(1)
2305 lang = m.group(3)
2306 t = ["radical+strokes"]
2307 if lang:
2308 t.extend(lang.split())
2309 add_related(
2310 wxr,
2311 data,
2312 t,
2313 [radical_strokes],
2314 text,
2315 True,
2316 is_reconstruction,
2317 head_group,
2318 ruby,
2319 )
2320 prev_tags = None
2321 following_tags = None
2322 continue
2324            # See if it indicates historical Katakana orthography (←) or
2325            # otherwise just a katakana/hiragana form
2326 m = re.match(r"←\s*|kana\s+", desc)
2327 if m: 2327 ↛ 2328line 2327 didn't jump to line 2328 because the condition on line 2327 was never true
2328 if desc.startswith("←"):
2329 t1 = "historical "
2330 else:
2331 t1 = ""
2332 x = desc[m.end() :]
2333 if x.endswith("?"):
2334 x = x[:-1]
2335 # XXX should we add a tag indicating uncertainty?
2336 if x:
2337 name = unicodedata.name(x[0])
2338 if name.startswith("HIRAGANA "):
2339 desc = t1 + "hiragana " + x
2340 elif name.startswith("KATAKANA "):
2341 desc = t1 + "katakana " + x
2343 # See if it is "n strokes in Chinese" or similar
2344 m = re.match(
2345 r"(\d+) strokes in (Chinese|Japanese|"
2346 r"traditional Chinese|simplified Chinese)$",
2347 desc,
2348 )
2349 if m: 2349 ↛ 2351line 2349 didn't jump to line 2351 because the condition on line 2349 was never true
2350 # Special case, used to give just strokes for some Han chars
2351 strokes = m.group(1)
2352 lang = m.group(2)
2353 t = ["strokes"]
2354 t.extend(lang.split())
2355 add_related(
2356 wxr,
2357 data,
2358 t,
2359 [strokes],
2360 text,
2361 True,
2362 is_reconstruction,
2363 head_group,
2364 ruby,
2365 )
2366 prev_tags = None
2367 following_tags = None
2368 continue
2370            # American Sign Language has images (or requests for an image)
2371            # as heads, plus this ASL gloss after them.
2372 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text)
2373 if m2: 2373 ↛ 2374line 2373 didn't jump to line 2374 because the condition on line 2373 was never true
2374 add_related(
2375 wxr,
2376 data,
2377 ["ASL-gloss"],
2378 [m2.group(1)],
2379 text,
2380 True,
2381 is_reconstruction,
2382 head_group,
2383 ruby,
2384 )
2385 continue
2387 parts = list(m.group(0) for m in re.finditer(word_re, desc))
2388 if not parts: 2388 ↛ 2389line 2388 didn't jump to line 2389 because the condition on line 2388 was never true
2389 prev_tags = None
2390 following_tags = None
2391 continue
2393 # Check for certain language-specific header part starts that
2394            # modify the tags applied to the form that follows
2395 if len(parts) == 2 and language in lang_specific_head_map: 2395 ↛ 2396line 2395 didn't jump to line 2396 because the condition on line 2395 was never true
2396 ht = lang_specific_head_map[language]
2397 if parts[0] in ht:
2398 rem_tags, add_tags = ht[parts[0]]
2399 new_prev_tags1: list[list[str]] = []
2400 tags2: Union[tuple[str, ...], list[str]]
2401 for tags2 in prev_tags or [()]:
2402 if rem_tags is True: # Remove all old tags
2403 tsets = set()
2404 else:
2405 tsets = set(tags2) - set(rem_tags.split())
2406 tsets = tsets | set(add_tags.split())
2407 tags = list(sorted(tsets))
2408 add_related(
2409 wxr,
2410 data,
2411 tags,
2412 [parts[1]],
2413 text,
2414 True,
2415 is_reconstruction,
2416 head_group,
2417 ruby,
2418 )
2419 new_prev_tags1.append(tags)
2420 prev_tags = new_prev_tags1
2421 following_tags = None
2422 continue
2424 # Handle the special case of descriptors that are parenthesized,
2425 # e.g., (archaic or Scotland)
2426 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc)
2427 if m is not None and classify_desc(m.group(1)) == "tags": 2427 ↛ 2428line 2427 didn't jump to line 2428 because the condition on line 2427 was never true
2428 tagpart = m.group(1)
2429 related = [m.group(2)]
2430 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True)
2431 if topics:
2432 wxr.wtp.debug(
2433 "parenthized head part {!r} contains topics: {}".format(
2434 tagpart, topics
2435 ),
2436 sortid="form_descriptions/1647",
2437 )
2438 elif m is not None and re.match(r"in the sense ", m.group(1)): 2438 ↛ 2441line 2438 didn't jump to line 2441 because the condition on line 2438 was never true
2439 # Handle certain ignored cases
2440 # e.g. bord/Danish: in the sense "plank"
2441 related = [m.group(2)]
2442 tagsets = [()]
2443 else:
2444 # Normal parsing of the descriptor
2445 alt_related = None
2446 alt_tagsets = None
2447 tagsets = None
2448 for i in range(len(parts), 0, -1):
2449 related = parts[i:]
2450 tagparts = parts[:i]
2451 # print(" i={} related={} tagparts={}"
2452 # .format(i, related, tagparts))
2453 tagsets, topics = decode_tags(
2454 " ".join(tagparts), no_unknown_starts=True
2455 )
2456 # print("tagparts={!r} tagsets={} topics={} related={} "
2457 # "alt_related={} distw={:.2f}"
2458 # .format(tagparts, tagsets, topics, related,
2459 # alt_related,
2460 # distw(titleparts, parts[i - 1])))
2461 if (
2462 topics
2463 or not tagsets
2464 or any("error-unknown-tag" in x for x in tagsets)
2465 ):
2466 if alt_related is not None: 2466 ↛ 2468line 2466 didn't jump to line 2468 because the condition on line 2466 was never true
2467 # We already had a good division, so let's stop.
2468 break
2469 # Bad division, try deeper
2470 continue
2471 # print(f"{parts[i-1]=}, {parts=}")
2472 if (
2473 i > 1
2474 and len(parts[i - 1]) >= 4
2475 and (
2476 distw(titleparts, parts[i - 1]) <= 0.4
2477 or (
2478 wxr.wtp.section == "English"
2479 and wxr.wtp.title
2480 in WORDS_WITH_FALSE_POSITIVE_TAGS
2481 and parts[i - 1]
2482 in WORDS_WITH_FALSE_POSITIVE_TAGS[wxr.wtp.title]
2483 )
2484 )
2485 # Fixes 'unaccountability' wiktext #1196
2486 and not (
2487 wxr.wtp.section == "English"
2488 and wxr.wtp.title in WORDS_WITH_FALSE_POSITIVE_FORMS
2489 and parts[i - 1]
2490 in WORDS_WITH_FALSE_POSITIVE_FORMS[wxr.wtp.title]
2491 )
2492 # Fixes wiktextract #983, where "participle"
2493 # was too close to "Martinize" and so this accepted
2494 # ["participle", "Martinize"] as matching; this
2495 # kludge prevents this from happening if titleparts
2496 # is shorter than what would be 'related'.
2497 # This breaks if we want to detect stuff that
2498 # actually gets an extra space-separated word when
2499 # 'inflected'.
2500 and (
2501 len(titleparts) >= len(parts[i - 1 :])
2502 or "or" in parts[i - 1 :]
2503 )
2504 ):
2505 # print(f"Reached; {parts=}, {parts[i-1]=}")
2506 alt_related = related
2507 alt_tagsets = tagsets
2508 continue
2509 alt_related = None
2510 alt_tagsets = None
2511 break
2512 else:
2513 if alt_related is None: 2513 ↛ 2545line 2513 didn't jump to line 2545 because the condition on line 2513 was always true
2514 # Check if the parenthesized part is likely a
2515 # romanization
2516 if ( 2516 ↛ 2524line 2516 didn't jump to line 2524 because the condition on line 2516 was never true
2517 (have_ruby or classify_desc(base) == "other")
2518 and classify_desc(paren) == "romanization"
2519 and not (
2520 "categories" in data
2521 and desc in data["categories"]
2522 )
2523 ):
2524 for r in split_at_comma_semi(
2525 paren, extra=[" or "], skipped=links
2526 ):
2527 add_romanization(
2528 wxr,
2529 data,
2530 r,
2531 text,
2532 is_reconstruction,
2533 head_group,
2534 ruby,
2535 )
2536 have_romanization = True
2537 continue
2538 tagsets = [("error-unrecognized-head-form",)]
2539 wxr.wtp.debug(
2540 "unrecognized head form: {}".format(desc),
2541 sortid="form_descriptions/1698",
2542 )
2543 continue
2545 if alt_related is not None: 2545 ↛ 2546line 2545 didn't jump to line 2546 because the condition on line 2545 was never true
2546 related = alt_related
2547 tagsets = alt_tagsets
2549 # print("FORM END: tagsets={} related={}".format(tagsets, related))
2550 # print("==================")
2552 if ( 2552 ↛ 2573line 2552 didn't jump to line 2573 because the condition on line 2552 was never true
2553 len(related) <= 0
2554 and wxr.wtp.section == "English"
2555 and tagsets is not None
2556 and len(tagsets) > 0
2557 and not any(
2558 s.startswith("error-") for tagset in tagsets for s in tagset
2559 )
2560 and any(
2561 s in FORM_ASSOCIATED_TAG_WORDS
2562 for tagset in tagsets
2563 for s in tagset
2564 )
2565 and (
2566 wxr.wtp.title not in FALSE_POSITIVE_MISSING_FORMS
2567 and not any(
2568 rel in FALSE_POSITIVE_MISSING_FORMS[wxr.wtp.title or ""]
2569 for rel in related
2570 )
2571 )
2572 ):
2573 wxr.wtp.debug(
2574 f"Form tags without form: {desc=}, {tagsets=}",
2575 sortid="form_description/20250107",
2576 )
2577 if not tagsets: 2577 ↛ 2578line 2577 didn't jump to line 2578 because the condition on line 2577 was never true
2578 continue
2580 # print(f"{alts=}, {related=}")
2582 assert isinstance(related, (list, tuple))
2583 related_str = " ".join(related)
2584 if "or" in titleparts:
2585 alts = [related_str]
2586 else:
2587 alts = split_at_comma_semi(
2588 related_str, separators=[r"\bor\b"], skipped=links
2589 )
2590 # print(f"{related_str=}, {alts=}")
2591 if not alts:
2592 alts = [""]
2593 for related_str in alts:
2594 if related_str:
2595 if prev_tags and (
2596 all(
2597 all(
2598 t in ["nonstandard", "dialectal"]
2599 or valid_tags[t] == "dialect"
2600 for t in tags
2601 )
2602 for ts in tagsets
2603 )
2604 or (
2605 any("participle" in ts for ts in prev_tags)
2606 and all(
2607 "attributive" in ts
2608 or any(valid_tags[t] == "gender" for t in ts)
2609 for ts in tagsets
2610 )
2611 )
2612 ):
2613 # Merged with previous tags. Don't update previous
2614 # tags here; cf. burn/English/Verb
2615 for tags_l in tagsets:
2616 for ts in prev_tags:
2617 tags_l1 = sorted(set(tags_l) | set(ts))
2618 add_related(
2619 wxr,
2620 data,
2621 tags_l1,
2622 [related_str],
2623 text,
2624 True,
2625 is_reconstruction,
2626 head_group,
2627 ruby,
2628 )
2629 else:
2630 # Not merged with previous tags
2631 for tags_l in tagsets:
2632 if following_tags is not None: 2632 ↛ 2633line 2632 didn't jump to line 2633 because the condition on line 2632 was never true
2633 for ts in following_tags:
2634 tags_l1 = list(
2635 sorted(set(tags_l) | set(ts))
2636 )
2637 add_related(
2638 wxr,
2639 data,
2640 tags_l1,
2641 [related_str],
2642 text,
2643 True,
2644 is_reconstruction,
2645 head_group,
2646 ruby,
2647 )
2648 else:
2649 ret = add_related(
2650 wxr,
2651 data,
2652 tags_l,
2653 [related_str],
2654 text,
2655 True,
2656 is_reconstruction,
2657 head_group,
2658 ruby,
2659 )
2660 if ret is not None: 2660 ↛ 2661line 2660 didn't jump to line 2661 because the condition on line 2660 was never true
2661 following_tags = ret
2662 prev_tags = tagsets
2663 else:
2664 if desc_i < len(new_desc) - 1 and all( 2664 ↛ 2671line 2664 didn't jump to line 2671 because the condition on line 2664 was never true
2665 "participle" in ts or "infinitive" in ts
2666 for ts in tagsets
2667 ):
2668 # Interpret it as a standalone form description
2669 # in the middle, probably followed by forms or
2670 # language-specific descriptors. cf. drikke/Danish
2671 new_prev_tags2 = []
2672 for ts1 in prev_tags or [()]:
2673 for ts2 in tagsets:
2674 ts = tuple(sorted(set(ts1) | set(ts2)))
2675 new_prev_tags2.append(ts)
2676 prev_tags = new_prev_tags2
2677 continue
2678 for tags in tagsets:
2679 data_extend(data, "tags", tags)
2680 prev_tags = tagsets
2681 following_tags = None
2683    # Finally, if we collected hiragana/katakana, add them now
2684 if hiragana: 2684 ↛ 2685line 2684 didn't jump to line 2685 because the condition on line 2684 was never true
2685 add_related(
2686 wxr,
2687 data,
2688 ["hiragana"],
2689 [hiragana],
2690 text,
2691 True,
2692 is_reconstruction,
2693 head_group,
2694 ruby,
2695 )
2696 if katakana: 2696 ↛ 2697line 2696 didn't jump to line 2697 because the condition on line 2696 was never true
2697 add_related(
2698 wxr,
2699 data,
2700 ["katakana"],
2701 [katakana],
2702 text,
2703 True,
2704 is_reconstruction,
2705 head_group,
2706 ruby,
2707 )
2709 # XXX check if this is actually relevant, tags in word root data
2710    # are extremely rare (not sure where they slip through).
2711 tags = data.get("tags", []) # type:ignore
2712 if len(tags) > 0:
2713 # wxr.wtp.debug(
2714 # f"Tags appear in word root data: {data['tags']=}", # type:ignore
2715 # sortid="form_descriptions/2620/20240606",
2716 # ) # Messes up tests.
2717 data["tags"] = sorted(set(tags)) # type:ignore
2720def parse_sense_qualifier(
2721 wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData]
2722) -> None:
2723 """Parses tags or topics for a sense or some other data. The values are
2724 added into the dictionary ``data``."""
2725 assert isinstance(wxr, WiktextractContext)
2726 assert isinstance(text, str)
2727 assert isinstance(data, dict)
2728 # print("parse_sense_qualifier:", text)
2729 if re.match(r"\([^()]+\)$", text): 2729 ↛ 2730line 2729 didn't jump to line 2730 because the condition on line 2729 was never true
2730 text = text[1:-1]
2731 if re.match(r'"[^"]+"$', text): 2731 ↛ 2732line 2731 didn't jump to line 2732 because the condition on line 2731 was never true
2732 text = text[1:-1]
2733 lst = map_with(xlat_descs_map, [text])
2734 sense_tags: list[str] = []
2735 for text in lst:
2736 for semi in split_at_comma_semi(text):
2737 if not semi: 2737 ↛ 2738line 2737 didn't jump to line 2738 because the condition on line 2737 was never true
2738 continue
2739 orig_semi = semi
2740 idx = semi.find(":")
2741 if idx >= 0: 2741 ↛ 2742line 2741 didn't jump to line 2742 because the condition on line 2741 was never true
2742 semi = semi[:idx]
2743 cls = classify_desc(semi, allow_unknown_tags=True)
2744 # print("parse_sense_qualifier: classify_desc: {} -> {}"
2745 # .format(semi, cls))
2746 if cls == "tags":
2747 tagsets, topics = decode_tags(semi)
2748 data_extend(data, "topics", topics)
2749 # XXX should think how to handle distinct options better,
2750 # e.g., "singular and plural genitive"; that can't really be
2751                # done without changing the calling convention of this function.
2752 # Should split sense if more than one category of tags differs.
2753 for tags in tagsets:
2754 sense_tags.extend(tags)
2755 elif cls == "taxonomic": 2755 ↛ 2756line 2755 didn't jump to line 2756 because the condition on line 2755 was never true
2756 if re.match(r"×[A-Z]", semi):
2757 sense_tags.append("extinct")
2758 semi = semi[1:]
2759 data["taxonomic"] = semi
2760 elif cls == "english":
2761 if "qualifier" in data and data["qualifier"] != orig_semi: 2761 ↛ 2762line 2761 didn't jump to line 2762 because the condition on line 2761 was never true
2762 data["qualifier"] += "; " + orig_semi
2763 else:
2764 data["qualifier"] = orig_semi
2765 else:
2766 wxr.wtp.debug(
2767 "unrecognized sense qualifier: {}".format(text),
2768 sortid="form_descriptions/1831",
2769 )
2770 sense_tags = sorted(set(sense_tags))
2771 data_extend(data, "tags", sense_tags)
2774def parse_pronunciation_tags(
2775 wxr: WiktextractContext, text: str, data: SoundData
2776) -> None:
2777 assert isinstance(wxr, WiktextractContext)
2778 assert isinstance(text, str)
2779 assert isinstance(data, dict)
2780 text = text.strip()
2781 if not text: 2781 ↛ 2782line 2781 didn't jump to line 2782 because the condition on line 2781 was never true
2782 return
2783 cls = classify_desc(text)
2784 notes = []
2785 if cls == "tags":
2786 tagsets, topics = decode_tags(text)
2787 data_extend(data, "topics", topics)
2788 for tagset in tagsets:
2789 for t in tagset:
2790 if " " in t: 2790 ↛ 2791line 2790 didn't jump to line 2791 because the condition on line 2790 was never true
2791 notes.append(t)
2792 else:
2793 data_append(data, "tags", t)
2794 else:
2795 notes.append(text)
2796 if notes:
2797 data["note"] = "; ".join(notes)
2800def parse_translation_desc(
2801 wxr: WiktextractContext, lang: str, text: str, tr: TranslationData
2802) -> None:
2803 assert isinstance(wxr, WiktextractContext)
2804 assert isinstance(lang, str) # The language of ``text``
2805 assert isinstance(text, str)
2806 assert isinstance(tr, dict)
2807 # print("parse_translation_desc:", text)
2809 # Process all parenthesized parts from the translation item
2810 note = None
2811 restore_beginning = ""
2812 restore_end = ""
2813 while True:
2814 beginning = False
2815 # See if we can find a parenthesized expression at the end
2816 m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text)
2817 if m:
2818 par = m.group(1)
2819 text = text[: m.start()]
2820 if par.startswith(("literally ", "lit.")):
2821 continue # Not useful for disambiguation in many idioms
2822 else:
2823 # See if we can find a parenthesized expression at the start
2824 m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text)
2825 if m:
2826 par = m.group(1)
2827 text = text[m.end() :]
2828 beginning = True
2829 if re.match(r"^(\d|\s|,| or | and )+$", par): 2829 ↛ 2834line 2829 didn't jump to line 2834 because the condition on line 2829 was never true
2830 # Looks like this beginning parenthesized expression only
2831 # contains digits or their combinations. We assume such
2832 # to be sense descriptions if no sense has been selected,
2833 # or otherwise just ignore them.
2834 if not tr.get("sense"):
2835 tr["sense"] = par
2836 continue
2837 else:
2838 # See if we can find a parenthesized expression in the middle.
2839            # Romanizations are sometimes between the word and the gender marker,
2840 # e.g. wife/English/Tr/Yiddish.
2841 m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text)
2842 if m:
2843 par = m.group(1)
2844 text = text[: m.start()] + text[m.end() :]
2845 else:
2846 # No more parenthesized expressions - break out of the loop
2847 break
2849 # Some cleanup of artifacts that may result from skipping some templates
2850 # in earlier stages
2851 if par.startswith(": "): 2851 ↛ 2852line 2851 didn't jump to line 2852 because the condition on line 2851 was never true
2852 par = par[2:]
2853 if par.endswith(","): 2853 ↛ 2854line 2853 didn't jump to line 2854 because the condition on line 2853 was never true
2854 par = par[:-1]
2855 if re.match(r'^[“"]([^“”"]*)[“”"]$', par): 2855 ↛ 2856line 2855 didn't jump to line 2856 because the condition on line 2855 was never true
2856 par = par[1:-1]
2857 par = par.strip()
2859 # Check for special script pronunciation followed by romanization,
2860 # used in many Asian languages.
2861 lst = par.split(", ")
2862 if len(lst) == 2:
2863 a, r = lst
2864 if classify_desc(a) == "other":
2865 cls = classify_desc(r)
2866 # print("parse_translation_desc: r={} cls={}".format(r, cls))
2867 if cls == "romanization" or (
2868 cls == "english" and len(r.split()) == 1 and r[0].islower()
2869 ):
2870 if tr.get("alt") and tr.get("alt") != a: 2870 ↛ 2871line 2870 didn't jump to line 2871 because the condition on line 2870 was never true
2871 wxr.wtp.debug(
2872 'more than one value in "alt": {} vs. {}'.format(
2873 tr["alt"], a
2874 ),
2875 sortid="form_descriptions/1930",
2876 )
2877 tr["alt"] = a
2878 if tr.get("roman") and tr.get("roman") != r: 2878 ↛ 2879line 2878 didn't jump to line 2879 because the condition on line 2878 was never true
2879 wxr.wtp.debug(
2880 'more than one value in "roman": {} vs. {}'.format(
2881 tr["roman"], r
2882 ),
2883 sortid="form_descriptions/1936",
2884 )
2885 tr["roman"] = r
2886 continue
2888 # Check for certain comma-separated tags combined with English text
2889 # at the beginning or end of a comma-separated parenthesized list
2890 while len(lst) > 1:
2891 cls = classify_desc(lst[0])
2892 if cls == "tags": 2892 ↛ 2893line 2892 didn't jump to line 2893 because the condition on line 2892 was never true
2893 tagsets, topics = decode_tags(lst[0])
2894 for t in tagsets:
2895 data_extend(tr, "tags", t)
2896 data_extend(tr, "topics", topics)
2897 lst = lst[1:]
2898 continue
2899 cls = classify_desc(lst[-1])
2900 if cls == "tags":
2901 tagsets, topics = decode_tags(lst[-1])
2902 for t in tagsets:
2903 data_extend(tr, "tags", t)
2904 data_extend(tr, "topics", topics)
2905 lst = lst[:-1]
2906 continue
2907 break
2908 par = ", ".join(lst)
2910 if not par: 2910 ↛ 2911line 2910 didn't jump to line 2911 because the condition on line 2910 was never true
2911 continue
2912 if re.search(tr_ignored_parens_re, par): 2912 ↛ 2913line 2912 didn't jump to line 2913 because the condition on line 2912 was never true
2913 continue
2914 if par.startswith("numeral:"):
2915 par = par[8:].strip()
2917 # Classify the part in parenthesis and process accordingly
2918 cls = classify_desc(par)
2919 # print("parse_translation_desc classify: {!r} -> {}"
2920 # .format(par, cls))
2921 if par == text:
2922 pass
2923 if par == "f": 2923 ↛ 2924line 2923 didn't jump to line 2924 because the condition on line 2923 was never true
2924 data_append(tr, "tags", "feminine")
2925 elif par == "m": 2925 ↛ 2926line 2925 didn't jump to line 2926 because the condition on line 2925 was never true
2926 data_append(tr, "tags", "masculine")
2927 elif cls == "tags":
2928 tagsets, topics = decode_tags(par)
2929 for tags in tagsets:
2930 data_extend(tr, "tags", tags)
2931 data_extend(tr, "topics", topics)
2932 elif cls == "english":
2933 # If the text contains any of certain grammatical words, treat it
2934 # as a "note" instead of "english"
2935 if re.search(tr_note_re, par):
2936 if par.endswith(":"): 2936 ↛ 2937line 2936 didn't jump to line 2937 because the condition on line 2936 was never true
2937 par = par[:-1]
2938 if par not in ("see entry for forms",): 2938 ↛ 2813line 2938 didn't jump to line 2813 because the condition on line 2938 was always true
2939 if note: 2939 ↛ 2940line 2939 didn't jump to line 2940 because the condition on line 2939 was never true
2940 note = note + ";" + par
2941 else:
2942 note = par
2943 else:
2944 # There can be more than one parenthesized english item, see
2945 # e.g. Aunt/English/Translations/Tamil
2946 if "translation" in tr and "english" in tr:
2947 tr["english"] += "; " + par # DEPRECATED for "translation"
2948 tr["translation"] += "; " + par
2949 else:
2950 tr["english"] = par # DEPRECATED for "translation"
2951 tr["translation"] = par
2952 elif cls == "romanization":
2953 # print("roman text={!r} text cls={}"
2954 # .format(text, classify_desc(text)))
2955 if classify_desc(text) in (
2956 "english",
2957 "romanization",
2958 ) and lang not in ("Egyptian",):
2959 if beginning:
2960 restore_beginning += "({}) ".format(par)
2961 else:
2962 restore_end = " ({})".format(par) + restore_end
2963 else:
2964 if tr.get("roman"): 2964 ↛ 2965line 2964 didn't jump to line 2965 because the condition on line 2964 was never true
2965 wxr.wtp.debug(
2966 'more than one value in "roman": {} vs. {}'.format(
2967 tr["roman"], par
2968 ),
2969 sortid="form_descriptions/2013",
2970 )
2971 tr["roman"] = par
2972 elif cls == "taxonomic": 2972 ↛ 2973line 2972 didn't jump to line 2973 because the condition on line 2972 was never true
2973 if tr.get("taxonomic"):
2974 wxr.wtp.debug(
2975 'more than one value in "taxonomic": {} vs. {}'.format(
2976 tr["taxonomic"], par
2977 ),
2978 sortid="form_descriptions/2019",
2979 )
2980 if re.match(r"×[A-Z]", par):
2981 data_append(tr, "tags", "extinct")
2982 par = par[1:]
2983 tr["taxonomic"] = par
2984 elif cls == "other": 2984 ↛ 2994line 2984 didn't jump to line 2994 because the condition on line 2984 was always true
2985 if tr.get("alt"): 2985 ↛ 2986line 2985 didn't jump to line 2986 because the condition on line 2985 was never true
2986 wxr.wtp.debug(
2987 'more than one value in "alt": {} vs. {}'.format(
2988 tr["alt"], par
2989 ),
2990 sortid="form_descriptions/2028",
2991 )
2992 tr["alt"] = par
2993 else:
2994 wxr.wtp.debug(
2995 "parse_translation_desc unimplemented cls {}: {}".format(
2996 cls, par
2997 ),
2998 sortid="form_descriptions/2033",
2999 )
3001 # Check for gender indications in suffix
3002 text, final_tags = parse_head_final_tags(wxr, lang, text)
3003 data_extend(tr, "tags", final_tags)
3005 # Restore those parts that we did not want to remove (they are often
3006 # optional words or words that are always used with the given translation)
3007 text = restore_beginning + text + restore_end
3009 if note:
3010 tr["note"] = note.strip()
3011 if text and text not in ignored_translations:
3012 tr["word"] = text.strip()
3014 # Sometimes gender seems to be at the end of "roman" field, see e.g.
3015 # fire/English/Noun/Translations/Egyptian (for "oxidation reaction")
3016 roman = tr.get("roman")
3017 if roman:
3018 if roman.endswith(" f"): 3018 ↛ 3019line 3018 didn't jump to line 3019 because the condition on line 3018 was never true
3019 data_append(tr, "tags", "feminine")
3020 tr["roman"] = roman[:-2].strip()
3021 elif roman.endswith(" m"): 3021 ↛ 3022line 3021 didn't jump to line 3022 because the condition on line 3021 was never true
3022 data_append(tr, "tags", "masculine")
3023 tr["roman"] = roman[:-2].strip()
3025 # If the word now has "translation" field but no "roman" field, and
3026 # the word would be classified "other" (generally non-latin
3027 # characters), and the value in "translation" is only one lowercase
3028 # word, move it to "roman". This happens semi-frequently when the
3029 # translation is transliterated the same as some English word.
3030 roman = tr.get("roman")
3031 english = tr.get("translation")
3032 if english and not roman and "word" in tr:
3033 cls = classify_desc(tr["word"])
3034 if cls == "other" and " " not in english and english[0].islower():
3035 del tr["translation"]
3036 if "english" in tr: # DEPRECATED for "translation" 3036 ↛ 3038line 3036 didn't jump to line 3038 because the condition on line 3036 was always true
3037 del tr["english"]
3038 tr["roman"] = english
3040 # If the entry now has both tr["roman"] and tr["word"] and they have
3041 # the same value, delete tr["roman"] (e.g., man/English/Translations
3042 # Evenki)
3043 if tr.get("word") and tr.get("roman") == tr.get("word"): 3043 ↛ 3044line 3043 didn't jump to line 3044 because the condition on line 3043 was never true
3044 del tr["roman"]
3047def parse_alt_or_inflection_of(
3048 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
3049) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
3050 """Tries to parse an inflection-of or alt-of description. If successful,
3051 this returns (tags, alt-of/inflection-of-dict). If the description cannot
3052 be parsed, this returns None. This may also return (tags, None) when the
3053 gloss describes a form (or some other tags were extracted from it), but
3054 there was no alt-of/form-of/synonym-of word."""
3055 # print("parse_alt_or_inflection_of: {!r}".format(gloss))
3056 # Occasionally inflection_of/alt_of have "A(n) " etc. at the beginning.
3058 # Never interpret a gloss that is equal to the word itself as a tag
3059 # (e.g., instrumental/Romanian, instrumental/Spanish).
3060 if gloss.lower() == wxr.wtp.title.lower() or ( # type:ignore[union-attr]
3061 len(gloss) >= 5 and distw([gloss.lower()], wxr.wtp.title.lower()) < 0.2 # type:ignore[union-attr]
3062 ):
3063 return None
3065 # First try parsing it as-is
3066 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
3067 if parsed is not None:
3068 return parsed
3070 # Next try parsing it with the first character converted to lowercase if
3071 # it was previously uppercase.
3072 if gloss and gloss[0].isupper():
3073 gloss = gloss[0].lower() + gloss[1:]
3074 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
3075 if parsed is not None:
3076 return parsed
3078 return None
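
# Illustrative sketch, not part of the original module: a gloss such as
# "genitive singular of casa" would typically parse into something like
#     (["form-of", "genitive", "singular"], [{"word": "casa"}])
# whereas a gloss with no recognizable form-of/alt-of structure returns None.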
3081# These tags are not allowed in alt-or-inflection-of parsing
3082alt_infl_disallowed: set[str] = set(
3083 [
3084 "error-unknown-tag",
3085 "place", # Not in inflected forms and causes problems e.g. house/English
3086 ]
3087)
3090def parse_alt_or_inflection_of1(
3091 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
3092) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
3093 """Helper function for parse_alt_or_inflection_of. This handles a single
3094 capitalization."""
3095 if not gloss or not gloss.strip(): 3095 ↛ 3096line 3095 didn't jump to line 3096 because the condition on line 3095 was never true
3096 return None
3098 # Prevent some common errors where we would parse something we shouldn't
3099 if re.search(r"(?i)form of address ", gloss): 3099 ↛ 3100line 3099 didn't jump to line 3100 because the condition on line 3099 was never true
3100 return None
3102 gloss = re.sub(r"only used in [^,]+, ", "", gloss)
3104 # First try all formats ending with "of" (or other known last words that
3105 # can end a form description)
3106 matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss))
3107 m: Optional[re.Match]
3108 for m in reversed(matches):
3109 desc = gloss[: m.end()].strip()
3110 base = gloss[m.end() :].strip()
3111 tagsets, topics = decode_tags(desc, no_unknown_starts=True)
3112 if not topics and any(
3113 not (alt_infl_disallowed & set(ts)) for ts in tagsets
3114 ):
3115 # Successfully parsed, including "of" etc.
3116 tags: list[str] = []
3117 # If you have ("Western-Armenian", ..., "form-of") as your
3118 # tag set, it's most probable that it's something like
3119 # "Western Armenian form of խոսել (xosel)", which should
3120 # get "alt-of" instead of "form-of" (inflection).
3121 # խօսիլ/Armenian
3122 for ts_t in tagsets:
3123 if "form-of" in ts_t and any(
3124 valid_tags.get(tk) == "dialect" for tk in ts_t
3125 ):
3126 ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"}
3127 else:
3128 ts_s = set(ts_t)
3129 if not (alt_infl_disallowed & ts_s): 3129 ↛ 3122line 3129 didn't jump to line 3122 because the condition on line 3129 was always true
3130 tags.extend(ts_s)
3131 if (
3132 "alt-of" in tags
3133 or "form-of" in tags
3134 or "synonym-of" in tags
3135 or "compound-of" in tags
3136 ):
3137 break
3138 if m.group(1) == "of":
3139 # Try parsing without the final "of". This is commonly used in
3140 # various form-of expressions.
3141 desc = gloss[: m.start()]
3142 base = gloss[m.end() :]
3143 tagsets, topics = decode_tags(desc, no_unknown_starts=True)
3144 # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}"
3145 # .format(desc, base, tagsets, topics))
3146 if not topics and any(
3147 not (alt_infl_disallowed & set(t)) for t in tagsets
3148 ):
3149 tags = []
3150 for t in tagsets:
3151 if not (alt_infl_disallowed & set(t)): 3151 ↛ 3150line 3151 didn't jump to line 3150 because the condition on line 3151 was always true
3152 tags.extend(t)
3153 # It must have at least one tag from form_of_tags
3154 if set(tags) & form_of_tags:
3155 # Accept this as form-of
3156 tags.append("form-of")
3157 break
3158 if set(tags) & alt_of_tags:
3159 # Accept this as alt-of
3160 tags.append("alt-of")
3161 break
3163 else:
3164 # Did not find a form description based on last word; see if the
3165 # whole description is tags
3166 tagsets, topics = decode_tags(gloss, no_unknown_starts=True)
3167 if not topics and any(
3168 not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts)
3169 for ts in tagsets
3170 ):
3171 tags = []
3172 for ts in tagsets:
3173 if not (alt_infl_disallowed & set(ts)) and form_of_tags & set( 3173 ↛ 3172line 3173 didn't jump to line 3172 because the condition on line 3173 was always true
3174 ts
3175 ):
3176 tags.extend(ts)
3177 base = ""
3178 else:
3179 return None
3181 # kludge for Spanish (again): 'x of [word] combined with [clitic]'
3182 m = re.search(r"combined with \w+$", base)
3183 if m: 3183 ↛ 3184line 3183 didn't jump to line 3184 because the condition on line 3183 was never true
3184 tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True)
3185 if not topics:
3186 for ts in tagsets:
3187 tags.extend(ts)
3188 base = base[: m.start()]
3190 # It is fairly common for form_of glosses to end with something like
3191 # "ablative case" or "in instructive case". Parse that ending.
3192 base = base.strip()
3193 lst = base.split()
3194 # print("parse_alt_or_inflection_of: lst={}".format(lst))
3195 if len(lst) >= 3 and lst[-1] in ("case", "case."): 3195 ↛ 3196line 3195 didn't jump to line 3196 because the condition on line 3195 was never true
3196 node = valid_sequences.children.get(lst[-2])
3197 if node and node.end:
3198 for s in node.tags:
3199 tags.extend(s.split(" "))
3200 lst = lst[:-2]
3201 if lst[-1] == "in" and len(lst) > 1:
3202 lst = lst[:-1]
3204 # Eliminate empty and duplicate tags
3205 tags = sorted(set(t for t in tags if t))
3207 # Clean up some extra stuff from the linked word, separating the text
3208 # into ``base`` (the linked word) and ``extra`` (additional information,
3209 # such as English translation or clarifying word sense information).
3210 orig_base = base
3211 base = re.sub(alt_of_form_of_clean_re, "", orig_base)
3212 base = re.sub(r" [(⟨][^()]*[)⟩]", "", base) # Remove all (...) groups
3213 extra = orig_base[len(base) :]
3214 extra = re.sub(r"^[- :;.,,—]+", "", extra)
3215 if extra.endswith(".") and extra.count(".") == 1:
3216 extra = extra[:-1].strip()
3217 m = re.match(r"^\(([^()]*)\)$", extra)
3218 if m: 3218 ↛ 3219line 3218 didn't jump to line 3219 because the condition on line 3218 was never true
3219 extra = m.group(1)
3220 else:
3221        # These weird brackets are used in "slash mark"
3222 m = re.match(r"^⟨([^()]*)⟩$", extra)
3223 if m: 3223 ↛ 3224line 3223 didn't jump to line 3224 because the condition on line 3223 was never true
3224 extra = m.group(1)
3225 m = re.match(r'^[“"]([^"“”]*)["”]$', extra)
3226 if m: 3226 ↛ 3227line 3226 didn't jump to line 3227 because the condition on line 3226 was never true
3227 extra = m.group(1)
3228 # Note: base might still contain comma-separated values and values
3229 # separated by "and"
3230 base = base.strip()
3231 if base.endswith(",") and len(base) > 2: 3231 ↛ 3232line 3231 didn't jump to line 3232 because the condition on line 3231 was never true
3232 base = base[:-1].strip()
3233 while (
3234 base.endswith(".")
3235 and not wxr.wtp.page_exists(base)
3236 and base not in gloss_template_args
3237 ):
3238 base = base[:-1].strip()
3239 if base.endswith('(\u201cconjecture")'): 3239 ↛ 3240line 3239 didn't jump to line 3240 because the condition on line 3239 was never true
3240 base = base[:-14].strip()
3241 tags.append("conjecture")
3242 while ( 3242 ↛ 3247line 3242 didn't jump to line 3247 because the condition on line 3242 was never true
3243 base.endswith(".")
3244 and not wxr.wtp.page_exists(base)
3245 and base not in gloss_template_args
3246 ):
3247 base = base[:-1].strip()
3248 if ( 3248 ↛ 3253line 3248 didn't jump to line 3253 because the condition on line 3248 was never true
3249 base.endswith(".")
3250 and base not in gloss_template_args
3251 and base[:-1] in gloss_template_args
3252 ):
3253 base = base[:-1]
3254 base = base.strip()
3255 if not base:
3256 return tags, None
3258 # Kludge: Spanish verb forms seem to have a dot added at the end.
3259 # Remove it; we know of no Spanish verbs ending with a dot.
3260 language = wxr.wtp.section
3261 pos = wxr.wtp.subsection
3262 # print("language={} pos={} base={}".format(language, pos, base))
3263 if ( 3263 ↛ 3269line 3263 didn't jump to line 3269 because the condition on line 3263 was never true
3264 base.endswith(".")
3265 and len(base) > 1
3266 and base[-2].isalpha()
3267 and (language == "Spanish" and pos == "Verb")
3268 ):
3269 base = base[:-1]
3271    # Split base into alternatives when multiple alternatives are provided
3272 parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "])
3273 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "")
3274 if (
3275 len(parts) <= 1
3276 or base.startswith("/")
3277 or base.endswith("/")
3278 or "/" in titleword
3279 ):
3280 parts = [base]
3281    # Split base into alternatives when of the form "a or b" and "a" and "b" are
3282 # similar (generally spelling variants of the same word or similar words)
3283 if len(parts) == 1:
3284 pp = base.split()
3285 if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4:
3286 parts = [pp[0], pp[2]]
3288 # Create form-of/alt-of entries based on the extracted data
3289 dt_lst: list[AltOf] = []
3290 for p in parts:
3291 # Check for some suspicious base forms
3292 m = re.search(r"[.,] |[{}()]", p)
3293 if m and not wxr.wtp.page_exists(p): 3293 ↛ 3294line 3293 didn't jump to line 3294 because the condition on line 3293 was never true
3294 wxr.wtp.debug(
3295 "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p),
3296 sortid="form_descriptions/2278",
3297 )
3298 if p.startswith("*") and len(p) >= 3 and p[1].isalpha(): 3298 ↛ 3299line 3298 didn't jump to line 3299 because the condition on line 3298 was never true
3299 p = p[1:]
3300 dt: AltOf = {"word": p}
3301 if extra:
3302 dt["extra"] = extra
3303 dt_lst.append(dt)
3304 # print("alt_or_infl_of returning tags={} lst={} base={!r}"
3305 # .format(tags, lst, base))
3306 return tags, dt_lst
3309@functools.lru_cache(maxsize=65536)
3310def classify_desc(
3311 desc: str,
3312 allow_unknown_tags=False,
3313 no_unknown_starts=False,
3314 accepted: Union[tuple[str, ...], frozenset[str]] = tuple(),
3315) -> str:
3316 """Determines whether the given description is most likely tags, english,
3317    a taxonomic name, a romanization, or something else. Returns one of:
3318    "tags", "english", "taxonomic", "romanization", or "other". If
3319    ``allow_unknown_tags`` is True, then allow "tags" classification even
3320    when the only tags start with a word in allowed_unknown_starts."""
3321 assert isinstance(desc, str)
3322 # Empty and whitespace-only strings are treated as "other"
3323 desc = desc.strip()
3324 if not desc:
3325 return "other"
3327 normalized_desc = unicodedata.normalize("NFKD", desc)
3329 # If it can be fully decoded as tags without errors, treat as tags
3330 tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts)
3331 for tagset in tagsets:
3332 assert isinstance(tagset, (list, tuple, set))
3333 if "error-unknown-tag" not in tagset and (
3334 topics or allow_unknown_tags or any(" " not in x for x in tagset)
3335 ):
3336 return "tags"
3338 # Check if it looks like the taxonomic name of a species
3339 if desc in known_species:
3340 return "taxonomic"
3341 desc1 = re.sub(r"^×([A-Z])", r"\1", desc)
3342 desc1 = re.sub(r"\s*×.*", "", desc1)
3343 lst = desc1.split()
3344 if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts:
3345 have_non_english = 1 if lst[0].lower() not in english_words else 0
3346 for x in lst[1:]:
3347 if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"):
3348 continue
3349 if x[0].isupper():
3350 break
3351 if x not in english_words:
3352 have_non_english += 1
3353 else:
3354 # Starts with known taxonomic term, does not contain uppercase
3355 # words (except allowed letters) and at least one word is not
3356 # English
3357 if have_non_english >= len(lst) - 1 and have_non_english > 0: 3357 ↛ 3363line 3357 didn't jump to line 3363 because the condition on line 3357 was always true
3358 return "taxonomic"
3360 # If all words are in our English dictionary, interpret as English.
3361 # [ -~] is regex black magic, "ALL CHARACTERS from space to tilde"
3362 # in ASCII. Took me a while to figure out.
3363 if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1:
3364 if desc in english_words and desc[0].isalpha():
3365 return "english" # Handles ones containing whitespace
3366 desc1 = re.sub(
3367 tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc
3368 )
3369 tokens = tokenizer.tokenize(desc1)
3370 if not tokens: 3370 ↛ 3371line 3370 didn't jump to line 3371 because the condition on line 3370 was never true
3371 return "other"
3372 lst_bool = list(
3373 x not in not_english_words
3374 and
3375 # not x.isdigit() and
3376 (
3377 x in english_words
3378 or x.lower() in english_words
3379 or x in known_firsts
3380 or x[0].isdigit()
3381 or x in accepted
3382 or
3383 # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or
3384 (
3385 x.endswith("s") and len(x) >= 4 and x[:-1] in english_words
3386 ) # Plural
3387 or (
3388 x.endswith("ies")
3389 and len(x) >= 5
3390 and x[:-3] + "y" in english_words
3391 ) # E.g. lily - lilies
3392 or (
3393 x.endswith("ing")
3394 and len(x) >= 5
3395 and x[:-3] in english_words
3396 ) # E.g. bring - bringing
3397 or (
3398 x.endswith("ing")
3399 and len(x) >= 5
3400 and x[:-3] + "e" in english_words
3401 ) # E.g., tone - toning
3402 or (
3403 x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words
3404 ) # E.g. hang - hanged
3405 or (
3406 x.endswith("ed")
3407 and len(x) >= 5
3408 and x[:-2] + "e" in english_words
3409 ) # E.g. atone - atoned
3410 or (x.endswith("'s") and x[:-2] in english_words)
3411 or (x.endswith("s'") and x[:-2] in english_words)
3412 or (
3413 x.endswith("ise")
3414 and len(x) >= 5
3415 and x[:-3] + "ize" in english_words
3416 )
3417 or (
3418 x.endswith("ised")
3419 and len(x) >= 6
3420 and x[:-4] + "ized" in english_words
3421 )
3422 or (
3423 x.endswith("ising")
3424 and len(x) >= 7
3425 and x[:-5] + "izing" in english_words
3426 )
3427 or (
3428 re.search(r"[-/]", x)
3429 and all(
3430 ((y in english_words and len(y) > 2) or not y)
3431 for y in re.split(r"[-/]", x)
3432 )
3433 )
3434 )
3435 for x in tokens
3436 )
3437 cnt = lst_bool.count(True)
3438 rejected_words = tuple(
3439 x for i, x in enumerate(tokens) if not lst_bool[i]
3440 )
3441 if (
3442 any(
3443 lst_bool[i] and x[0].isalpha() and len(x) > 1
3444 for i, x in enumerate(tokens)
3445 )
3446 and not desc.startswith("-")
3447 and not desc.endswith("-")
3448 and re.search(r"\w+", desc)
3449 and (
3450 cnt == len(lst_bool)
3451 or (
3452 any(
3453 lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens)
3454 )
3455 and cnt >= len(lst_bool) - 1
3456 )
3457 or cnt / len(lst_bool) >= 0.8
3458 or (
3459 all(x in potentially_english_words for x in rejected_words)
3460 and cnt / len(lst_bool) >= 0.50
3461 )
3462 )
3463 ):
3464 return "english"
3465 # Some translations have apparent pronunciation descriptions in /.../
3466 # which we'll put in the romanization field (even though they probably are
3467 # not exactly romanizations).
3468 if desc.startswith("/") and desc.endswith("/"):
3469 return "romanization"
3470 # If all characters are in classes that could occur in romanizations,
3471 # treat as romanization
3472 classes = list(
3473 unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK"
3474 for x in normalized_desc
3475 )
3476 classes1 = []
3477 num_latin = 0
3478 num_greek = 0
3479 # part = ""
3480 # for ch, cl in zip(normalized_desc, classes):
3481 # part += f"{ch}({cl})"
3482 # print(part)
3483 for ch, cl in zip(normalized_desc, classes):
3484 if ch in (
3485 "'", # ' in Arabic, / in IPA-like parenthesized forms
3486 ".", # e.g., "..." in translations
3487 ";",
3488 ":",
3489 "!",
3490 "‘",
3491 "’",
3492 '"',
3493 "“",
3494 "”",
3495 "/",
3496 "?",
3497 "…", # alternative to "..."
3498 "⁉", # 見る/Japanese automatic transcriptions...
3499 "?",
3500 "!",
3501 "⁻", # superscript -, used in some Cantonese roman, e.g. "we"
3502 "ʔ",
3503 "ʼ",
3504 "ʾ",
3505 "ʹ",
3506 ): # ʹ e.g. in understand/English/verb Russian transl
3507 classes1.append("OK")
3508 continue
3509 if cl not in ("Ll", "Lu"):
3510 classes1.append(cl)
3511 continue
3512 try:
3513 name = unicodedata.name(ch)
3514 first = name.split()[0]
3515 if first == "LATIN":
3516 num_latin += 1
3517 elif first == "GREEK":
3518 num_greek += 1
3519 elif first == "COMBINING": # Combining diacritic 3519 ↛ 3520line 3519 didn't jump to line 3520 because the condition on line 3519 was never true
3520 cl = "OK"
3521 elif re.match(non_latin_scripts_re, name): 3521 ↛ 3525line 3521 didn't jump to line 3525 because the condition on line 3521 was always true
3522 cl = "NO" # Not acceptable in romanizations
3523 except ValueError:
3524 cl = "NO" # Not acceptable in romanizations
3525 classes1.append(cl)
3526 # print("classify_desc: {!r} classes1: {}".format(desc, classes1))
3527 # print(set(classes1) )
3528 if all(
3529 x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK")
3530 for x in classes1
3531 ):
3532 if (
3533 (num_latin >= num_greek + 2 or num_greek == 0)
3534 and classes1.count("OK") < len(classes1)
3535 and classes1.count("Nd") < len(classes1)
3536 ):
3537 return "romanization"
3538 # Otherwise it is something else, such as hanji version of the word
3539 return "other"
3542def remove_text_in_parentheses(text: str) -> str:
3543 parentheses = 0
3544 new_text = ""
3545 for c in text:
3546 if c == "(":
3547 parentheses += 1
3548 elif c == ")":
3549 parentheses -= 1
3550 elif parentheses == 0:
3551 new_text += c
3552 return new_text
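
# Illustrative sketch, not part of the original module:
# remove_text_in_parentheses() drops parenthesized spans (including nested
# ones) but keeps the surrounding characters, so whitespace around a removed
# span remains.
def _demo_remove_text_in_parentheses() -> None:
    assert remove_text_in_parentheses("dog (plural (rare) dogs)") == "dog "
    assert remove_text_in_parentheses("a (b) c") == "a  c"  # note double space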