Coverage for src/wiktextract/extractor/en/form_descriptions.py: 76%
1327 statements
1# Code for parsing linguistic form descriptions and tags for word senses
2# (both the word entry head - initial part and parenthesized parts -
3# and tags at the beginning of word senses)
4#
5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
7import functools
8import re
9import unicodedata
10from typing import (
11 Any,
12 Literal,
13 Optional,
14 Sequence,
15 Union,
16)
18import Levenshtein
19from nltk import TweetTokenizer # type:ignore[import-untyped]
21from ...datautils import data_append, data_extend, split_at_comma_semi
22from ...tags import (
23 alt_of_tags,
24 form_of_tags,
25 head_final_bantu_langs,
26 head_final_bantu_map,
27 head_final_numeric_langs,
28 head_final_other_langs,
29 head_final_other_map,
30 head_final_semitic_langs,
31 head_final_semitic_map,
32 uppercase_tags,
33 valid_tags,
34 xlat_descs_map,
35 xlat_head_map,
36 xlat_tags_map,
37)
38from ...topics import topic_generalize_map, valid_topics
39from ...wxr_context import WiktextractContext
40from .english_words import (
41 english_words,
42 not_english_words,
43 potentially_english_words,
44)
45from .form_descriptions_known_firsts import known_firsts
46from .taxondata import known_species
47from .type_utils import (
48 AltOf,
49 FormData,
50 LinkageData,
51 SenseData,
52 SoundData,
53 TranslationData,
54 WordData,
55)
57# Tokenizer for classify_desc()
58tokenizer = TweetTokenizer()
60# These are ignored as the value of a related form in form head.
61IGNORED_RELATED: set[str] = set(
62 [
63 "-",
64 "־",
65 "᠆",
66 "‐",
67 "‑",
68 "‒",
69 "–",
70 "—",
71 "―",
72 "−",
73 "⸺",
74 "⸻",
75 "﹘",
76 "﹣",
77 "-",
78 "?",
79 "(none)",
80 ]
81)
84# First words of unicodedata.name() that indicate scripts that cannot be
85# accepted in romanizations or english (i.e., should be considered "other"
86# in classify_desc()).
87non_latin_scripts: list[str] = [
88 "ADLAM",
89 "ARABIC",
90 "ARABIC-INDIC",
91 "ARMENIAN",
92 "BALINESE",
93 "BENGALI",
94 "BRAHMI",
95 "BRAILLE",
96 "CANADIAN",
97 "CHAKMA",
98 "CHAM",
99 "CHEROKEE",
100 "CJK",
101 "COPTIC",
102 "COUNTING ROD",
103 "CUNEIFORM",
104 "CYRILLIC",
105 "DOUBLE-STRUCK",
106 "EGYPTIAN",
107 "ETHIOPIC",
108 "EXTENDED ARABIC-INDIC",
109 "GEORGIAN",
110 "GLAGOLITIC",
111 "GOTHIC",
112 "GREEK",
113 "GUJARATI",
114 "GURMUKHI",
115 "HANGUL",
116 "HANIFI ROHINGYA",
117 "HEBREW",
118 "HIRAGANA",
119 "JAVANESE",
120 "KANNADA",
121 "KATAKANA",
122 "KAYAH LI",
123 "KHMER",
124 "KHUDAWADI",
125 "LAO",
126 "LEPCHA",
127 "LIMBU",
128 "MALAYALAM",
129 "MEETEI",
130 "MYANMAR",
131 "NEW TAI LUE",
132 "NKO",
133 "OL CHIKI",
134 "OLD PERSIAN",
135 "OLD SOUTH ARABIAN",
136 "ORIYA",
137 "OSMANYA",
138 "PHOENICIAN",
139 "SAURASHTRA",
140 "SHARADA",
141 "SINHALA",
142 "SUNDANESE",
143 "SYLOTI",
144 "TAI THAM",
145 "TAKRI",
146 "TAMIL",
147 "TELUGU",
148 "THAANA",
149 "THAI",
150 "TIBETAN",
151 "TIFINAGH",
152 "TIRHUTA",
153 "UGARITIC",
154 "WARANG CITI",
155 "YI",
156]
157non_latin_scripts_re = re.compile(
158 r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b"
159)
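# For example, unicodedata.name("п") is "CYRILLIC SMALL LETTER PE"; the word
# "CYRILLIC" matches this regexp, so classify_desc() (defined later in this
# module) treats text containing such a character as "other" rather than as a
# romanization or as English.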
161# Sanity check xlat_head_map values
162for k, v in xlat_head_map.items():
163 if v.startswith("?"):
164 v = v[1:]
165 for tag in v.split():
166        if tag not in valid_tags:
167 print(
168 "WARNING: xlat_head_map[{}] contains unrecognized tag {}".format(
169 k, tag
170 )
171 )
173# Regexp for finding nested translations from translation items (these are
174# used in, e.g., year/English/Translations/Arabic). This is actually used
175# in page.py.
176nested_translations_re = re.compile(
177 r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format(
178 "|".join(
179 re.escape(x.removeprefix("?"))
180 for x in sorted(xlat_head_map.values(), key=len, reverse=True)
181 if x and not x.startswith("class-")
182 )
183 )
184)
186# Regexp that matches head tag specifiers. Used to match tags from end of
187# translations and linkages
188head_final_re_text = r"( -)?( ({}))+".format(
189 "|".join(
190 re.escape(x)
191 for x in
192 # The sort is to put longer ones first, preferring them in
193 # the regexp match
194 sorted(xlat_head_map.keys(), key=len, reverse=True)
195 )
196)
197head_final_re = re.compile(head_final_re_text + "$")
199# Regexp used to match head tag specifiers at end of a form for certain
200# Bantu languages (particularly Swahili and similar languages).
201head_final_bantu_re_text = r" ({})".format(
202 "|".join(re.escape(x) for x in head_final_bantu_map.keys())
203)
204head_final_bantu_re = re.compile(head_final_bantu_re_text + "$")
206# Regexp used to match head tag specifiers at end of a form for certain
207# Semitic languages (particularly Arabic and similar languages).
208head_final_semitic_re_text = r" ({})".format(
209 "|".join(re.escape(x) for x in head_final_semitic_map.keys())
210)
211head_final_semitic_re = re.compile(head_final_semitic_re_text + "$")
213# Regexp used to match head tag specifiers at end of a form for certain
214# other languages (e.g., Lithuanian, Finnish, French).
215head_final_other_re_text = r" ({})".format(
216 "|".join(re.escape(x) for x in head_final_other_map.keys())
217)
218head_final_other_re = re.compile(head_final_other_re_text + "$")
220# Regexp for splitting heads. See parse_word_head().
221head_split_re_text = (
222 "("
223 + head_final_re_text
224 + "|"
225 + head_final_bantu_re_text
226 + "|"
227 + head_final_semitic_re_text
228 + "|"
229 + head_final_other_re_text
230 + ")?( or |[,;]+)"
231)
232head_split_re = re.compile(head_split_re_text)
233head_split_re_parens = 0
234for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text):
235 head_split_re_parens += m.group(0).count("(")
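# head_split_re_parens ends up counting the unescaped "(" characters, i.e. the
# capturing groups, in head_split_re_text; presumably this lets code that
# splits on head_split_re skip over the extra group entries that re.split()
# interleaves into its result.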
237# Parenthesized parts that are ignored in translations
238tr_ignored_parens: set[str] = set(
239 [
240 "please verify",
241 "(please verify)",
242 "transliteration needed",
243 "(transliteration needed)",
244 "in words with back vowel harmony",
245 "(in words with back vowel harmony)",
246 "in words with front vowel harmony",
247 "(in words with front vowel harmony)",
248 "see below",
249 "see usage notes below",
250 ]
251)
252tr_ignored_parens_re = re.compile(
253 r"^("
254 + "|".join(re.escape(x) for x in tr_ignored_parens)
255 + ")$"
256 + r"|^(Can we clean up|Can we verify|for other meanings see "
257 r"lit\. )"
258)
260# Translations that are ignored
261ignored_translations: set[str] = set(
262 [
263 "[script needed]",
264 "please add this translation if you can",
265 ]
266)
268# Put english text into the "note" field in a translation if it contains one
269# of these words
270tr_note_re = re.compile(
271 r"(\b(article|definite|indefinite|superlative|comparative|pattern|"
272 r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|"
273 r"postposition|postp|action|actions|articles|"
274 r"adverb|adverbs|noun|nouns|verb|verbs|before|"
275 r"after|placed|prefix|suffix|used with|translated|"
276 r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|"
277 r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|"
278 r"conjugation|declension|class|category|plural|singular|positive|"
279 r"seldom used|formal|informal|familiar|unspoken|spoken|written|"
280 r"indicative|progressive|conditional|potential|"
281 r"accusative|adessive|inessive|superessive|elative|allative|"
282 r"dialect|dialects|object|subject|predicate|movies|recommended|language|"
283 r"locative|continuous|simple|continuousness|gerund|subjunctive|"
284 r"periphrastically|no equivalent|not used|not always used|"
285 r"used only with|not applicable|use the|signifying|wordplay|pronounced|"
286 r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|"
287 r"may be replaced|stricter sense|for nonhumans|"
288 r"sense:|used:|in full:|informally used|followed by|"
289 r"not restricted to|pertaining to|or optionally with|are optional|"
290 r"in conjunction with|in compounds|depending on the relationship|"
291 r"person addressed|one person|multiple persons|may be replaced with|"
292 r"optionally completed with|in the phrase|in response to|"
293 r"before a|before an|preceded by|verbs ending|very common|after a verb|"
294 r"with verb|with uncountable|with the objects|with stative|"
295 r"can be replaced by|often after|used before|used after|"
296 r"used in|clipping of|spoken|somewhat|capitalized|"
297 r"short form|shortening of|shortened form|initialism of|"
298 r"said to|rare:|rarer also|is rarer|negatively connoted|"
299 r"previously mentioned|uncountable noun|countable noun|"
300 r"countable nouns|uncountable nouns|"
301 r"with predicative|with -|with imperfect|with a negated|"
302 r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|"
303 r'"|'
304 r"general term|after a vowel|before a vowel|"
305 r"form|regular|irregular|alternative)"
306 r")($|[) ])|^("
307 # Following are only matched at the beginning of the string
308 r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:|e\.g\.,|cf\.|compare|such as|"
309 r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|"
310 r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|"
311 r"mainly|from|for|also|also:|acronym|"
312 r"\+|with) "
313)
314# \b does not work at the end???
316# Related forms matching this regexp will be considered suspicious if the
317# page title does not also match one of these.
318suspicious_related_re = re.compile(
319 r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)"
320 r"|[][:=<>&#*|]"
321 r"| \d+$"
322)
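# For example, a related form consisting only of gender/number abbreviations
# such as "f" or "pl 3", or one containing wiki markup characters, matches
# this. French translations containing the word "or" ("gold") also trip the
# check, which is why several of them are whitelisted in ok_suspicious_forms
# below.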
324# Word forms (head forms, translations, etc) that will be considered ok and
325# silently accepted even if they would otherwise trigger a suspicious
326# form warning.
327ok_suspicious_forms: set[str] = set(
328 [
329 "but en or", # "golden goal"/English/Tr/French
330 "cœur en or", # "heart of gold"/Eng/Tr/French
331 "en or", # golden/Eng/Tr/French
332 "men du", # jet/Etym2/Noun/Tr/Cornish
333 "parachute en or", # "golden parachute"/Eng/Tr/French
334 "vieil or", # "old gold"/Eng/Tr/French
335 # "all that glitters is not gold"/Eng/Tr/French
336 "tout ce qui brille n’est pas or",
337 "μη αποκλειστικό or", # inclusive or/Eng/Tr/Greek
338 "period or full stop",
339 ]
340)
343# Replacements to be done in classify_desc before tokenizing. This is a
344# workaround for shortcomings in TweetTokenizer.
345tokenizer_fixup_map = {
346 r"a.m.": "AM",
347 r"p.m.": "PM",
348}
349tokenizer_fixup_re = re.compile(
350 r"\b("
351 + "|".join(
352 re.escape(x)
353 for x in sorted(
354 tokenizer_fixup_map.keys(), key=lambda x: len(x), reverse=True
355 )
356 )
357 + r")"
358)
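# Per the comment above, occurrences of "a.m." / "p.m." are rewritten to
# "AM" / "PM" in classify_desc() before tokenization, as a workaround for how
# TweetTokenizer handles the dotted forms.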
360# Unknown tags starting with these words will be silently ignored.
361ignored_unknown_starts: set[str] = set(
362 [
363 "originally",
364 "e.g.",
365 "c.f.",
366 "supplanted by",
367 "supplied by",
368 ]
369)
371ignored_unknown_starts_re = re.compile(
372 r"^("
373 + "|".join(
374 re.escape(x)
375 for x in sorted(ignored_unknown_starts, key=lambda x: -len(x))
376 )
377 + ") "
378)
380# If an unknown sequence starts with one of these, it will continue as an
381# unknown sequence until the end, unless it turns out to have a replacement.
382allowed_unknown_starts: set[str] = set(
383 [
384 "Relating",
385 "accompanied",
386 "added",
387 "after",
388 "answering",
389 "as",
390 "based",
391 "before",
392 "conjugated",
393 "conjunction",
394 "construed",
395 "especially",
396 "expression:",
397 "figurative:",
398 "followed",
399 "for",
400 "forms",
401 "from",
402 "governs",
403 "in",
404 "indicating",
405 "modifying",
406 "normally",
407 "not",
408 "of",
409 "preceding",
410 "prefixed",
411 "referring",
412 "relating",
413 "revived",
414 "said",
415 "since",
416 "takes",
417 "used",
418 "with",
419 "With",
420 "without",
421 ]
422)
423# Allow the ignored unknown starts without complaining
424allowed_unknown_starts.update(ignored_unknown_starts)
426# Full unknown tags that will be ignored in decode_tags()
427# XXX this is unused, ask Tatu where the contents are now
428ignored_unknown_tags: set[str] = set([])
430# Head endings that are mapped to tags
431head_end_map = {
432 " 1st conj.": "conjugation-1",
433 " 2nd conj.": "conjugation-2",
434 " 3rd conj.": "conjugation-3",
435 " 4th conj.": "conjugation-4",
436 " 5th conj.": "conjugation-5",
437 " 6th conj.": "conjugation-6",
438 " 7th conj.": "conjugation-7",
439}
440head_end_re = re.compile(
441 r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$"
442)
445# Dictionary of language-specific parenthesized head part starts that
446# either introduce new tags or modify previous tags. The value for each
447# language is a dictionary that maps the first word of the head part to
448# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous
449# tags or a space-separated string of tags to remove, and ``add_tags`` should
450# be a string of tags to add.
451lang_specific_head_map: dict[
452 str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]]
453] = {
454 "Danish": {
455 # prefix: (rem_tags space separate string/True, add_tags s-sep str)
456 "c": ("neuter", "common-gender"),
457 "n": ("common-gender", "neuter"),
458 "pl": ("singular neuter common-gender", "plural"),
459 "sg": ("plural neuter common-gender", "singular"),
460 },
461}
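# For example, with the Danish mapping above: a parenthesized head part whose
# first word is "pl" removes any previously collected "singular", "neuter" and
# "common-gender" tags and adds "plural"; a rem_tags value of True would mean
# "remove all previous tags".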
464# Regular expression used to strip additional stuff from the end of alt_of and
465# form_of.
466alt_of_form_of_clean_re = re.compile(
467 r"(?s)("
468 + "|".join(
469 [
470 r":",
471 r'[“"]',
472 r";",
473 r" \(",
474 r" - ",
475 r" ־ ",
476 r" ᠆ ",
477 r" ‐ ",
478 r" ‑ ",
479 r" ‒ ",
480 r" – ",
481 r" — ",
482 r" ― ",
483 r" − ",
484 r" ⸺ ",
485 r" ⸻ ",
486 r" ﹘ ",
487 r" ﹣ ",
488 r" - ",
489 r" \+ ",
490 r" \(with ",
491 r" with -ra/-re",
492 r"\. Used ",
493 r"\. Also ",
494 r"\. Since ",
495 r"\. A ",
496 r"\.\. A ",
497 r"\. An ",
498 r"\.\. An ",
499 r"\. an ",
500 r"\. The ",
501 r"\. Spanish ",
502 r"\. Language ",
503 r"\. former name of ",
504 r"\. AIM",
505 r"\. OT",
506 r"\. Not ",
507 r"\. Now ",
508 r"\. Nowadays ",
509 r"\. Early ",
510 r"\. ASEAN",
511 r"\. UN",
512 r"\. IMF",
513 r"\. WHO",
514 r"\. WIPO",
515 r"\. AC",
516 r"\. DC",
517 r"\. DNA",
518 r"\. RNA",
519 r"\. SOB",
520 r"\. IMO",
521 r"\. Behavior",
522 r"\. Income ",
523 r"\. More ",
524 r"\. Most ",
525 r"\. Only ",
526 r"\. Also ",
527 r"\. From ",
528 r"\. Of ",
529 r"\.\. Of ",
530 r"\. To ",
531 r"\. For ",
532 r"\. If ",
533 r"\. Praenominal ",
534 r"\. This ",
535 r"\. Replaced ",
536 r"\. CHCS is the ",
537 r"\. Equivalent ",
538 r"\. Initialism ",
539 r"\. Note ",
540 r"\. Alternative ",
541 r"\. Compare ",
542 r"\. Cf\. ",
543 r"\. Comparable ",
544 r"\. Involves ",
545 r"\. Sometimes ",
546 r"\. Commonly ",
547 r"\. Often ",
548 r"\. Typically ",
549 r"\. Possibly ",
550 r"\. Although ",
551 r"\. Rare ",
552 r"\. Instead ",
553 r"\. Integrated ",
554 r"\. Distinguished ",
555 r"\. Given ",
556 r"\. Found ",
557 r"\. Was ",
558 r"\. In ",
559 r"\. It ",
560 r"\.\. It ",
561 r"\. One ",
562 r"\. Any ",
563 r"\. They ",
564 r"\. Members ",
565 r"\. Each ",
566 r"\. Original ",
567 r"\. Especially ",
568 r"\. Usually ",
569 r"\. Known ",
570 r"\.\. Known ",
571 r"\. See ",
572 r"\. see ",
573 r"\. target was not ",
574 r"\. Popular ",
575 r"\. Pedantic ",
576 r"\. Positive ",
577 r"\. Society ",
578 r"\. Plan ",
579 r"\. Environmentally ",
580 r"\. Affording ",
581 r"\. Encompasses ",
582 r"\. Expresses ",
583 r"\. Indicates ",
584 r"\. Text ",
585 r"\. Large ",
586 r"\. Sub-sorting ",
587 r"\. Sax",
588 r"\. First-person ",
589 r"\. Second-person ",
590 r"\. Third-person ",
591 r"\. 1st ",
592 r"\. 2nd ",
593 r"\. 3rd ",
594 r"\. Term ",
595 r"\. Northeastern ",
596 r"\. Northwestern ",
597 r"\. Southeast ",
598 r"\. Egyptian ",
599 r"\. English ",
600 r"\. Cape Province was split into ",
601 r"\. Pañcat",
602 r"\. of the ",
603 r"\. is ",
604 r"\. after ",
605 r"\. or ",
606 r"\. chromed",
607 r"\. percussion",
608 r"\. with his ",
609 r"\. a\.k\.a\. ",
610 r"\. comparative form ",
611 r"\. singular ",
612 r"\. plural ",
613 r"\. present ",
614 r"\. his ",
615 r"\. her ",
616 r"\. equivalent ",
617 r"\. measuring ",
618 r"\. used in ",
619 r"\. cutely ",
620 r"\. Protects",
621 r'\. "',
622 r"\.^",
623 r"\. \+ ",
624 r"\., ",
625 r". — ",
626 r", a ",
627 r", an ",
628 r", the ",
629 r", obsolete ",
630 r", possessed", # 'd/English
631 r", imitating", # 1/English
632 r", derived from",
633 r", called ",
634 r", especially ",
635 r", slang for ",
636 r" corresponding to ",
637 r" equivalent to ",
638 r" popularized by ",
639 r" denoting ",
640 r" in its various senses\.",
641 r" used by ",
642 r" but not for ",
643 r" since ",
644 r" i\.e\. ",
645 r" i\. e\. ",
646 r" e\.g\. ",
647 r" eg\. ",
648 r" etc\. ",
649 r"\[http",
650 r" — used as ",
651 r" by K\. Forsyth ",
652 r" by J\. R\. Allen ",
653 r" by S\. Ferguson ",
654 r" by G\. Donaldson ",
655 r" May refer to ",
656 r" An area or region ",
657 ]
658 )
659 + r").*$"
660)
663class ValidNode:
664 """Node in the valid_sequences tree. Each node is part of a chain
665 or chains that form sequences built out of keys in key->tags
666 maps like xlat_tags, etc. The ValidNode's 'word' is the key
667    by which it is referred to in the root dict or a `children` dict,
668 `end` marks that the node is the end-terminus of a sequence (but
669 it can still continue if the sequence is shared by the start of
670 other sequences: "nominative$" and "nominative plural$" for example),
671 `tags` and `topics` are the dicts containing tag and topic strings
672 for terminal nodes (end==True)."""
674 __slots__ = (
675 "end",
676 "tags",
677 "topics",
678 "children",
679 )
681 def __init__(
682 self,
683 end=False,
684 tags: Optional[list[str]] = None,
685 topics: Optional[list[str]] = None,
686 children: Optional[dict[str, "ValidNode"]] = None,
687 ) -> None:
688 self.end = end
689 self.tags: list[str] = tags or []
690 self.topics: list[str] = topics or []
691 self.children: dict[str, "ValidNode"] = children or {}
694def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None:
695 """Helper function for building trees of valid tags/sequences during
696 initialization."""
697 assert isinstance(tree, ValidNode)
698 assert isinstance(desc, str)
699 assert v is None or isinstance(v, str)
700 node = tree
702 # Build the tree structure: each node has children nodes
703 # whose names are denoted by their dict key.
704 for w in desc.split(" "):
705 if w in node.children:
706 node = node.children[w]
707 else:
708 new_node = ValidNode()
709 node.children[w] = new_node
710 node = new_node
711 if not node.end:
712 node.end = True
713 if not v:
714 return None # Terminate early because there are no tags
716 tagslist = []
717 topicslist = []
718 for vv in v.split():
719 if vv in valid_tags:
720 tagslist.append(vv)
721        elif vv in valid_topics:
722 topicslist.append(vv)
723 else:
724 print(
725 "WARNING: tag/topic {!r} maps to unknown {!r}".format(desc, vv)
726 )
727 topics = " ".join(topicslist)
728 tags = " ".join(tagslist)
729 # Changed to "_tags" and "_topics" to avoid possible key-collisions.
730 if topics:
731 node.topics.extend([topics])
732 if tags:
733 node.tags.extend([tags])
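# A small illustration of the resulting tree shape (comment only, assuming
# both "nominative" and "plural" are in valid_tags):
#     add_to_valid_tree(tree, "nominative", "nominative")
#     add_to_valid_tree(tree, "nominative plural", "nominative plural")
# leaves tree.children["nominative"] as an end node with tags == ["nominative"],
# and its children["plural"] as a further end node with
# tags == ["nominative plural"] (each .tags entry is a single space-joined
# string, per the extend([tags]) call above).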
736def add_to_valid_tree1(
737 tree: ValidNode,
738 k: str,
739 v: Union[list[str], tuple[str, ...], str],
740 valid_values: Union[set[str], dict[str, Any]],
741) -> list[str]:
742 assert isinstance(tree, ValidNode)
743 assert isinstance(k, str)
744 assert v is None or isinstance(v, (list, tuple, str))
745 assert isinstance(valid_values, (set, dict))
746    if not v:
747 add_to_valid_tree(valid_sequences, k, None)
748 return []
749 elif isinstance(v, str):
750 v = [v]
751 q = []
752 for vv in v:
753 assert isinstance(vv, str)
754 add_to_valid_tree(valid_sequences, k, vv)
755 vvs = vv.split()
756 for x in vvs:
757 q.append(x)
758 # return each individual tag
759 return q
762def add_to_valid_tree_mapping(
763 tree: ValidNode,
764 mapping: Union[dict[str, Union[list[str], str]], dict[str, str]],
765 valid_values: Union[set[str], dict[str, Any]],
766 recurse: bool,
767) -> None:
768 assert isinstance(tree, ValidNode)
769 assert isinstance(mapping, dict)
770 assert isinstance(valid_values, (set, dict))
771 assert recurse in (True, False)
772 for k, v in mapping.items():
773 assert isinstance(k, str)
774 assert isinstance(v, (list, str))
775 if isinstance(v, str):
776 q = add_to_valid_tree1(tree, k, [v], valid_values)
777 else:
778 q = add_to_valid_tree1(tree, k, v, valid_values)
779 if recurse:
780 visited = set()
781 while q:
782 v = q.pop()
783 if v in visited:
784 continue
785 visited.add(v)
786 if v not in mapping:
787 continue
788 vv = mapping[v]
789 qq = add_to_valid_tree1(tree, k, vv, valid_values)
790 q.extend(qq)
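# Note on recurse=True: the mapping is applied transitively. If key k maps to
# a tag v and v is itself a key of the same mapping, then mapping[v] is also
# added under k, and so on. This is what lets topic_generalize_map (used with
# recurse=True below) attach the more general topics to each specific topic.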
793# Tree of sequences considered to be tags (includes sequences that are
794# mapped to something that becomes one or more valid tags)
795valid_sequences = ValidNode()
796sequences_with_slashes: set[str] = set()
797for tag in valid_tags:
798    # The basic tags used in our tag system; some are a bit weird, but it is
799    # easier to implement this with 'false' positives than to filter out stuff
800    # no one else uses.
801 if "/" in tag:
802 sequences_with_slashes.add(tag)
803 add_to_valid_tree(valid_sequences, tag, tag)
804for tag in uppercase_tags:
805 hyphenated = re.sub(r"\s+", "-", tag)
806    if hyphenated in valid_tags:
807 print(
808 "DUPLICATE TAG: {} (from uppercase tag {!r})".format(
809 hyphenated, tag
810 )
811 )
812 assert hyphenated not in valid_tags
813 # Might as well, while we're here: Add hyphenated location tag.
814 valid_tags[hyphenated] = "dialect"
815 add_to_valid_tree(valid_sequences, hyphenated, hyphenated)
816for tag in uppercase_tags:
817 hyphenated = re.sub(r"\s+", "-", tag)
818 # XXX Move to above loop? Or is this here for readability?
819 if "/" in tag:
820 sequences_with_slashes.add(tag)
821 add_to_valid_tree(valid_sequences, tag, hyphenated)
822# xlat_tags_map!
823add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False)
824for k in xlat_tags_map:
825 if "/" in k:
826 sequences_with_slashes.add(k)
827# Add topics to the same table, with all generalized topics also added
828for topic in valid_topics:
829 assert " " not in topic
830    if "/" in topic:
831 sequences_with_slashes.add(topic)
832 add_to_valid_tree(valid_sequences, topic, topic)
833# Let each original topic value stand alone. These are not generally on
834# valid_topics. We add the original topics with spaces replaced by hyphens.
835for topic in topic_generalize_map.keys():
836 hyphenated = topic.replace(" ", "-")
837 valid_topics.add(hyphenated)
838    if "/" in topic:
839        sequences_with_slashes.add(topic)
840 add_to_valid_tree(valid_sequences, topic, hyphenated)
841# Add canonicalized/generalized topic values
842add_to_valid_tree_mapping(
843 valid_sequences, topic_generalize_map, valid_topics, True
844)
846# Regex used to divide a decode candidate into parts that shouldn't
847# have their slashes turned into spaces
848slashes_re = re.compile(
849 r"(" + "|".join((re.escape(s) for s in sequences_with_slashes)) + r")"
850)
852# Regexp used to find "words" from word heads and linguistic descriptions
853word_pattern = (
854 r"[^ ,;()\u200e]+|"
855 r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|"
856 r"[\u2800-\u28ff]|" # Braille characters
857 r"\(([^()]|\([^()]*\))*\)"
858)
860word_re_global = re.compile(word_pattern)
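# Roughly, this tokenizes a head into words: "word (sense) another" yields
# "word", "(sense)" and "another"; a parenthesized prefix glued to a word,
# e.g. "(un)done", stays a single token, and each Braille cell is its own
# token.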
863def distw(titleparts: Sequence[str], word: str) -> float:
864 """Computes how distinct ``word`` is from the most similar word in
865 ``titleparts``. Returns 1 if words completely distinct, 0 if
866 identical, or otherwise something in between."""
867 assert isinstance(titleparts, (list, tuple))
868 assert isinstance(word, str)
869 w = min(
870 Levenshtein.distance(word, tw) / max(len(tw), len(word))
871 for tw in titleparts
872 )
873 return w
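# Illustrative values (comment only; Levenshtein.distance("best", "test") is 1
# and both words have four characters):
#     distw(["test"], "test")  -> 0.0    (identical)
#     distw(["test"], "best")  -> 0.25   (one edit / four characters)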
876def map_with(
877 ht: Union[dict[str, Union[str, list[str]]], dict[str, str]],
878 lst: Sequence[str],
879) -> list[str]:
880 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or
881 more alternatives each, and returns a combined list of alternatives."""
882 assert isinstance(ht, dict)
883 assert isinstance(lst, (list, tuple))
884 ret = []
885 for x in lst:
886 assert isinstance(x, str)
887 x = x.strip()
888 x = ht.get(x, x)
889        if isinstance(x, str):
890            if x:
891 ret.append(x)
892 elif isinstance(x, (list, tuple)):
893 ret.extend(x)
894 else:
895 raise RuntimeError("map_with unexpected value: {!r}".format(x))
896 return ret
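# For example (the mapping here is made up for illustration):
#     map_with({"m": ["masculine"], "f": "feminine"}, ["m", "f", "x"])
#     -> ["masculine", "feminine", "x"]
# i.e. unmapped items pass through unchanged.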
899TagList = list[str]
900PosPathStep = tuple[int, TagList, TagList]
903def check_unknown(
904 from_i: int,
905 to_i: int,
906 i: int,
907 wordlst: Sequence[str],
908 allow_any: bool,
909 no_unknown_starts: bool,
910) -> list[PosPathStep]:
911 """Check if the current section from_i->to_i is actually unknown
912 or if it needs some special handling. We already presupposed that
913 this is UNKNOWN; this is just called to see what *kind* of UNKNOWN."""
914 assert isinstance(to_i, int)
915 assert isinstance(from_i, int)
916 assert isinstance(i, int)
917 # Adds unknown tag if needed. Returns new last_i
918 # print("check_unknown to_i={} from_i={} i={}"
919 # .format(to_i, from_i, i))
920 if from_i >= to_i:
921 return []
922 words = wordlst[from_i:to_i]
923 tag = " ".join(words)
924 assert tag
925 # print(f"{tag=}")
926 if re.match(ignored_unknown_starts_re, tag):
927 # Tags with this start are to be ignored
928 return [(from_i, ["UNKNOWN"], [])]
929    if tag in ignored_unknown_tags:
930 return [] # One of the tags listed as to be ignored
931 if tag in ("and", "or"):
932 return []
933 if (
934 not allow_any
935 and not words[0].startswith("~")
936 and (
937 no_unknown_starts
938 or words[0] not in allowed_unknown_starts
939 or len(words) <= 1
940 )
941 ):
942 # print("ERR allow_any={} words={}"
943 # .format(allow_any, words))
944 return [
945 (from_i, ["UNKNOWN"], ["error-unknown-tag"])
946 ] # Add ``tag`` here to include
947 else:
948 return [(from_i, ["UNKNOWN"], [tag])]
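# Summary of the return shapes above: an ignored start yields an UNKNOWN step
# with no payload; text listed as ignorable (or a bare "and"/"or") yields no
# step at all; disallowed unknown text yields an UNKNOWN step carrying
# "error-unknown-tag"; otherwise the raw text itself is carried along in the
# last slot of the step.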
951def add_new1(
952 node: ValidNode,
953 i: int,
954 start_i: int,
955 last_i: int,
956 new_paths: list[list[PosPathStep]],
957 new_nodes: list[tuple[ValidNode, int, int]],
958 pos_paths: list[list[list[PosPathStep]]],
959 wordlst: list[str],
960 allow_any: bool,
961 no_unknown_starts: bool,
962 max_last_i: int,
963) -> int:
964 assert isinstance(new_paths, list)
965 # print("add_new: start_i={} last_i={}".format(start_i, last_i))
966 # print("$ {} last_i={} start_i={}"
967 # .format(w, last_i, start_i))
968 max_last_i = max(max_last_i, last_i) # if last_i has grown
969 if (node, start_i, last_i) not in new_nodes:
970 new_nodes.append((node, start_i, last_i))
971 if node.end:
972 # We can see a terminal point in the search tree.
973 u = check_unknown(
974 last_i, start_i, i, wordlst, allow_any, no_unknown_starts
975 )
976 # Create new paths candidates based on different past possible
977 # paths; pos_path[last_i] contains possible paths, so add this
978 # new one at the beginning(?)
979 # The list comprehension inside the parens generates an iterable
980 # of lists, so this is .extend( [(last_i...)], [(last_i...)], ... )
981 # XXX: this is becoming impossible to annotate, nodes might
982 # need to become classed objects and not just dicts, or at least
983 # a TypedDict with a "children" node
984 new_paths.extend(
985 [(last_i, node.tags, node.topics)] + u + x
986 for x in pos_paths[last_i]
987 )
988 max_last_i = i + 1
989 return max_last_i
992@functools.lru_cache(maxsize=65536)
993def decode_tags(
994 src: str,
995 allow_any=False,
996 no_unknown_starts=False,
997) -> tuple[list[tuple[str, ...]], list[str]]:
998 tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts)
999 # print(f"decode_tags: {src=}, {tagsets=}")
1001 # Insert retry-code here that modifies the text source
1002 if (
1003 any(s.startswith("error-") for tagset in tagsets for s in tagset)
1004 # I hate Python's *nested* list comprehension syntax ^
1005 or any(s.startswith("error-") for s in topics)
1006 ):
1007 new_tagsets: list[tuple[str, ...]] = []
1008 new_topics: list[str] = []
1010 if "/" in src:
1011 # slashes_re contains valid key entries with slashes; we're going
1012 # to skip them by splitting the string and skipping handling every
1013 # second entry, which contains the splitting group like "masculine/
1014 # feminine" style keys.
1015 split_parts = re.split(slashes_re, src)
1016 new_parts: list[str] = []
1017 if len(split_parts) > 1:
1018 for i, s in enumerate(split_parts):
1019 if i % 2 == 0:
1020 new_parts.append(s.replace("/", " "))
1021 else:
1022 new_parts.append(s)
1023 new_src = "".join(new_parts)
1024 else:
1025 new_src = src
1026 new_tagsets, new_topics = decode_tags1(
1027 new_src, allow_any, no_unknown_starts
1028 )
1029 elif " or " in src or " and " in src:
1030 # Annoying kludge.
1031 new_src = src.replace(" and ", " ")
1032 new_src = new_src.replace(" or ", " ")
1033 new_tagsets, new_topics = decode_tags1(
1034 new_src, allow_any, no_unknown_starts
1035 )
1036 # print(f"{new_tagsets=}")
1038 if new_tagsets or new_topics:
1039 old_errors = sum(
1040 1 for tagset in tagsets for s in tagset if s.startswith("error")
1041 )
1042 old_errors += sum(1 for s in topics if s.startswith("error"))
1043 new_errors = sum(
1044 1
1045 for new_tagset in new_tagsets
1046 for s in new_tagset
1047 if s.startswith("error")
1048 )
1049 new_errors += sum(1 for s in new_topics if s.startswith("error"))
1051            if new_errors <= old_errors:
1052 return new_tagsets, new_topics
1054 return tagsets, topics
1057def decode_tags1(
1058 src: str,
1059 allow_any=False,
1060 no_unknown_starts=False,
1061) -> tuple[list[tuple[str, ...]], list[str]]:
1062 """Decodes tags, doing some canonicalizations. This returns a list of
1063 lists of tags and a list of topics."""
1064 assert isinstance(src, str)
1066 # print("decode_tags: src={!r}".format(src))
1068 pos_paths: list[list[list[PosPathStep]]] = [[[]]]
1069 wordlst: list[str] = []
1070 max_last_i = 0 # pre-initialized here so that it can be used as a ref
1072 add_new = functools.partial(
1073 add_new1, # pre-set parameters and references for function
1074 pos_paths=pos_paths,
1075 wordlst=wordlst,
1076 allow_any=allow_any,
1077 no_unknown_starts=no_unknown_starts,
1078 max_last_i=max_last_i,
1079 )
1080 # First split the tags at commas and semicolons. Their significance is that
1081 # a multi-word sequence cannot continue across them.
1082 parts = split_at_comma_semi(src, extra=[";", ":"])
1084 for part in parts:
1085 max_last_i = len(wordlst) # "how far have we gone?"
1086 lst1 = part.split()
1087 if not lst1:
1088 continue
1089 wordlst.extend(lst1)
1090 cur_nodes: list[tuple[ValidNode, int, int]] = [] # Currently seen
1091 for w in lst1:
1092 i = len(pos_paths) - 1
1093 new_nodes: list[tuple[ValidNode, int, int]] = []
1094 # replacement nodes for next loop
1095 new_paths: list[list[PosPathStep]] = []
1096 # print("ITER i={} w={} max_last_i={} wordlst={}"
1097 # .format(i, w, max_last_i, wordlst))
1098 node: ValidNode
1099 start_i: int
1100 last_i: int
1101 for node, start_i, last_i in cur_nodes:
1102 # ValidNodes are part of a search tree that checks if a
1103 # phrase is found in xlat_tags_map and other text->tags dicts.
1104 if w in node.children:
1105 # the phrase continues down the tree
1106 # print("INC", w)
1107 max_last_i = add_new(
1108 node.children[w],
1109 i,
1110 start_i,
1111 last_i,
1112 new_paths,
1113 new_nodes,
1114 )
1115 if node.end:
1116 # we've hit an end point, the tags and topics have already
1117 # been gathered at some point, don't do anything with the
1118 # old stuff
1119 if w in valid_sequences.children:
1120 # This starts a *new* possible section
1121 max_last_i = add_new(
1122 valid_sequences.children[w], # root->
1123 i,
1124 i,
1125 i,
1126 new_paths,
1127 new_nodes,
1128 )
1129 if w not in node.children and not node.end:
1130 # print("w not in node and $: i={} last_i={} wordlst={}"
1131 # .format(i, last_i, wordlst))
1132 # If i == last_i == 0, for example (beginning)
1133 if (
1134 i == last_i
1135 or no_unknown_starts
1136 or wordlst[last_i] not in allowed_unknown_starts
1137 ):
1138 # print("NEW", w)
1139 if w in valid_sequences.children:
1140 # Start new sequences here
1141 max_last_i = add_new(
1142 valid_sequences.children[w],
1143 i,
1144 i,
1145 last_i,
1146 new_paths,
1147 new_nodes,
1148 )
1149 if not new_nodes:
1150 # This is run at the start when i == max_last_i == 0,
1151 # which is what populates the first node in new_nodes.
1152 # Some initial words cause the rest to be interpreted as unknown
1153 # print("not new nodes: i={} last_i={} wordlst={}"
1154 # .format(i, max_last_i, wordlst))
1155 if (
1156 i == max_last_i
1157 or no_unknown_starts
1158 or wordlst[max_last_i] not in allowed_unknown_starts
1159 ):
1160 # print("RECOVER w={} i={} max_last_i={} wordlst={}"
1161 # .format(w, i, max_last_i, wordlst))
1162 if w in valid_sequences.children:
1163 max_last_i = add_new(
1164 # new sequence from root
1165 valid_sequences.children[w],
1166 i,
1167 i,
1168 max_last_i,
1169 new_paths,
1170 new_nodes,
1171 )
1172 cur_nodes = new_nodes # Completely replace nodes!
1173 # 2023-08-18, fix to improve performance
1174 # Decode tags does a big search of the best-shortest matching
1175 # sequences of tags, but the original algorithm didn't have
1176 # any culling happen during operation, so in a case with
1177 # a lot of tags (for example, big blocks of text inserted
1178 # somewhere by mistake that is processed by decode_tags),
1179 # it would lead to exponential growth of new_paths contents.
1180 # This culling, using the same weighting algorithm code as
1181 # in the original is just applied to new_paths before it is
1182 # added to pos_paths. Basically it's "take the 10 best paths".
1183 # This *can* cause bugs if it gets stuck in a local minimum
1184 # or something, but this whole process is one-dimensional
1185 # and not that complex, so hopefully it works out...
1186 pw = []
1187 path: list[PosPathStep]
1188 for path in new_paths:
1189 weight = len(path)
1190 if any(x[1] == ["UNKNOWN"] for x in path):
1191 weight += 100 # Penalize unknown paths
1192 pw.append((weight, path))
1193 new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]]
1194 pos_paths.append(new_paths)
1196 # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}"
1197 # .format(max_last_i, len(wordlst), len(pos_paths)))
1199 if cur_nodes:
1200 # print("END HAVE_NODES")
1201 for node, start_i, last_i in cur_nodes:
1202 if node.end:
1203 # print("$ END start_i={} last_i={}"
1204 # .format(start_i, last_i))
1205 for path in pos_paths[start_i]:
1206 pos_paths[-1].append(
1207 [(last_i, node.tags, node.topics)] + path
1208 )
1209 else:
1210 # print("UNK END start_i={} last_i={} wordlst={}"
1211 # .format(start_i, last_i, wordlst))
1212 u = check_unknown(
1213 last_i,
1214 len(wordlst),
1215 len(wordlst),
1216 wordlst,
1217 allow_any,
1218 no_unknown_starts,
1219 )
1220 if pos_paths[start_i]:
1221 for path in pos_paths[start_i]:
1222 pos_paths[-1].append(u + path)
1223 else:
1224 pos_paths[-1].append(u)
1225 else:
1226 # Check for a final unknown tag
1227 # print("NO END NODES max_last_i={}".format(max_last_i))
1228 paths = pos_paths[max_last_i] or [[]]
1229 u = check_unknown(
1230 max_last_i,
1231 len(wordlst),
1232 len(wordlst),
1233 wordlst,
1234 allow_any,
1235 no_unknown_starts,
1236 )
1237 if u:
1238 # print("end max_last_i={}".format(max_last_i))
1239 for path in list(paths): # Copy in case it is the last pos
1240 pos_paths[-1].append(u + path)
1242 # import json
1243 # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True))
1245 if not pos_paths[-1]:
1246 # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src))
1247 return [], []
1249 # Find the best path
1250 pw = []
1251 for path in pos_paths[-1]:
1252 weight = len(path)
1253 if any(x[1] == ["UNKNOWN"] for x in path):
1254 weight += 100 # Penalize unknown paths
1255 pw.append((weight, path))
1256 path = min(pw)[1]
1258 # Convert the best path to tagsets and topics
1259 tagsets: list[list[str]] = [[]]
1260 topics: list[str] = []
1261 for i, tagspec, topicspec in path:
1262 if len(tagsets or "") > 16:
1263 # ctx.error("Too many tagsets! This is probably exponential",
1264 # sortid="form_descriptions/20230818")
1265 return [("error-unknown-tag", "error-exponential-tagsets")], []
1266 if tagspec == ["UNKNOWN"]:
1267 new_tagsets = []
1268 for x in tagsets:
1269 new_tagsets.append(x + topicspec)
1270 tagsets = new_tagsets
1271 continue
1272 if tagspec:
1273 new_tagsets = []
1274 for x in tagsets:
1275 for t in tagspec:
1276                    if t:
1277 new_tags = list(x)
1278 for tag in t.split():
1279 if tag not in new_tags:
1280 new_tags.append(tag)
1281 new_tagsets.append(new_tags)
1282 else:
1283 new_tagsets.append(x)
1284 tagsets = new_tagsets
1285 if topicspec:
1286 for t in topicspec:
1287 for topic in t.split():
1288 if topic not in topics:
1289 topics.append(topic)
1291 # print("unsorted tagsets:", tagsets)
1292 ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets))
1293 # topics = list(sorted(set(topics))) XXX tests expect not sorted
1294 # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics))
1295 # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST
1296 # of tags. Turning topics into a tuple breaks tests, turning the tuples
1297 # inside tagsets into lists breaks tests, I'm leaving them mismatched
1298 # for now. XXX
1299 return ret_tagsets, topics
1302def parse_head_final_tags(
1303 wxr: WiktextractContext, lang: str, form: str
1304) -> tuple[str, list[str]]:
1305 """Parses tags that are allowed at the end of a form head from the end
1306 of the form. This can also be used for parsing the final gender etc tags
1307 from translations and linkages."""
1308 assert isinstance(wxr, WiktextractContext)
1309 assert isinstance(lang, str) # Should be language that "form" is for
1310 assert isinstance(form, str)
1312 # print("parse_head_final_tags: lang={} form={!r}".format(lang, form))
1314 # Make sure there are no double spaces in the form as this code does not
1315 # handle them otherwise.
1316 form = re.sub(r"\s+", " ", form.strip())
1317 if not form:
1318 return form, []
1320 origform = form
1322 tags = []
1324 # If parsing for certain Bantu languages (e.g., Swahili), handle
1325 # some extra head-final tags first
1326 if lang in head_final_bantu_langs:
1327 m = re.search(head_final_bantu_re, form)
1328 if m is not None:
1329 tagkeys = m.group(1)
1330            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
1331 form = form[: m.start()]
1332 v = head_final_bantu_map[tagkeys]
1333                if v.startswith("?"):
1334 v = v[1:]
1335 wxr.wtp.debug(
1336 "suspicious suffix {!r} in language {}: {}".format(
1337 tagkeys, lang, origform
1338 ),
1339 sortid="form_descriptions/1028",
1340 )
1341 tags.extend(v.split())
1343 # If parsing for certain Semitic languages (e.g., Arabic), handle
1344 # some extra head-final tags first
1345 if lang in head_final_semitic_langs:
1346 m = re.search(head_final_semitic_re, form)
1347 if m is not None:
1348 tagkeys = m.group(1)
1349            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
1350 form = form[: m.start()]
1351 v = head_final_semitic_map[tagkeys]
1352                if v.startswith("?"):
1353 v = v[1:]
1354 wxr.wtp.debug(
1355 "suspicious suffix {!r} in language {}: {}".format(
1356 tagkeys, lang, origform
1357 ),
1358 sortid="form_descriptions/1043",
1359 )
1360 tags.extend(v.split())
1362 # If parsing for certain other languages (e.g., Lithuanian,
1363 # French, Finnish), handle some extra head-final tags first
1364 if lang in head_final_other_langs:
1365 m = re.search(head_final_other_re, form)
1366 if m is not None:
1367 tagkeys = m.group(1)
1368            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
1369 form = form[: m.start()]
1370 tags.extend(head_final_other_map[tagkeys].split(" "))
1372 # Handle normal head-final tags
1373 m = re.search(head_final_re, form)
1374 if m is not None:
1375 tagkeys = m.group(3)
1376 # Only replace tags ending with numbers in languages that have
1377 # head-final numeric tags (e.g., Bantu classes); also, don't replace
1378 # tags if the main title ends with them (then presume they are part
1379 # of the word)
1380 # print("head_final_tags form={!r} tagkeys={!r} lang={}"
1381 # .format(form, tagkeys, lang))
1382 tagkeys_contains_digit = re.search(r"\d", tagkeys)
1383 if (
1384 (not tagkeys_contains_digit or lang in head_final_numeric_langs)
1385 and not wxr.wtp.title.endswith(" " + tagkeys) # type:ignore[union-attr]
1386 and
1387 # XXX the above test does not capture when the whole word is a
1388 # xlat_head_map key, so I added the below test to complement
1389 # it; does this break anything?
1390 not wxr.wtp.title == tagkeys
1391 ): # defunct/English,
1392 # "more defunct" -> "more" ["archaic"]
1393            if not tagkeys_contains_digit or lang in head_final_numeric_langs:
1394 form = form[: m.start()]
1395 v = xlat_head_map[tagkeys]
1396                if v.startswith("?"):
1397 v = v[1:]
1398 wxr.wtp.debug(
1399 "suspicious suffix {!r} in language {}: {}".format(
1400 tagkeys, lang, origform
1401 ),
1402 sortid="form_descriptions/1077",
1403 )
1404 tags.extend(v.split())
1406 # Generate warnings about words ending in " or" after processing
1407 if (
1408 (form.endswith(" or") and not origform.endswith(" or"))
1409 or re.search(
1410 r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|"
1411 r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)"
1412 r"($|/| (f|m|sg|pl|anim|inan))",
1413 form,
1414 )
1415 or form.endswith(" du")
1416 ):
1417 if form not in ok_suspicious_forms:
1418 wxr.wtp.debug(
1419 "suspicious unhandled suffix in {}: {!r}, originally {!r}".format(
1420 lang, form, origform
1421 ),
1422 sortid="form_descriptions/1089",
1423 )
1425 # print("parse_head_final_tags: form={!r} tags={}".format(form, tags))
1426 return form, tags
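# A hedged example (assuming "m" is an xlat_head_map key mapping to
# "masculine"; that table is defined elsewhere): for a French linkage
# "chat m", head_final_re matches the trailing " m", so this returns
# ("chat", ["masculine"]) provided the page title does not end in " m" and is
# not itself "m".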
1429def quote_kept_parens(s: str) -> str:
1430 """Changes certain parenthesized expressions so that they won't be
1431 interpreted as parentheses. This is used for parts that are kept as
1432    part of the word, such as "rear admiral (upper half)"."""
1433 return re.sub(
1434 r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|"
1435 r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)",
1436 r"__lpar__\1__rpar__",
1437 s,
1438 )
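# For example, "rear admiral (upper half)" becomes
# "rear admiral __lpar__upper half__rpar__", so the parenthesized part is not
# stripped together with ordinary parenthesized descriptions and can later be
# restored by unquote_kept_parens() below.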
1441def quote_kept_ruby(
1442 wxr: WiktextractContext,
1443 ruby_tuples: list[
1444 tuple[
1445 str,
1446 str,
1447 ]
1448 ],
1449 s: str,
1450) -> str:
1451 if len(ruby_tuples) < 1:
1452 wxr.wtp.debug(
1453 "quote_kept_ruby called with no ruby",
1454 sortid="form_description/1114/20230517",
1455 )
1456 return s
1457 ks = []
1458 rs = []
1459 for k, r in ruby_tuples:
1460 ks.append(re.escape(k))
1461 rs.append(re.escape(r))
1462 if not (ks and rs):
1463 wxr.wtp.debug(
1464 f"empty column in ruby_tuples: {ruby_tuples}",
1465 sortid="form_description/1124/20230606",
1466 )
1467 return s
1468 newm = re.compile(
1469 r"({})\s*\(\s*({})\s*\)".format("|".join(ks), "|".join(rs))
1470 )
1471 rub_re = re.compile(
1472 r"({})".format(
1473 r"|".join(
1474 r"{}\(*{}\)*".format(
1475 re.escape(k),
1476 re.escape(r),
1477 )
1478 for k, r in ruby_tuples
1479 )
1480 )
1481 )
1483 def paren_replace(m: re.Match) -> str:
1484 return re.sub(newm, r"\1__lrub__\2__rrub__", m.group(0))
1486 return re.sub(rub_re, paren_replace, s)
1489def unquote_kept_parens(s: str) -> str:
1490    """Converts the quoted parentheses back to normal parentheses."""
1491 return re.sub(r"__lpar__(.*?)__rpar__", r"(\1)", s)
1494def add_romanization(
1495 wxr: WiktextractContext,
1496 data: WordData,
1497 roman: str,
1498 text: str,
1499 is_reconstruction: bool,
1500 head_group: Optional[int],
1501 ruby: Sequence[tuple[str, str]],
1502) -> None:
1503 tags_lst = ["romanization"]
1504 m = re.match(r"([^:]+):(.+)", roman)
1505 # This function's purpose is to intercept broken romanizations,
1506 # like "Yale: hēnpyeng" style tags. Most romanization styles
1507 # are already present as tags, so we can use decode_tags to find
1508 # them.
1509    if m:
1510 tagsets, topics = decode_tags(m.group(1))
1511 if tagsets:
1512 for tags in tagsets:
1513 tags_lst.extend(tags)
1514 roman = m.group(2)
1515 add_related(
1516 wxr,
1517 data,
1518 tags_lst,
1519 [roman],
1520 text,
1521 True,
1522 is_reconstruction,
1523 head_group,
1524 ruby,
1525 )
1528def add_related(
1529 wxr: WiktextractContext,
1530 data: WordData,
1531 tags_lst: Union[list[str], tuple[str, ...]],
1532 related_list: list[str],
1533 origtext: str,
1534 add_all_canonicals: bool,
1535 is_reconstruction: bool,
1536 head_group: Optional[int],
1537 ruby_data: Optional[Sequence[tuple[str, str]]] = None,
1538) -> Optional[list[tuple[str, ...]]]:
1539 """Internal helper function for some post-processing entries for related
1540    forms (e.g., in word head). This returns a list of lists of tags to be
1541 added to following related forms or None (cf. walrus/English word head,
1542 parenthesized part starting with "both")."""
1543 assert isinstance(wxr, WiktextractContext)
1544 assert isinstance(tags_lst, (list, tuple))
1545 for x in tags_lst:
1546 assert isinstance(x, str)
1547 assert isinstance(related_list, (list, tuple))
1548 assert isinstance(origtext, str)
1549 assert add_all_canonicals in (True, False)
1550 assert isinstance(ruby_data, (list, tuple)) or ruby_data is None
1551    if ruby_data is None:
1552 ruby_data = []
1553 related = " ".join(related_list)
1554 # print("add_related: tags_lst={} related={}".format(tags_lst, related))
1555    if related == "[please provide]":
1556 return None
1557    if related in IGNORED_RELATED:
1558 return None
1559 if is_reconstruction and related.startswith("*") and len(related) > 1:
1560 related = related[1:]
1562 # Get title word, with any reconstruction prefix removed
1563 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title) # type:ignore[arg-type]
1565 def check_related(related: str) -> None:
1566 # Warn about some suspicious related forms
1567 m = re.search(suspicious_related_re, related)
1568 if (m and m.group(0) not in titleword) or (
1569 related in ("f", "m", "n", "c") and len(titleword) >= 3
1570 ):
1571            if "eumhun" in tags_lst:
1572 return
1573            if "cangjie-input" in tags_lst:
1574 return
1575            if "class" in tags_lst:
1576 return
1577            if wxr.wtp.section == "Korean" and re.search(
1578 r"^\s*\w*>\w*\s*$", related
1579 ):
1580 # ignore Korean "i>ni" / "라>나" values
1581 return
1582            if (
1583 wxr.wtp.section == "Burmese"
1584 and "romanization" in tags_lst
1585 and re.search(r":", related)
1586 ):
1587            # ignore Burmese with ":", which is used in Burmese
1588            # transliteration of "း", the high-tone visarga.
1589 return
1590 wxr.wtp.debug(
1591 "suspicious related form tags {}: {!r} in {!r}".format(
1592 tags_lst, related, origtext
1593 ),
1594 sortid="form_descriptions/1147",
1595 )
1597 following_tagsets = None # Tagsets to add to following related forms
1598 roman = None
1599 tagsets1: list[tuple[str, ...]] = [tuple()]
1600 topics1: list[str] = []
1602 m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related)
1603 if m:
1604 paren = m.group(1)
1605 related = related[m.end() :]
1606 m = re.match(r"^(all|both) (.*)", paren)
1607        if m:
1608 tagsets1, topics1 = decode_tags(m.group(2))
1609 following_tagsets = tagsets1
1610 else:
1611 tagsets1, topics1 = decode_tags(paren)
1612 else:
1613 m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related)
1614 if m:
1615 paren = m.group(1)
1616            if paren.startswith("U+"):
1617 related = related[: m.start()]
1618 else:
1619 cls = classify_desc(paren)
1620                if (
1621 cls in ("romanization", "english")
1622 and classify_desc(related[: m.start()]) == "other"
1623 ):
1624 roman = paren
1625 related = related[: m.start()]
1626 else:
1627 related = related[: m.start()]
1628 tagsets1, topics1 = decode_tags(paren)
1629    if related and related.startswith("{{"):
1630 wxr.wtp.debug(
1631 "{{ in word head form - possible Wiktionary error: {!r}".format(
1632 related
1633 ),
1634 sortid="form_descriptions/1177",
1635 )
1636 return None # Likely Wiktionary coding error
1637 related = unquote_kept_parens(related)
1638 # Split related by "/" (e.g., grande/Spanish) superlative in head
1639 # Do not split if / in word title, see π//Japanese
1640 if len(related) > 5 and "/" not in wxr.wtp.title: # type:ignore[operator]
1641 alts = split_at_comma_semi(related, separators=["/"])
1642 else:
1643 alts = [related]
1644    if ruby_data:
1645 # prepare some regex stuff in advance
1646 ks, rs = [], []
1647 for k, r in ruby_data:
1648 ks.append(re.escape(k))
1649 rs.append(re.escape(r))
1650 splitter = r"((?:{})__lrub__(?:{})__rrub__)".format(
1651 "|".join(ks), "|".join(rs)
1652 )
1653 for related in alts:
1654 ruby: list[tuple[str, str]] = []
1655        if ruby_data:
1656 new_related = []
1657 rub_split = re.split(splitter, related)
1658 for s in rub_split:
1659 m = re.match(r"(.+)__lrub__(.+)__rrub__", s)
1660 if m:
1661 # add ruby with (\1, \2)
1662 ruby.append((m.group(1), m.group(2)))
1663 new_related.append(m.group(1))
1664 else:
1665 new_related.append(s)
1666 related = "".join(new_related)
1667 tagsets2, topics2 = decode_tags(" ".join(tags_lst))
1668 for tags1 in tagsets1:
1669 assert isinstance(tags1, (list, tuple))
1670 for tags2 in tagsets2:
1671                assert isinstance(tags2, (list, tuple))
1672 dt: LinkageData = {"word": related}
1673 if roman:
1674 dt["roman"] = roman
1675                if ruby:
1676 dt["ruby"] = ruby
1677                if "alt-of" in tags2:
1678 check_related(related)
1679 data_extend(data, "tags", tags1)
1680 data_extend(data, "tags", tags2)
1681 data_extend(data, "topics", topics1)
1682 data_extend(data, "topics", topics2)
1683 data_append(data, "alt_of", dt)
1684                elif "form-of" in tags2:
1685 check_related(related)
1686 data_extend(data, "tags", tags1)
1687 data_extend(data, "tags", tags2)
1688 data_extend(data, "topics", topics1)
1689 data_extend(data, "topics", topics2)
1690 data_append(data, "form_of", dt)
1691                elif "compound-of" in tags2:
1692 check_related(related)
1693 data_extend(data, "tags", tags1)
1694 data_extend(data, "tags", tags2)
1695 data_extend(data, "topics", topics1)
1696 data_extend(data, "topics", topics2)
1697 data_append(data, "compound", related)
1698 else:
1699 lang = wxr.wtp.section or "LANG_MISSING"
1700 related, final_tags = parse_head_final_tags(
1701 wxr, lang, related
1702 )
1703 # print("add_related: related={!r} tags1={!r} tags2={!r} "
1704 # "final_tags={!r}"
1705 # .format(related, tags1, tags2, final_tags))
1706 tags = list(tags1) + list(tags2) + list(final_tags)
1707 check_related(related)
1708 form: FormData = {"form": related}
1709 if head_group:
1710 form["head_nr"] = head_group
1711 if roman:
1712 form["roman"] = roman
1713                    if ruby:
1714 form["ruby"] = ruby
1715 data_extend(form, "topics", topics1)
1716 data_extend(form, "topics", topics2)
1717                    if topics1 or topics2:
1718 wxr.wtp.debug(
1719 "word head form has topics: {}".format(form),
1720 sortid="form_descriptions/1233",
1721 )
1722 # Add tags from canonical form into the main entry
1723 if "canonical" in tags:
1724                        if related in ("m", "f") and len(titleword) > 1:
1725 wxr.wtp.debug(
1726 "probably incorrect canonical form "
1727 "{!r} ignored (probably tag combination "
1728 "missing from xlat_head_map)".format(related),
1729 sortid="form_descriptions/1241",
1730 )
1731 continue
1732 if (
1733 related != titleword
1734 or add_all_canonicals
1735 or topics1
1736 or topics2
1737 or ruby
1738 ):
1739 data_extend(form, "tags", list(sorted(set(tags))))
1740 else:
1741 # We won't add canonical form here
1742 filtered_tags = list(
1743 x for x in tags if x != "canonical"
1744 )
1745 data_extend(data, "tags", filtered_tags)
1746 continue
1747 else:
1748 data_extend(form, "tags", list(sorted(set(tags))))
1749 # Only insert if the form is not already there
1750 for old in data.get("forms", ()):
1751                        if form == old:
1752 break
1753 else:
1754 data_append(data, "forms", form)
1756 # If this form had pre-tags that started with "both" or "all", add those
1757 # tags also to following related forms that don't have their own tags
1758 # specified.
1759 return following_tagsets
1762# Issue #967: in English entries, word forms are sometimes skipped because
1763# they are taggable words and their distw() is too big, like "clipping" from "clip"
1764WORDS_WITH_FALSE_POSITIVE_TAGS: dict[str, list[str]] = {
1765 "clip": ["clipping"], # XXX remember to change me back to clipping after
1766 "English": ["English", "Englishes"],
1767 "common": ["common", "commoner"],
1768 # tests.
1769}
1771WORDS_WITH_FALSE_POSITIVE_FORMS: dict[str, list[str]] = {
1772 "unaccountability": ["countable", "uncountable"],
1773 "uncountability": ["countable", "uncountable"],
1774}
1776FALSE_POSITIVE_MISSING_FORMS: dict[str, list[str]] = {}
1778FORM_ASSOCIATED_TAG_WORDS: set[str] = {
1779 "participle",
1780 "past",
1781 "present",
1782 "singular",
1783 "plural",
1784 "first-person",
1785 "second-person",
1786 "third-person",
1787 "gerund",
1788}
1791def parse_word_head(
1792 wxr: WiktextractContext,
1793 pos: str,
1794 text: str,
1795 data: WordData,
1796 is_reconstruction: bool,
1797 head_group: Optional[int],
1798 ruby=None,
1799 links=None,
1800) -> None:
1801 """Parses the head line for a word for in a particular language and
1802 part-of-speech, extracting tags and related forms."""
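# Illustrative sketch only (assumed example, not from the source): for an
# English noun head such as
#     text = "walrus (plural walruses)"
# this function would typically append something like
#     {"form": "walruses", "tags": ["plural"]}
# to data["forms"] via add_related(); the exact tags depend on decode_tags()
# and the xlat_* tables, so treat the output as indicative only.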
1803 assert isinstance(wxr, WiktextractContext)
1804 assert isinstance(pos, str)
1805 assert isinstance(text, str)
1806 assert isinstance(data, dict)
1807 assert isinstance(ruby, (list, tuple)) or ruby is None
1808 if ruby is None:
1809 ruby = []
1810 assert is_reconstruction in (True, False)
1811 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text))
1812 # print(f"PARSE_WORD_HEAD: {data=}")
1813 if links is None:
1814 links = []
1816 if len(links) > 0:
1817 # if we have link data (that is, links with stuff like commas and
1818 # spaces), replace word_re with a modified local-scope pattern
1819 # print(f"links {list((c, ord(c)) for link in links for c in link)=}")
1820 word_re = re.compile(
1821 r"\b" # In case we have forms that are longer and contain links
1822 +
1823 # or words as a substring...
1824 r"\b|\b".join(
1825 sorted((re.escape(s) for s in links), key=lambda x: -len(x))
1826 )
1827 + r"\b|"
1828 + word_pattern
1829 )
1830 else:
1831 word_re = word_re_global
1833 if "Lua execution error" in text or "Lua timeout error" in text: 1833 ↛ 1834line 1833 didn't jump to line 1834 because the condition on line 1833 was never true
1834 return
1836 # In Aug 2021, some words had spurious Template:en at the end of head forms
1837 # due to a Wiktionary error.
1838 text = re.sub(r"\s+Template:[-a-zA-Z]+\s*$", "", text)
1840 # Fix words with "superlative:" or "comparative:" at end of head
1841 # e.g. grande/Spanish/Adj
1842 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text)
1844 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb
1845 m = re.search(r", non-past ([^)]+ \([^)]+\))", text)
1846 if m:
1847 add_related(
1848 wxr,
1849 data,
1850 ["non-past"],
1851 [m.group(1)],
1852 text,
1853 True,
1854 is_reconstruction,
1855 head_group,
1856 ruby,
1857 )
1858 text = text[: m.start()] + text[m.end() :]
1860 language = wxr.wtp.section
1861 titleword = re.sub(
1862 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE"
1863 )
1864 titleparts = list(
1865 m.group(0)
1866 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE")
1867 )
1868 if not titleparts: 1868 ↛ 1869line 1868 didn't jump to line 1869 because the condition on line 1868 was never true
1869 return
1871 # Remove " or" from the end to prevent weird canonical forms
1872 if text.endswith(" or"):
1873 for tp in titleparts:
1874 if text.endswith(tp): 1874 ↛ 1875line 1874 didn't jump to line 1875 because the condition on line 1874 was never true
1875 break
1876 else:
1877 text = text.removesuffix(" or").rstrip()
1879 # Handle the part of the head that is not in parentheses. However, certain
1880 # parenthesized parts are part of word, and those must be handled
1881 # specially here.
1882 if ruby: 1882 ↛ 1883line 1882 didn't jump to line 1883 because the condition on line 1882 was never true
1883 text = quote_kept_ruby(wxr, ruby, text)
1884 base = text
1885 base = quote_kept_parens(base)
1886 base = remove_text_in_parentheses(base)
1887 base = base.replace("?", "") # Removes uncertain articles etc
1888 base = re.sub(r"\s+", " ", base)
1889 base = re.sub(r" ([,;])", r"\1", base)
1890 base = re.sub(r"(.*) •.*", r"\1", base)
1891 # Many languages use • as a punctuation mark separating the base
1892 # from the rest of the head. στάδιος/Ancient Greek, issue #176
1893 base = base.strip()
1895 # Check for certain endings in head (mostly for compatibility with weird
1896 # heads, e.g. rata/Romanian "1st conj." at end)
1897 m = re.search(head_end_re, base)
1898 tags: Union[tuple[str, ...], list[str]] = []
1899 if m: 1899 ↛ 1900line 1899 didn't jump to line 1900 because the condition on line 1899 was never true
1900 tags = head_end_map[m.group(1).lower()].split()
1901 data_extend(data, "tags", tags)
1902 base = base[: m.start()]
1904 # Special case: handle Hán Nôm readings for Vietnamese characters
1905 m = re.match(
1906 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base
1907 )
1908 if m: 1908 ↛ 1909line 1908 didn't jump to line 1909 because the condition on line 1908 was never true
1909 tag, readings = m.groups()
1910 tag = re.sub(r"\s+", "-", tag)
1911 for reading in split_at_comma_semi(readings, skipped=links):
1912 add_related(
1913 wxr,
1914 data,
1915 [tag],
1916 [reading],
1917 text,
1918 True,
1919 is_reconstruction,
1920 head_group,
1921 ruby,
1922 )
1923 return
1925 # Special case: Hebrew " [pattern: nnn]" ending
1926 m = re.search(r"\s+\[pattern: ([^]]+)\]", base)
1927 if m: 1927 ↛ 1928line 1927 didn't jump to line 1928 because the condition on line 1927 was never true
1928 add_related(
1929 wxr,
1930 data,
1931 ["class"],
1932 [m.group(1)],
1933 text,
1934 True,
1935 is_reconstruction,
1936 head_group,
1937 ruby,
1938 )
1939 base = base[: m.start()] + base[m.end() :]
1941 # Clean away some messy "Upload an image" template text used in
1942 # American Sign Language:
1943 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp
1944 m = re.search(r"Upload .+ gif image.", base)
1945 if m: 1945 ↛ 1946line 1945 didn't jump to line 1946 because the condition on line 1945 was never true
1946 base = base[: m.start()] + base[m.end() :]
1948 # Split the head into alternatives. This is a complicated task, as
1949 # we do not want to split on "or" or "," when immediately followed by more
1950 # head-final tags, but otherwise do want to split by them.
1951 # 20230907 added "or" to this to handle 'true or false', titles with 'or'
1952 if wxr.wtp.title and ("," in wxr.wtp.title or " or " in wxr.wtp.title):
1953 # A kludge to handle article titles/phrases with commas.
1954 # Preprocess splits to first capture the title, then handle
1955 # all the others as usual.
1956 presplits = re.split(r"({})".format(wxr.wtp.title), base)
1957 splits = []
1958 for psplit in presplits:
1959 if psplit == wxr.wtp.title:
1960 splits.append(psplit)
1961 else:
1962 splits.extend(re.split(head_split_re, psplit))
1963 else:
1964 # Do the normal split; previously the only behavior.
1965 splits = re.split(head_split_re, base)
1966 # print("SPLITS:", splits)
1967 alts: list[str] = []
1968 # print("parse_word_head: splits:", splits,
1969 # "head_split_re_parens:", head_split_re_parens)
1970 for i in range(
1971 0, len(splits) - head_split_re_parens, head_split_re_parens + 1
1972 ):
1973 v = splits[i]
1974 ending = splits[i + 1] or "" # XXX is this correct???
1975 # print("parse_word_head alts v={!r} ending={!r} alts={}"
1976 # .format(v, ending, alts))
1977 if alts and (v == "" and ending):
1978 assert ending[0] == " "
1979 alts[-1] += " or" + ending # ending starts with a space
1980 elif v or ending: 1980 ↛ 1970line 1980 didn't jump to line 1970 because the condition on line 1980 was always true
1981 alts.append((v or "") + (ending or ""))
1982 last = splits[-1].strip()
1983 conn = "" if len(splits) < 3 else splits[-2]
1984 # print("parse_word_head alts last={!r} conn={!r} alts={}"
1985 # .format(last, conn, alts))
1986 if (
1987 alts
1988 and last
1989 and (
1990 last.split()[0] in xlat_head_map
1991 or (
1992 conn == " or "
1993 and (alts[-1] + " or " + last).strip() in xlat_head_map
1994 )
1995 )
1996 ):
1997 alts[-1] += " or " + last
1998 elif last:
1999 alts.append(last)
2001 # print("parse_word_head alts: {}".format(alts))
2002 # print(f"{base=}")
2004 # Process the head alternatives
2005 canonicals: list[tuple[list[str], list[str]]] = []
2006 mode: Optional[str] = None
2007 for alt_i, alt in enumerate(alts):
2008 alt = alt.strip()
2009 if alt.startswith("compound form:"): 2009 ↛ 2010line 2009 didn't jump to line 2010 because the condition on line 2009 was never true
2010 mode = "compound-form"
2011 alt = alt[14:].strip()
2012 if mode == "compound-form": 2012 ↛ 2013line 2012 didn't jump to line 2013 because the condition on line 2012 was never true
2013 add_related(
2014 wxr,
2015 data,
2016 ["in-compounds"],
2017 [alt],
2018 text,
2019 True,
2020 is_reconstruction,
2021 head_group,
2022 ruby,
2023 )
2024 continue
2025 # For non-first parts, see if it can be treated as tags-only
2026 if alt_i == 0:
2027 expanded_alts = [alt]
2028 else:
2029 expanded_alts = map_with(xlat_descs_map, [alt])
2030 # print("EXPANDED_ALTS:", expanded_alts)
2031 tagsets: Optional[list[tuple[str, ...]]]
2032 for alt in expanded_alts:
2033 baseparts = list(m.group(0) for m in word_re.finditer(alt))
2034 if alt_i > 0:
2035 tagsets, topics = decode_tags(" ".join(baseparts))
2036 if not any("error-unknown-tag" in x for x in tagsets):
2037 data_extend(data, "topics", topics)
2038 for tags1 in tagsets:
2039 data_extend(data, "tags", tags1)
2040 continue
2042 alt, tags = parse_head_final_tags(
2043 wxr, language or "MISSING_LANG", alt
2044 )
2045 tags = list(tags) # Make sure we don't modify anything cached
2046 tags.append("canonical")
2047 if alt_i == 0 and "," in wxr.wtp.title: # type:ignore[operator]
2048 # Kludge to handle article titles/phrases with commas.
2049 # basepart's regex strips commas, which leads to a
2050 # canonical form that is the title phrase without a comma.
2051 # basepart in add_related is almost immediately joined with
2052 # spaces anyhow. XXX not exactly sure why it's
2053 # canonicals.append((tags, baseparts)) and not (tags, [alt])
2054 baseparts = [alt]
2055 canonicals.append((tags, baseparts))
2056 for tags, baseparts in canonicals:
2057 add_related(
2058 wxr,
2059 data,
2060 tags,
2061 baseparts,
2062 text,
2063 len(canonicals) > 1,
2064 is_reconstruction,
2065 head_group,
2066 ruby,
2067 )
2069 # Handle parenthesized descriptors for the word form and links to
2070 # related words
2071 text = quote_kept_parens(text)
2072 parens = list(
2073 m.group(2)
2074 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text)
2075 )
2076 parens.extend(
2077 m.group(1)
2078 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text)
2079 )
2080 have_romanization = False
2081 have_ruby = False
2082 hiragana = ""
2083 katakana = ""
2084 for paren in parens:
2085 paren = paren.strip()
2086 if not paren: 2086 ↛ 2087line 2086 didn't jump to line 2087 because the condition on line 2086 was never true
2087 continue
2088 if paren.startswith("see "):
2089 continue
2090 if paren.startswith("U+"): 2090 ↛ 2091line 2090 didn't jump to line 2091 because the condition on line 2090 was never true
2091 continue
2092 # In some rare cases, strip the word that inflects from the form
2093 # description, e.g. "look through rose-tinted glasses"/English.
2094 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren)
2096 # If it starts with hiragana or katakana, treat as such form. Note
2097 # that each hiragana/katakana character is in separate parentheses,
2098 # so we must concatenate them.
2099 try:
2100 un = unicodedata.name(paren[0]).split()[0]
2101 except ValueError:
2102 un = "INVALID"
2103 if un == "KATAKANA": 2103 ↛ 2104line 2103 didn't jump to line 2104 because the condition on line 2103 was never true
2104 katakana += paren
2105 have_ruby = True
2106 continue
2107 if un == "HIRAGANA": 2107 ↛ 2108line 2107 didn't jump to line 2108 because the condition on line 2107 was never true
2108 hiragana += paren
2109 have_ruby = True
2110 continue
2112 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes,"
2113 # in the middle of the parenthesized expression, e.g. 薄
2114 def strokes_repl(m: re.Match) -> str:
2115 strokes1, tags1, strokes2, tags2 = m.groups()
2116 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]:
2117 tags = tags.split(", ")
2118 tags = list(
2119 "Mainland China" if t == "Mainland" else t for t in tags
2120 )
2121 tags.append("strokes")
2122 add_related(
2123 wxr,
2124 data,
2125 tags,
2126 [strokes],
2127 text,
2128 True,
2129 is_reconstruction,
2130 head_group,
2131 ruby,
2132 )
2133 return ", "
2135 paren = re.sub(
2136 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ",
2137 strokes_repl,
2138 paren,
2139 )
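# Illustrative sketch (assumed example): for a parenthesized part like
#     ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes, "
# the substitution above would typically produce form entries roughly like
#     {"form": "16", "tags": ["Japan", "Mainland China", "strokes"]}
#     {"form": "17", "tags": ["Hong Kong", "Taiwan", "strokes"]}
# through add_related(); the exact tag lists come from the comma split and
# sorting inside add_related(), so take them as indicative only.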
2141 descriptors = map_with(xlat_descs_map, [paren])
2142 new_desc = []
2143 for desc in descriptors:
2144 new_desc.extend(
2145 map_with(
2146 xlat_tags_map,
2147 split_at_comma_semi(desc, extra=[", or "], skipped=links),
2148 )
2149 )
2150 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None
2151 following_tags = None # Added to prev_tags from previous parenthesized
2152 # part, e.g. walrus/English
2153 # "(both nonstandard, proscribed, uncommon)"
2154 for desc_i, desc in enumerate(new_desc):
2155 # print("HEAD DESC: {!r}".format(desc))
2157 # Abort on certain descriptors (assume remaining values are
2158 # examples or uninteresting, cf. gaan/Navajo, horior/Latin)
2159 if re.match(r"^(per |e\.g\.$)", desc): 2159 ↛ 2160line 2159 didn't jump to line 2160 because the condition on line 2159 was never true
2160 break
2162 # If it all consists of CJK characters, add it with the
2163 # CJK tag. This is used at least for some Vietnamese
2164 # words (e.g., ba/Vietnamese)
2165 try:
2166 if all(unicodedata.name(x).startswith("CJK ") for x in desc): 2166 ↛ 2167line 2166 didn't jump to line 2167 because the condition on line 2166 was never true
2167 add_related(
2168 wxr,
2169 data,
2170 ["CJK"],
2171 [desc],
2172 text,
2173 True,
2174 is_reconstruction,
2175 head_group,
2176 ruby,
2177 )
2178 continue
2179 except ValueError:
2180 pass
2182 # Handle some special cases
2183 splitdesc = desc.split()
2184 if ( 2184 ↛ 2193line 2184 didn't jump to line 2193 because the condition on line 2184 was never true
2185 len(splitdesc) >= 3
2186 and splitdesc[1] == "superlative"
2187 and classify_desc(splitdesc[0]) != "tags"
2188 and prev_tags
2189 ):
2190 # Handle the special case of second comparative after comma,
2191 # followed by superlative without comma. E.g.
2192 # mal/Portuguese/Adv
2193 for ts in prev_tags:
2194 add_related(
2195 wxr,
2196 data,
2197 ts,
2198 [splitdesc[0]],
2199 text,
2200 True,
2201 is_reconstruction,
2202 head_group,
2203 ruby,
2204 )
2205 desc = " ".join(splitdesc[1:])
2206 elif ( 2206 ↛ 2214line 2206 didn't jump to line 2214 because the condition on line 2206 was never true
2207 len(splitdesc) == 2
2208 and splitdesc[0] in ("also", "and")
2209 and prev_tags
2210 and classify_desc(splitdesc[1]) != "tags"
2211 ):
2212 # Sometimes alternative forms are prefixed with "also" or
2213 # "and"
2214 for ts in prev_tags:
2215 add_related(
2216 wxr,
2217 data,
2218 ts,
2219 [splitdesc[1]],
2220 text,
2221 True,
2222 is_reconstruction,
2223 head_group,
2224 ruby,
2225 )
2226 continue
2227 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2227 ↛ 2228line 2227 didn't jump to line 2228 because the condition on line 2227 was never true
2228 continue
2230 # If only one word, assume it is a comma-separated alternative
2231 # to the previous one
2232 if " " not in desc:
2233 cls = classify_desc(desc)
2234 if cls != "tags":
2235 if prev_tags: 2235 ↛ 2237line 2235 didn't jump to line 2237 because the condition on line 2235 was never true
2236 # Assume comma-separated alternative to previous one
2237 for ts in prev_tags:
2238 add_related(
2239 wxr,
2240 data,
2241 ts,
2242 [desc],
2243 text,
2244 True,
2245 is_reconstruction,
2246 head_group,
2247 ruby,
2248 )
2249 continue
2250 elif distw(titleparts, desc) <= 0.5: 2250 ↛ 2253line 2250 didn't jump to line 2253 because the condition on line 2250 was never true
2251 # Similar to head word, assume a dialectal variation to
2252 # the base form. Cf. go/Alemannic German/Verb
2253 add_related(
2254 wxr,
2255 data,
2256 ["alternative"],
2257 [desc],
2258 text,
2259 True,
2260 is_reconstruction,
2261 head_group,
2262 ruby,
2263 )
2264 continue
2265 elif (
2266 cls in ("romanization", "english")
2267 and not have_romanization
2268 and classify_desc(titleword) == "other"
2269 and not (
2270 "categories" in data and desc in data["categories"]
2271 )
2272 ):
2273 # Assume it to be a romanization
2274 add_romanization(
2275 wxr,
2276 data,
2277 desc,
2278 text,
2279 is_reconstruction,
2280 head_group,
2281 ruby,
2282 )
2283 have_romanization = True
2284 continue
2286 m = re.match(r"^(\d+) strokes?$", desc)
2287 if m:
2288 # Special case, used to give #strokes for Han characters
2289 add_related(
2290 wxr,
2291 data,
2292 ["strokes"],
2293 [m.group(1)],
2294 text,
2295 True,
2296 is_reconstruction,
2297 head_group,
2298 ruby,
2299 )
2300 continue
2302 # See if it is radical+strokes
2303 m = re.match(
2304 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF"
2305 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)"
2306 r"( in (Japanese|Chinese|traditional Chinese|"
2307 r"simplified Chinese))?$",
2308 desc,
2309 )
2310 if m: 2310 ↛ 2313line 2310 didn't jump to line 2313 because the condition on line 2310 was never true
2311 # Special case, used to give radical + strokes for Han
2312 # characters
2313 radical_strokes = m.group(1)
2314 lang = m.group(3)
2315 t = ["radical+strokes"]
2316 if lang:
2317 t.extend(lang.split())
2318 add_related(
2319 wxr,
2320 data,
2321 t,
2322 [radical_strokes],
2323 text,
2324 True,
2325 is_reconstruction,
2326 head_group,
2327 ruby,
2328 )
2329 prev_tags = None
2330 following_tags = None
2331 continue
2333 # See if it indicates historical Katakana orthography (←) or
2334 # just an otherwise katakana/hiragana form
2335 m = re.match(r"←\s*|kana\s+", desc)
2336 if m: 2336 ↛ 2337line 2336 didn't jump to line 2337 because the condition on line 2336 was never true
2337 if desc.startswith("←"):
2338 t1 = "historical "
2339 else:
2340 t1 = ""
2341 x = desc[m.end() :]
2342 if x.endswith("?"):
2343 x = x[:-1]
2344 # XXX should we add a tag indicating uncertainty?
2345 if x:
2346 name = unicodedata.name(x[0])
2347 if name.startswith("HIRAGANA "):
2348 desc = t1 + "hiragana " + x
2349 elif name.startswith("KATAKANA "):
2350 desc = t1 + "katakana " + x
2352 # See if it is "n strokes in Chinese" or similar
2353 m = re.match(
2354 r"(\d+) strokes in (Chinese|Japanese|"
2355 r"traditional Chinese|simplified Chinese)$",
2356 desc,
2357 )
2358 if m: 2358 ↛ 2360line 2358 didn't jump to line 2360 because the condition on line 2358 was never true
2359 # Special case, used to give just strokes for some Han chars
2360 strokes = m.group(1)
2361 lang = m.group(2)
2362 t = ["strokes"]
2363 t.extend(lang.split())
2364 add_related(
2365 wxr,
2366 data,
2367 t,
2368 [strokes],
2369 text,
2370 True,
2371 is_reconstruction,
2372 head_group,
2373 ruby,
2374 )
2375 prev_tags = None
2376 following_tags = None
2377 continue
2379 # American Sign Language has images (or requests for image)
2380 # as heads, + this ASL gloss after.
2381 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text)
2382 if m2: 2382 ↛ 2383line 2382 didn't jump to line 2383 because the condition on line 2382 was never true
2383 add_related(
2384 wxr,
2385 data,
2386 ["ASL-gloss"],
2387 [m2.group(1)],
2388 text,
2389 True,
2390 is_reconstruction,
2391 head_group,
2392 ruby,
2393 )
2394 continue
2396 parts = list(m.group(0) for m in re.finditer(word_re, desc))
2397 if not parts: 2397 ↛ 2398line 2397 didn't jump to line 2398 because the condition on line 2397 was never true
2398 prev_tags = None
2399 following_tags = None
2400 continue
2402 # Check for certain language-specific header part starts that
2403 # modify the tags applied to the following form
2404 if len(parts) == 2 and language in lang_specific_head_map: 2404 ↛ 2405line 2404 didn't jump to line 2405 because the condition on line 2404 was never true
2405 ht = lang_specific_head_map[language]
2406 if parts[0] in ht:
2407 rem_tags, add_tags = ht[parts[0]]
2408 new_prev_tags1: list[list[str]] = []
2409 tags2: Union[tuple[str, ...], list[str]]
2410 for tags2 in prev_tags or [()]:
2411 if rem_tags is True: # Remove all old tags
2412 tsets = set()
2413 else:
2414 tsets = set(tags2) - set(rem_tags.split())
2415 tsets = tsets | set(add_tags.split())
2416 tags = list(sorted(tsets))
2417 add_related(
2418 wxr,
2419 data,
2420 tags,
2421 [parts[1]],
2422 text,
2423 True,
2424 is_reconstruction,
2425 head_group,
2426 ruby,
2427 )
2428 new_prev_tags1.append(tags)
2429 prev_tags = new_prev_tags1
2430 following_tags = None
2431 continue
2433 # Handle the special case of descriptors that are parenthesized,
2434 # e.g., (archaic or Scotland)
2435 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc)
2436 if m is not None and classify_desc(m.group(1)) == "tags": 2436 ↛ 2437line 2436 didn't jump to line 2437 because the condition on line 2436 was never true
2437 tagpart = m.group(1)
2438 related = [m.group(2)]
2439 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True)
2440 if topics:
2441 wxr.wtp.debug(
2442 "parenthized head part {!r} contains topics: {}".format(
2443 tagpart, topics
2444 ),
2445 sortid="form_descriptions/1647",
2446 )
2447 elif m is not None and re.match(r"in the sense ", m.group(1)): 2447 ↛ 2450line 2447 didn't jump to line 2450 because the condition on line 2447 was never true
2448 # Handle certain ignored cases
2449 # e.g. bord/Danish: in the sense "plank"
2450 related = [m.group(2)]
2451 tagsets = [()]
2452 else:
2453 # Normal parsing of the descriptor
2454 alt_related = None
2455 alt_tagsets = None
2456 tagsets = None
2457 for i in range(len(parts), 0, -1):
2458 related = parts[i:]
2459 tagparts = parts[:i]
2460 # print(" i={} related={} tagparts={}"
2461 # .format(i, related, tagparts))
2462 tagsets, topics = decode_tags(
2463 " ".join(tagparts), no_unknown_starts=True
2464 )
2465 # print("tagparts={!r} tagsets={} topics={} related={} "
2466 # "alt_related={} distw={:.2f}"
2467 # .format(tagparts, tagsets, topics, related,
2468 # alt_related,
2469 # distw(titleparts, parts[i - 1])))
2470 if (
2471 topics
2472 or not tagsets
2473 or any("error-unknown-tag" in x for x in tagsets)
2474 ):
2475 if alt_related is not None: 2475 ↛ 2477line 2475 didn't jump to line 2477 because the condition on line 2475 was never true
2476 # We already had a good division, so let's stop.
2477 break
2478 # Bad division, try deeper
2479 continue
2480 # print(f"{parts[i-1]=}, {parts=}")
2481 if (
2482 i > 1
2483 and len(parts[i - 1]) >= 4
2484 and (
2485 distw(titleparts, parts[i - 1]) <= 0.4
2486 or (
2487 wxr.wtp.section == "English"
2488 and wxr.wtp.title
2489 in WORDS_WITH_FALSE_POSITIVE_TAGS
2490 and parts[i - 1]
2491 in WORDS_WITH_FALSE_POSITIVE_TAGS[wxr.wtp.title]
2492 )
2493 )
2494 # Fixes 'unaccountability' wiktext #1196
2495 and not (
2496 wxr.wtp.section == "English"
2497 and wxr.wtp.title in WORDS_WITH_FALSE_POSITIVE_FORMS
2498 and parts[i - 1]
2499 in WORDS_WITH_FALSE_POSITIVE_FORMS[wxr.wtp.title]
2500 )
2501 # Fixes wiktextract #983, where "participle"
2502 # was too close to "Martinize" and so this accepted
2503 # ["participle", "Martinize"] as matching; this
2504 # kludge prevents this from happening if titleparts
2505 # is shorter than what would be 'related'.
2506 # This breaks if we want to detect stuff that
2507 # actually gets an extra space-separated word when
2508 # 'inflected'.
2509 and (
2510 len(titleparts) >= len(parts[i - 1 :])
2511 or "or" in parts[i - 1 :]
2512 )
2513 ):
2514 # print(f"Reached; {parts=}, {parts[i-1]=}")
2515 alt_related = related
2516 alt_tagsets = tagsets
2517 continue
2518 alt_related = None
2519 alt_tagsets = None
2520 break
2521 else:
2522 if alt_related is None: 2522 ↛ 2554line 2522 didn't jump to line 2554 because the condition on line 2522 was always true
2523 # Check if the parenthesized part is likely a
2524 # romanization
2525 if ( 2525 ↛ 2533line 2525 didn't jump to line 2533 because the condition on line 2525 was never true
2526 (have_ruby or classify_desc(base) == "other")
2527 and classify_desc(paren) == "romanization"
2528 and not (
2529 "categories" in data
2530 and desc in data["categories"]
2531 )
2532 ):
2533 for r in split_at_comma_semi(
2534 paren, extra=[" or "], skipped=links
2535 ):
2536 add_romanization(
2537 wxr,
2538 data,
2539 r,
2540 text,
2541 is_reconstruction,
2542 head_group,
2543 ruby,
2544 )
2545 have_romanization = True
2546 continue
2547 tagsets = [("error-unrecognized-head-form",)]
2548 wxr.wtp.debug(
2549 "unrecognized head form: {}".format(desc),
2550 sortid="form_descriptions/1698",
2551 )
2552 continue
2554 if alt_related is not None: 2554 ↛ 2555line 2554 didn't jump to line 2555 because the condition on line 2554 was never true
2555 related = alt_related
2556 tagsets = alt_tagsets
2558 # print("FORM END: tagsets={} related={}".format(tagsets, related))
2559 # print("==================")
2561 if ( 2561 ↛ 2582line 2561 didn't jump to line 2582 because the condition on line 2561 was never true
2562 len(related) <= 0
2563 and wxr.wtp.section == "English"
2564 and tagsets is not None
2565 and len(tagsets) > 0
2566 and not any(
2567 s.startswith("error-") for tagset in tagsets for s in tagset
2568 )
2569 and any(
2570 s in FORM_ASSOCIATED_TAG_WORDS
2571 for tagset in tagsets
2572 for s in tagset
2573 )
2574 and (
2575 wxr.wtp.title not in FALSE_POSITIVE_MISSING_FORMS
2576 and not any(
2577 rel in FALSE_POSITIVE_MISSING_FORMS[wxr.wtp.title or ""]
2578 for rel in related
2579 )
2580 )
2581 ):
2582 wxr.wtp.debug(
2583 f"Form tags without form: {desc=}, {tagsets=}",
2584 sortid="form_description/20250107",
2585 )
2586 if not tagsets: 2586 ↛ 2587line 2586 didn't jump to line 2587 because the condition on line 2586 was never true
2587 continue
2589 # print(f"{alts=}, {related=}")
2591 assert isinstance(related, (list, tuple))
2592 related_str = " ".join(related)
2593 if "or" in titleparts:
2594 alts = [related_str]
2595 else:
2596 alts = split_at_comma_semi(
2597 related_str, separators=[r"\bor\b"], skipped=links
2598 )
2599 # print(f"{related_str=}, {alts=}")
2600 if not alts:
2601 alts = [""]
2602 for related_str in alts:
2603 if related_str:
2604 if prev_tags and (
2605 all(
2606 all(
2607 t in ["nonstandard", "dialectal"]
2608 or valid_tags[t] == "dialect"
2609 for t in ts
2610 )
2611 for ts in tagsets
2612 )
2613 or (
2614 any("participle" in ts for ts in prev_tags)
2615 and all(
2616 "attributive" in ts
2617 or any(valid_tags[t] == "gender" for t in ts)
2618 for ts in tagsets
2619 )
2620 )
2621 ):
2622 # Merged with previous tags. Don't update previous
2623 # tags here; cf. burn/English/Verb
2624 for tags_l in tagsets:
2625 for ts in prev_tags:
2626 tags_l1 = list(sorted(set(tags_l) | set(ts)))
2627 add_related(
2628 wxr,
2629 data,
2630 tags_l1,
2631 [related_str],
2632 text,
2633 True,
2634 is_reconstruction,
2635 head_group,
2636 ruby,
2637 )
2638 else:
2639 # Not merged with previous tags
2640 for tags_l in tagsets:
2641 if following_tags is not None: 2641 ↛ 2642line 2641 didn't jump to line 2642 because the condition on line 2641 was never true
2642 for ts in following_tags:
2643 tags_l1 = list(
2644 sorted(set(tags_l) | set(ts))
2645 )
2646 add_related(
2647 wxr,
2648 data,
2649 tags_l1,
2650 [related_str],
2651 text,
2652 True,
2653 is_reconstruction,
2654 head_group,
2655 ruby,
2656 )
2657 else:
2658 ret = add_related(
2659 wxr,
2660 data,
2661 tags_l,
2662 [related_str],
2663 text,
2664 True,
2665 is_reconstruction,
2666 head_group,
2667 ruby,
2668 )
2669 if ret is not None: 2669 ↛ 2670line 2669 didn't jump to line 2670 because the condition on line 2669 was never true
2670 following_tags = ret
2671 prev_tags = tagsets
2672 else:
2673 if desc_i < len(new_desc) - 1 and all( 2673 ↛ 2680line 2673 didn't jump to line 2680 because the condition on line 2673 was never true
2674 "participle" in ts or "infinitive" in ts
2675 for ts in tagsets
2676 ):
2677 # Interpret it as a standalone form description
2678 # in the middle, probably followed by forms or
2679 # language-specific descriptors. cf. drikke/Danish
2680 new_prev_tags2 = []
2681 for ts1 in prev_tags or [()]:
2682 for ts2 in tagsets:
2683 ts = tuple(sorted(set(ts1) | set(ts2)))
2684 new_prev_tags2.append(ts)
2685 prev_tags = new_prev_tags2
2686 continue
2687 for tags in tagsets:
2688 data_extend(data, "tags", tags)
2689 prev_tags = tagsets
2690 following_tags = None
2692 # Finally, if we collected hiragana/katakana, add them now
2693 if hiragana: 2693 ↛ 2694line 2693 didn't jump to line 2694 because the condition on line 2693 was never true
2694 add_related(
2695 wxr,
2696 data,
2697 ["hiragana"],
2698 [hiragana],
2699 text,
2700 True,
2701 is_reconstruction,
2702 head_group,
2703 ruby,
2704 )
2705 if katakana: 2705 ↛ 2706line 2705 didn't jump to line 2706 because the condition on line 2705 was never true
2706 add_related(
2707 wxr,
2708 data,
2709 ["katakana"],
2710 [katakana],
2711 text,
2712 True,
2713 is_reconstruction,
2714 head_group,
2715 ruby,
2716 )
2718 # XXX check if this is actually relevant; tags in word root data
2719 # are extremely rare (not sure where they slip through).
2720 tags = data.get("tags", []) # type:ignore
2721 if len(tags) > 0:
2722 # wxr.wtp.debug(
2723 # f"Tags appear in word root data: {data['tags']=}", # type:ignore
2724 # sortid="form_descriptions/2620/20240606",
2725 # ) # Messes up tests.
2726 data["tags"] = list(sorted(set(tags))) # type:ignore
2729def parse_sense_qualifier(
2730 wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData]
2731) -> None:
2732 """Parses tags or topics for a sense or some other data. The values are
2733 added into the dictionary ``data``."""
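# Illustrative sketch (assumed examples): a qualifier like "archaic, dialectal"
# would normally end up in data["tags"], whereas free English text such as
# "of a ship" would be stored in data["qualifier"]; which path is taken depends
# on classify_desc() and decode_tags(), so these are indicative outcomes only.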
2734 assert isinstance(wxr, WiktextractContext)
2735 assert isinstance(text, str)
2736 assert isinstance(data, dict)
2737 # print("parse_sense_qualifier:", text)
2738 if re.match(r"\([^()]+\)$", text): 2738 ↛ 2739line 2738 didn't jump to line 2739 because the condition on line 2738 was never true
2739 text = text[1:-1]
2740 if re.match(r'"[^"]+"$', text): 2740 ↛ 2741line 2740 didn't jump to line 2741 because the condition on line 2740 was never true
2741 text = text[1:-1]
2742 lst = map_with(xlat_descs_map, [text])
2743 sense_tags: list[str] = []
2744 for text in lst:
2745 for semi in split_at_comma_semi(text):
2746 if not semi: 2746 ↛ 2747line 2746 didn't jump to line 2747 because the condition on line 2746 was never true
2747 continue
2748 orig_semi = semi
2749 idx = semi.find(":")
2750 if idx >= 0: 2750 ↛ 2751line 2750 didn't jump to line 2751 because the condition on line 2750 was never true
2751 semi = semi[:idx]
2752 cls = classify_desc(semi, allow_unknown_tags=True)
2753 # print("parse_sense_qualifier: classify_desc: {} -> {}"
2754 # .format(semi, cls))
2755 if cls == "tags":
2756 tagsets, topics = decode_tags(semi)
2757 data_extend(data, "topics", topics)
2758 # XXX should think how to handle distinct options better,
2759 # e.g., "singular and plural genitive"; that can't really be
2760 # done with changing the calling convention of this function.
2761 # Should split sense if more than one category of tags differs.
2762 for tags in tagsets:
2763 sense_tags.extend(tags)
2764 elif cls == "taxonomic": 2764 ↛ 2765line 2764 didn't jump to line 2765 because the condition on line 2764 was never true
2765 if re.match(r"×[A-Z]", semi):
2766 sense_tags.append("extinct")
2767 semi = semi[1:]
2768 data["taxonomic"] = semi
2769 elif cls == "english":
2770 if "qualifier" in data and data["qualifier"] != orig_semi: 2770 ↛ 2771line 2770 didn't jump to line 2771 because the condition on line 2770 was never true
2771 data["qualifier"] += "; " + orig_semi
2772 else:
2773 data["qualifier"] = orig_semi
2774 else:
2775 wxr.wtp.debug(
2776 "unrecognized sense qualifier: {}".format(text),
2777 sortid="form_descriptions/1831",
2778 )
2779 sense_tags = list(sorted(set(sense_tags)))
2780 data_extend(data, "tags", sense_tags)
2783def parse_pronunciation_tags(
2784 wxr: WiktextractContext, text: str, data: SoundData
2785) -> None:
2786 assert isinstance(wxr, WiktextractContext)
2787 assert isinstance(text, str)
2788 assert isinstance(data, dict)
2789 text = text.strip()
2790 if not text: 2790 ↛ 2791line 2790 didn't jump to line 2791 because the condition on line 2790 was never true
2791 return
2792 cls = classify_desc(text)
2793 notes = []
2794 if cls == "tags":
2795 tagsets, topics = decode_tags(text)
2796 data_extend(data, "topics", topics)
2797 for tagset in tagsets:
2798 for t in tagset:
2799 if " " in t: 2799 ↛ 2800line 2799 didn't jump to line 2800 because the condition on line 2799 was never true
2800 notes.append(t)
2801 else:
2802 data_append(data, "tags", t)
2803 else:
2804 notes.append(text)
2805 if notes:
2806 data["note"] = "; ".join(notes)
2809def parse_translation_desc(
2810 wxr: WiktextractContext, lang: str, text: str, tr: TranslationData
2811) -> None:
2812 assert isinstance(wxr, WiktextractContext)
2813 assert isinstance(lang, str) # The language of ``text``
2814 assert isinstance(text, str)
2815 assert isinstance(tr, dict)
2816 # print("parse_translation_desc:", text)
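# Illustrative sketch (assumed example, not from the source): for a
# translation item such as
#     text = "слово (slovo) (obsolete)"
# the loop below would typically strip the trailing parenthesized parts,
# putting "slovo" into tr["roman"] and the "obsolete" tag into tr["tags"],
# leaving tr["word"] = "слово"; actual results depend on classify_desc().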
2818 # Process all parenthesized parts from the translation item
2819 note = None
2820 restore_beginning = ""
2821 restore_end = ""
2822 while True:
2823 beginning = False
2824 # See if we can find a parenthesized expression at the end
2825 m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text)
2826 if m:
2827 par = m.group(1)
2828 text = text[: m.start()]
2829 if par.startswith(("literally ", "lit.")):
2830 continue # Not useful for disambiguation in many idioms
2831 else:
2832 # See if we can find a parenthesized expression at the start
2833 m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text)
2834 if m:
2835 par = m.group(1)
2836 text = text[m.end() :]
2837 beginning = True
2838 if re.match(r"^(\d|\s|,| or | and )+$", par): 2838 ↛ 2843line 2838 didn't jump to line 2843 because the condition on line 2838 was never true
2839 # Looks like this beginning parenthesized expression only
2840 # contains digits or their combinations. We assume such
2841 # to be sense descriptions if no sense has been selected,
2842 # or otherwise just ignore them.
2843 if not tr.get("sense"):
2844 tr["sense"] = par
2845 continue
2846 else:
2847 # See if we can find a parenthesized expression in the middle.
2848 # Romanizations are sometimes between word and gender marker,
2849 # e.g. wife/English/Tr/Yiddish.
2850 m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text)
2851 if m:
2852 par = m.group(1)
2853 text = text[: m.start()] + text[m.end() :]
2854 else:
2855 # No more parenthesized expressions - break out of the loop
2856 break
2858 # Some cleanup of artifacts that may result from skipping some templates
2859 # in earlier stages
2860 if par.startswith(": "): 2860 ↛ 2861line 2860 didn't jump to line 2861 because the condition on line 2860 was never true
2861 par = par[2:]
2862 if par.endswith(","): 2862 ↛ 2863line 2862 didn't jump to line 2863 because the condition on line 2862 was never true
2863 par = par[:-1]
2864 if re.match(r'^[“"]([^“”"]*)[“”"]$', par): 2864 ↛ 2865line 2864 didn't jump to line 2865 because the condition on line 2864 was never true
2865 par = par[1:-1]
2866 par = par.strip()
2868 # Check for special script pronunciation followed by romanization,
2869 # used in many Asian languages.
2870 lst = par.split(", ")
2871 if len(lst) == 2:
2872 a, r = lst
2873 if classify_desc(a) == "other":
2874 cls = classify_desc(r)
2875 # print("parse_translation_desc: r={} cls={}".format(r, cls))
2876 if cls == "romanization" or (
2877 cls == "english" and len(r.split()) == 1 and r[0].islower()
2878 ):
2879 if tr.get("alt") and tr.get("alt") != a: 2879 ↛ 2880line 2879 didn't jump to line 2880 because the condition on line 2879 was never true
2880 wxr.wtp.debug(
2881 'more than one value in "alt": {} vs. {}'.format(
2882 tr["alt"], a
2883 ),
2884 sortid="form_descriptions/1930",
2885 )
2886 tr["alt"] = a
2887 if tr.get("roman") and tr.get("roman") != r: 2887 ↛ 2888line 2887 didn't jump to line 2888 because the condition on line 2887 was never true
2888 wxr.wtp.debug(
2889 'more than one value in "roman": '
2890 "{} vs. {}".format(tr["roman"], r),
2891 sortid="form_descriptions/1936",
2892 )
2893 tr["roman"] = r
2894 continue
2896 # Check for certain comma-separated tags combined with English text
2897 # at the beginning or end of a comma-separated parenthesized list
2898 while len(lst) > 1:
2899 cls = classify_desc(lst[0])
2900 if cls == "tags": 2900 ↛ 2901line 2900 didn't jump to line 2901 because the condition on line 2900 was never true
2901 tagsets, topics = decode_tags(lst[0])
2902 for t in tagsets:
2903 data_extend(tr, "tags", t)
2904 data_extend(tr, "topics", topics)
2905 lst = lst[1:]
2906 continue
2907 cls = classify_desc(lst[-1])
2908 if cls == "tags":
2909 tagsets, topics = decode_tags(lst[-1])
2910 for t in tagsets:
2911 data_extend(tr, "tags", t)
2912 data_extend(tr, "topics", topics)
2913 lst = lst[:-1]
2914 continue
2915 break
2916 par = ", ".join(lst)
2918 if not par: 2918 ↛ 2919line 2918 didn't jump to line 2919 because the condition on line 2918 was never true
2919 continue
2920 if re.search(tr_ignored_parens_re, par): 2920 ↛ 2921line 2920 didn't jump to line 2921 because the condition on line 2920 was never true
2921 continue
2922 if par.startswith("numeral:"):
2923 par = par[8:].strip()
2925 # Classify the part in parenthesis and process accordingly
2926 cls = classify_desc(par)
2927 # print("parse_translation_desc classify: {!r} -> {}"
2928 # .format(par, cls))
2929 if par == text:
2930 pass
2931 if par == "f": 2931 ↛ 2932line 2931 didn't jump to line 2932 because the condition on line 2931 was never true
2932 data_append(tr, "tags", "feminine")
2933 elif par == "m": 2933 ↛ 2934line 2933 didn't jump to line 2934 because the condition on line 2933 was never true
2934 data_append(tr, "tags", "masculine")
2935 elif cls == "tags":
2936 tagsets, topics = decode_tags(par)
2937 for tags in tagsets:
2938 data_extend(tr, "tags", tags)
2939 data_extend(tr, "topics", topics)
2940 elif cls == "english":
2941 # If the text contains any of certain grammatical words, treat it
2942 # as a "note" instead of "english"
2943 if re.search(tr_note_re, par):
2944 if par.endswith(":"): 2944 ↛ 2945line 2944 didn't jump to line 2945 because the condition on line 2944 was never true
2945 par = par[:-1]
2946 if par not in ("see entry for forms",): 2946 ↛ 2822line 2946 didn't jump to line 2822 because the condition on line 2946 was always true
2947 if note: 2947 ↛ 2948line 2947 didn't jump to line 2948 because the condition on line 2947 was never true
2948 note = note + ";" + par
2949 else:
2950 note = par
2951 else:
2952 # There can be more than one parenthesized english item, see
2953 # e.g. Aunt/English/Translations/Tamil
2954 if tr.get("english"): 2954 ↛ 2955line 2954 didn't jump to line 2955 because the condition on line 2954 was never true
2955 tr["english"] += "; " + par
2956 else:
2957 tr["english"] = par
2958 elif cls == "romanization":
2959 # print("roman text={!r} text cls={}"
2960 # .format(text, classify_desc(text)))
2961 if classify_desc(text) in (
2962 "english",
2963 "romanization",
2964 ) and lang not in ("Egyptian",):
2965 if beginning:
2966 restore_beginning += "({}) ".format(par)
2967 else:
2968 restore_end = " ({})".format(par) + restore_end
2969 else:
2970 if tr.get("roman"): 2970 ↛ 2971line 2970 didn't jump to line 2971 because the condition on line 2970 was never true
2971 wxr.wtp.debug(
2972 'more than one value in "roman": {} vs. {}'.format(
2973 tr["roman"], par
2974 ),
2975 sortid="form_descriptions/2013",
2976 )
2977 tr["roman"] = par
2978 elif cls == "taxonomic": 2978 ↛ 2979line 2978 didn't jump to line 2979 because the condition on line 2978 was never true
2979 if tr.get("taxonomic"):
2980 wxr.wtp.debug(
2981 'more than one value in "taxonomic": {} vs. {}'.format(
2982 tr["taxonomic"], par
2983 ),
2984 sortid="form_descriptions/2019",
2985 )
2986 if re.match(r"×[A-Z]", par):
2987 data_append(tr, "tags", "extinct")
2988 par = par[1:]
2989 tr["taxonomic"] = par
2990 elif cls == "other": 2990 ↛ 3000line 2990 didn't jump to line 3000 because the condition on line 2990 was always true
2991 if tr.get("alt"): 2991 ↛ 2992line 2991 didn't jump to line 2992 because the condition on line 2991 was never true
2992 wxr.wtp.debug(
2993 'more than one value in "alt": {} vs. {}'.format(
2994 tr["alt"], par
2995 ),
2996 sortid="form_descriptions/2028",
2997 )
2998 tr["alt"] = par
2999 else:
3000 wxr.wtp.debug(
3001 "parse_translation_desc unimplemented cls {}: {}".format(
3002 cls, par
3003 ),
3004 sortid="form_descriptions/2033",
3005 )
3007 # Check for gender indications in suffix
3008 text, final_tags = parse_head_final_tags(wxr, lang, text)
3009 data_extend(tr, "tags", final_tags)
3011 # Restore those parts that we did not want to remove (they are often
3012 # optional words or words that are always used with the given translation)
3013 text = restore_beginning + text + restore_end
3015 if note:
3016 tr["note"] = note.strip()
3017 if text and text not in ignored_translations:
3018 tr["word"] = text.strip()
3020 # Sometimes gender seems to be at the end of "roman" field, see e.g.
3021 # fire/English/Noun/Translations/Egyptian (for "oxidation reaction")
3022 roman = tr.get("roman")
3023 if roman:
3024 if roman.endswith(" f"): 3024 ↛ 3025line 3024 didn't jump to line 3025 because the condition on line 3024 was never true
3025 data_append(tr, "tags", "feminine")
3026 tr["roman"] = roman[:-2].strip()
3027 elif roman.endswith(" m"): 3027 ↛ 3028line 3027 didn't jump to line 3028 because the condition on line 3027 was never true
3028 data_append(tr, "tags", "masculine")
3029 tr["roman"] = roman[:-2].strip()
3031 # If the word now has "english" field but no "roman" field, and
3032 # the word would be classified "other" (generally non-latin
3033 # characters), and the value in "english" is only one lowercase
3034 # word, move it to "roman". This happens semi-frequently when the
3035 # translation is transliterated the same as some English word.
3036 roman = tr.get("roman")
3037 english = tr.get("english")
3038 if english and not roman and "word" in tr:
3039 cls = classify_desc(tr["word"])
3040 if cls == "other" and " " not in english and english[0].islower():
3041 del tr["english"]
3042 tr["roman"] = english
3044 # If the entry now has both tr["roman"] and tr["word"] and they have
3045 # the same value, delete tr["roman"] (e.g., man/English/Translations
3046 # Evenki)
3047 if tr.get("word") and tr.get("roman") == tr.get("word"): 3047 ↛ 3048line 3047 didn't jump to line 3048 because the condition on line 3047 was never true
3048 del tr["roman"]
3051def parse_alt_or_inflection_of(
3052 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
3053) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
3054 """Tries to parse an inflection-of or alt-of description. If successful,
3055 this returns (tags, list of alt-of/inflection-of dicts). If the description cannot
3056 be parsed, this returns None. This may also return (tags, None) when the
3057 gloss describes a form (or some other tags were extracted from it), but
3058 there was no alt-of/form-of/synonym-of word."""
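# Illustrative sketch (assumed example): a gloss like
#     "genitive singular of talo"
# would typically yield something like
#     (["form-of", "genitive", "singular"], [{"word": "talo"}])
# while a gloss that is not a form description returns None; the exact tags
# depend on decode_tags() and the form_of/alt_of tag tables.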
3059 # print("parse_alt_or_inflection_of: {!r}".format(gloss))
3060 # Occasionally inflection_of/alt_of have "A(n) " etc. at the beginning.
3062 # Never interpret a gloss that is equal to the word itself as a tag
3063 # (e.g., instrumental/Romanian, instrumental/Spanish).
3064 if gloss.lower() == wxr.wtp.title.lower() or ( # type:ignore[union-attr]
3065 len(gloss) >= 5 and distw([gloss.lower()], wxr.wtp.title.lower()) < 0.2 # type:ignore[union-attr]
3066 ):
3067 return None
3069 # First try parsing it as-is
3070 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
3071 if parsed is not None:
3072 return parsed
3074 # Next try parsing it with the first character converted to lowercase if
3075 # it was previously uppercase.
3076 if gloss and gloss[0].isupper():
3077 gloss = gloss[0].lower() + gloss[1:]
3078 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
3079 if parsed is not None:
3080 return parsed
3082 return None
3085# These tags are not allowed in alt-or-inflection-of parsing
3086alt_infl_disallowed: set[str] = set(
3087 [
3088 "error-unknown-tag",
3089 "place", # Not in inflected forms and causes problems e.g. house/English
3090 ]
3091)
3094def parse_alt_or_inflection_of1(
3095 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
3096) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
3097 """Helper function for parse_alt_or_inflection_of. This handles a single
3098 capitalization."""
3099 if not gloss or not gloss.strip(): 3099 ↛ 3100line 3099 didn't jump to line 3100 because the condition on line 3099 was never true
3100 return None
3102 # Prevent some common errors where we would parse something we shouldn't
3103 if re.search(r"(?i)form of address ", gloss): 3103 ↛ 3104line 3103 didn't jump to line 3104 because the condition on line 3103 was never true
3104 return None
3106 gloss = re.sub(r"only used in [^,]+, ", "", gloss)
3108 # First try all formats ending with "of" (or other known last words that
3109 # can end a form description)
3110 matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss))
3111 m: Optional[re.Match]
3112 for m in reversed(matches):
3113 desc = gloss[: m.end()].strip()
3114 base = gloss[m.end() :].strip()
3115 tagsets, topics = decode_tags(desc, no_unknown_starts=True)
3116 if not topics and any(
3117 not (alt_infl_disallowed & set(ts)) for ts in tagsets
3118 ):
3119 # Successfully parsed, including "of" etc.
3120 tags: list[str] = []
3121 # If you have ("Western-Armenian", ..., "form-of") as your
3122 # tag set, it's most probable that it's something like
3123 # "Western Armenian form of խոսել (xosel)", which should
3124 # get "alt-of" instead of "form-of" (inflection).
3125 # խօսիլ/Armenian
3126 for ts_t in tagsets:
3127 if "form-of" in ts_t and any(
3128 valid_tags.get(tk) == "dialect" for tk in ts_t
3129 ):
3130 ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"}
3131 else:
3132 ts_s = set(ts_t)
3133 if not (alt_infl_disallowed & ts_s): 3133 ↛ 3126line 3133 didn't jump to line 3126 because the condition on line 3133 was always true
3134 tags.extend(ts_s)
3135 if (
3136 "alt-of" in tags
3137 or "form-of" in tags
3138 or "synonym-of" in tags
3139 or "compound-of" in tags
3140 ):
3141 break
3142 if m.group(1) == "of":
3143 # Try parsing without the final "of". This is commonly used in
3144 # various form-of expressions.
3145 desc = gloss[: m.start()]
3146 base = gloss[m.end() :]
3147 tagsets, topics = decode_tags(desc, no_unknown_starts=True)
3148 # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}"
3149 # .format(desc, base, tagsets, topics))
3150 if not topics and any(
3151 not (alt_infl_disallowed & set(t)) for t in tagsets
3152 ):
3153 tags = []
3154 for t in tagsets:
3155 if not (alt_infl_disallowed & set(t)): 3155 ↛ 3154line 3155 didn't jump to line 3154 because the condition on line 3155 was always true
3156 tags.extend(t)
3157 # It must have at least one tag from form_of_tags
3158 if set(tags) & form_of_tags:
3159 # Accept this as form-of
3160 tags.append("form-of")
3161 break
3162 if set(tags) & alt_of_tags:
3163 # Accept this as alt-of
3164 tags.append("alt-of")
3165 break
3167 else:
3168 # Did not find a form description based on last word; see if the
3169 # whole description is tags
3170 tagsets, topics = decode_tags(gloss, no_unknown_starts=True)
3171 if not topics and any(
3172 not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts)
3173 for ts in tagsets
3174 ):
3175 tags = []
3176 for ts in tagsets:
3177 if not (alt_infl_disallowed & set(ts)) and form_of_tags & set( 3177 ↛ 3176line 3177 didn't jump to line 3176 because the condition on line 3177 was always true
3178 ts
3179 ):
3180 tags.extend(ts)
3181 base = ""
3182 else:
3183 return None
3185 # kludge for Spanish (again): 'x of [word] combined with [clitic]'
3186 m = re.search(r"combined with \w+$", base)
3187 if m: 3187 ↛ 3188line 3187 didn't jump to line 3188 because the condition on line 3187 was never true
3188 tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True)
3189 if not topics:
3190 for ts in tagsets:
3191 tags.extend(ts)
3192 base = base[: m.start()]
3194 # It is fairly common for form_of glosses to end with something like
3195 # "ablative case" or "in instructive case". Parse that ending.
3196 base = base.strip()
3197 lst = base.split()
3198 # print("parse_alt_or_inflection_of: lst={}".format(lst))
3199 if len(lst) >= 3 and lst[-1] in ("case", "case."): 3199 ↛ 3200line 3199 didn't jump to line 3200 because the condition on line 3199 was never true
3200 node = valid_sequences.children.get(lst[-2])
3201 if node and node.end:
3202 for s in node.tags:
3203 tags.extend(s.split(" "))
3204 lst = lst[:-2]
3205 if lst[-1] == "in" and len(lst) > 1:
3206 lst = lst[:-1]
3208 # Eliminate empty and duplicate tags
3209 tags = list(sorted(set(t for t in tags if t)))
3211 # Clean up some extra stuff from the linked word, separating the text
3212 # into ``base`` (the linked word) and ``extra`` (additional information,
3213 # such as English translation or clarifying word sense information).
3214 orig_base = base
3215 base = re.sub(alt_of_form_of_clean_re, "", orig_base)
3216 base = re.sub(r" [(⟨][^()]*[)⟩]", "", base) # Remove all (...) groups
3217 extra = orig_base[len(base) :]
3218 extra = re.sub(r"^[- :;.,,—]+", "", extra)
3219 if extra.endswith(".") and extra.count(".") == 1:
3220 extra = extra[:-1].strip()
3221 m = re.match(r"^\(([^()]*)\)$", extra)
3222 if m: 3222 ↛ 3223line 3222 didn't jump to line 3223 because the condition on line 3222 was never true
3223 extra = m.group(1)
3224 else:
3225 # These weird brackets are used in "slash mark"
3226 m = re.match(r"^⟨([^()]*)⟩$", extra)
3227 if m: 3227 ↛ 3228line 3227 didn't jump to line 3228 because the condition on line 3227 was never true
3228 extra = m.group(1)
3229 m = re.match(r'^[“"]([^"“”]*)["”]$', extra)
3230 if m: 3230 ↛ 3231line 3230 didn't jump to line 3231 because the condition on line 3230 was never true
3231 extra = m.group(1)
3232 # Note: base might still contain comma-separated values and values
3233 # separated by "and"
3234 base = base.strip()
3235 if base.endswith(",") and len(base) > 2: 3235 ↛ 3236line 3235 didn't jump to line 3236 because the condition on line 3235 was never true
3236 base = base[:-1].strip()
3237 while (
3238 base.endswith(".")
3239 and not wxr.wtp.page_exists(base)
3240 and base not in gloss_template_args
3241 ):
3242 base = base[:-1].strip()
3243 if base.endswith('(\u201cconjecture")'): 3243 ↛ 3244line 3243 didn't jump to line 3244 because the condition on line 3243 was never true
3244 base = base[:-14].strip()
3245 tags.append("conjecture")
3246 while ( 3246 ↛ 3251line 3246 didn't jump to line 3251 because the condition on line 3246 was never true
3247 base.endswith(".")
3248 and not wxr.wtp.page_exists(base)
3249 and base not in gloss_template_args
3250 ):
3251 base = base[:-1].strip()
3252 if ( 3252 ↛ 3257line 3252 didn't jump to line 3257 because the condition on line 3252 was never true
3253 base.endswith(".")
3254 and base not in gloss_template_args
3255 and base[:-1] in gloss_template_args
3256 ):
3257 base = base[:-1]
3258 base = base.strip()
3259 if not base:
3260 return tags, None
3262 # Kludge: Spanish verb forms seem to have a dot added at the end.
3263 # Remove it; we know of no Spanish verbs ending with a dot.
3264 language = wxr.wtp.section
3265 pos = wxr.wtp.subsection
3266 # print("language={} pos={} base={}".format(language, pos, base))
3267 if ( 3267 ↛ 3273line 3267 didn't jump to line 3273 because the condition on line 3267 was never true
3268 base.endswith(".")
3269 and len(base) > 1
3270 and base[-2].isalpha()
3271 and (language == "Spanish" and pos == "Verb")
3272 ):
3273 base = base[:-1]
3275 # Split base to alternatives when multiple alternatives provided
3276 parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "])
3277 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "")
3278 if (
3279 len(parts) <= 1
3280 or base.startswith("/")
3281 or base.endswith("/")
3282 or "/" in titleword
3283 ):
3284 parts = [base]
3285 # Split base to alternatives when of form "a or b" and "a" and "b" are
3286 # similar (generally spelling variants of the same word or similar words)
3287 if len(parts) == 1:
3288 pp = base.split()
3289 if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4:
3290 parts = [pp[0], pp[2]]
3292 # Create form-of/alt-of entries based on the extracted data
3293 dt_lst: list[AltOf] = []
3294 for p in parts:
3295 # Check for some suspicious base forms
3296 m = re.search(r"[.,] |[{}()]", p)
3297 if m and not wxr.wtp.page_exists(p): 3297 ↛ 3298line 3297 didn't jump to line 3298 because the condition on line 3297 was never true
3298 wxr.wtp.debug(
3299 "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p),
3300 sortid="form_descriptions/2278",
3301 )
3302 if p.startswith("*") and len(p) >= 3 and p[1].isalpha(): 3302 ↛ 3303line 3302 didn't jump to line 3303 because the condition on line 3302 was never true
3303 p = p[1:]
3304 dt: AltOf = {"word": p}
3305 if extra:
3306 dt["extra"] = extra
3307 dt_lst.append(dt)
3308 # print("alt_or_infl_of returning tags={} lst={} base={!r}"
3309 # .format(tags, lst, base))
3310 return tags, dt_lst
3313@functools.lru_cache(maxsize=65536)
3314def classify_desc(
3315 desc: str,
3316 allow_unknown_tags=False,
3317 no_unknown_starts=False,
3318 accepted: Union[tuple[str, ...], frozenset[str]] = tuple(),
3319) -> str:
3320 """Determines whether the given description is most likely tags, english,
3321 a romanization, or something else. Returns one of: "tags", "english",
3322 "romanization", or "other". If ``allow_unknown_tags`` is True, then
3323 allow "tags" classification even when the only tags are those starting
3324 with a word in allowed_unknown_starts."""
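# Illustrative sketch (assumed examples): classify_desc("archaic") would
# normally return "tags", classify_desc("a small dog") "english",
# classify_desc("slovo") "romanization", and classify_desc("слово") "other";
# "taxonomic" is returned for known species names. Outcomes depend on the
# tag tables and word lists, so these are indicative only.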
3325 assert isinstance(desc, str)
3326 # Empty and whitespace-only strings are treated as "other"
3327 desc = desc.strip()
3328 if not desc:
3329 return "other"
3331 normalized_desc = unicodedata.normalize("NFKD", desc)
3333 # If it can be fully decoded as tags without errors, treat as tags
3334 tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts)
3335 for tagset in tagsets:
3336 assert isinstance(tagset, (list, tuple, set))
3337 if "error-unknown-tag" not in tagset and (
3338 topics or allow_unknown_tags or any(" " not in x for x in tagset)
3339 ):
3340 return "tags"
3342 # Check if it looks like the taxonomic name of a species
3343 if desc in known_species:
3344 return "taxonomic"
3345 desc1 = re.sub(r"^×([A-Z])", r"\1", desc)
3346 desc1 = re.sub(r"\s*×.*", "", desc1)
3347 lst = desc1.split()
3348 if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts:
3349 have_non_english = 1 if lst[0].lower() not in english_words else 0
3350 for x in lst[1:]:
3351 if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"):
3352 continue
3353 if x[0].isupper():
3354 break
3355 if x not in english_words:
3356 have_non_english += 1
3357 else:
3358 # Starts with known taxonomic term, does not contain uppercase
3359 # words (except allowed letters) and at least one word is not
3360 # English
3361 if have_non_english >= len(lst) - 1 and have_non_english > 0: 3361 ↛ 3367line 3361 didn't jump to line 3367 because the condition on line 3361 was always true
3362 return "taxonomic"
3364 # If all words are in our English dictionary, interpret as English.
3365 # [ -~] is regex black magic, "ALL CHARACTERS from space to tilde"
3366 # in ASCII. Took me a while to figure out.
3367 if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1:
3368 if desc in english_words and desc[0].isalpha():
3369 return "english" # Handles ones containing whitespace
3370 desc1 = re.sub(
3371 tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc
3372 )
3373 tokens = tokenizer.tokenize(desc1)
3374 if not tokens: 3374 ↛ 3375line 3374 didn't jump to line 3375 because the condition on line 3374 was never true
3375 return "other"
3376 lst_bool = list(
3377 x not in not_english_words
3378 and
3379 # not x.isdigit() and
3380 (
3381 x in english_words
3382 or x.lower() in english_words
3383 or x in known_firsts
3384 or x[0].isdigit()
3385 or x in accepted
3386 or
3387 # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or
3388 (
3389 x.endswith("s") and len(x) >= 4 and x[:-1] in english_words
3390 ) # Plural
3391 or (
3392 x.endswith("ies")
3393 and len(x) >= 5
3394 and x[:-3] + "y" in english_words
3395 ) # E.g. lily - lilies
3396 or (
3397 x.endswith("ing")
3398 and len(x) >= 5
3399 and x[:-3] in english_words
3400 ) # E.g. bring - bringing
3401 or (
3402 x.endswith("ing")
3403 and len(x) >= 5
3404 and x[:-3] + "e" in english_words
3405 ) # E.g., tone - toning
3406 or (
3407 x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words
3408 ) # E.g. hang - hanged
3409 or (
3410 x.endswith("ed")
3411 and len(x) >= 5
3412 and x[:-2] + "e" in english_words
3413 ) # E.g. atone - atoned
3414 or (x.endswith("'s") and x[:-2] in english_words)
3415 or (x.endswith("s'") and x[:-2] in english_words)
3416 or (
3417 x.endswith("ise")
3418 and len(x) >= 5
3419 and x[:-3] + "ize" in english_words
3420 )
3421 or (
3422 x.endswith("ised")
3423 and len(x) >= 6
3424 and x[:-4] + "ized" in english_words
3425 )
3426 or (
3427 x.endswith("ising")
3428 and len(x) >= 7
3429 and x[:-5] + "izing" in english_words
3430 )
3431 or (
3432 re.search(r"[-/]", x)
3433 and all(
3434 ((y in english_words and len(y) > 2) or not y)
3435 for y in re.split(r"[-/]", x)
3436 )
3437 )
3438 )
3439 for x in tokens
3440 )
3441 cnt = lst_bool.count(True)
3442 rejected_words = tuple(
3443 x for i, x in enumerate(tokens) if not lst_bool[i]
3444 )
3445 if (
3446 any(
3447 lst_bool[i] and x[0].isalpha() and len(x) > 1
3448 for i, x in enumerate(tokens)
3449 )
3450 and not desc.startswith("-")
3451 and not desc.endswith("-")
3452 and re.search(r"\w+", desc)
3453 and (
3454 cnt == len(lst_bool)
3455 or (
3456 any(
3457 lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens)
3458 )
3459 and cnt >= len(lst_bool) - 1
3460 )
3461 or cnt / len(lst_bool) >= 0.8
3462 or (
3463 all(x in potentially_english_words for x in rejected_words)
3464 and cnt / len(lst_bool) >= 0.50
3465 )
3466 )
3467 ):
3468 return "english"
3469 # Some translations have apparent pronunciation descriptions in /.../
3470 # which we'll put in the romanization field (even though they probably are
3471 # not exactly romanizations).
3472 if desc.startswith("/") and desc.endswith("/"):
3473 return "romanization"
3474 # If all characters are in classes that could occur in romanizations,
3475 # treat as romanization
3476 classes = list(
3477 unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK"
3478 for x in normalized_desc
3479 )
3480 classes1 = []
3481 num_latin = 0
3482 num_greek = 0
3483 # part = ""
3484 # for ch, cl in zip(normalized_desc, classes):
3485 # part += f"{ch}({cl})"
3486 # print(part)
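# Walk over the characters: punctuation commonly seen in transcriptions is
# remapped to "OK"; characters that are not upper- or lowercase letters keep
# their Unicode category; for letters, Latin and Greek are counted separately,
# and letters of non-Latin scripts (or characters without a Unicode name) are
# remapped to "NO" so the final check below rejects the string.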
3487 for ch, cl in zip(normalized_desc, classes):
3488 if ch in (
3489 "'", # ' in Arabic, / in IPA-like parenthesized forms
3490 ".", # e.g., "..." in translations
3491 ";",
3492 ":",
3493 "!",
3494 "‘",
3495 "’",
3496 '"',
3497 "“",
3498 "”",
3499 "/",
3500 "?",
3501 "…", # alternative to "..."
3502 "⁉", # 見る/Japanese automatic transcriptions...
3503 "?",
3504 "!",
3505 "⁻", # superscript -, used in some Cantonese roman, e.g. "we"
3506 "ʔ",
3507 "ʼ",
3508 "ʾ",
3509 "ʹ",
3510 ): # ʹ occurs e.g. in Russian translations of understand/English/verb
3511 classes1.append("OK")
3512 continue
3513 if cl not in ("Ll", "Lu"):
3514 classes1.append(cl)
3515 continue
3516 try:
3517 name = unicodedata.name(ch)
3518 first = name.split()[0]
3519 if first == "LATIN":
3520 num_latin += 1
3521 elif first == "GREEK":
3522 num_greek += 1
3523 elif first == "COMBINING": # Combining diacritic 3523 ↛ 3524line 3523 didn't jump to line 3524 because the condition on line 3523 was never true
3524 cl = "OK"
3525 elif re.match(non_latin_scripts_re, name): 3525 ↛ 3529 (line 3525 didn't jump to line 3529 because the condition on line 3525 was always true)
3526 cl = "NO" # Not acceptable in romanizations
3527 except ValueError:
3528 cl = "NO" # Not acceptable in romanizations
3529 classes1.append(cl)
3530 # print("classify_desc: {!r} classes1: {}".format(desc, classes1))
3531 # print(set(classes1) )
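# Treat the string as a romanization only if every character class can occur
# in a romanization, there are either no Greek letters or at least two more
# Latin than Greek letters, and the string is neither all punctuation-like
# "OK" characters nor all digits.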
3532 if all(
3533 x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK")
3534 for x in classes1
3535 ):
3536 if (
3537 (num_latin >= num_greek + 2 or num_greek == 0)
3538 and classes1.count("OK") < len(classes1)
3539 and classes1.count("Nd") < len(classes1)
3540 ):
3541 return "romanization"
3542 # Otherwise it is something else, such as hanji version of the word
3543 return "other"
3546def remove_text_in_parentheses(text: str) -> str:
3547 parentheses = 0
3548 new_text = ""
3549 for c in text:
3550 if c == "(":
3551 parentheses += 1
3552 elif c == ")":
3553 parentheses -= 1
3554 elif parentheses == 0:
3555 new_text += c
3556 return new_text
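# Worked examples for the helper above, traced by hand from the loop (the
# extra spaces are the characters that surrounded the removed spans):
#     remove_text_in_parentheses("word (obsolete) form")  ->  "word  form"
#     remove_text_in_parentheses("(a) or (b)")             ->  " or "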