Coverage for src/wiktextract/extractor/en/form_descriptions.py: 70%
1321 statements
coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1# Code for parsing linguistic form descriptions and tags for word senses
2# (both the word entry head - initial part and parenthesized parts -
3# and tags at the beginning of word senses)
4#
5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
7import functools
8import re
9import unicodedata
10from typing import (
11 Any,
12 Literal,
13 Optional,
14 Sequence,
15 Union,
16)
18import Levenshtein
19from nltk import TweetTokenizer # type:ignore[import-untyped]
21from ...datautils import data_append, data_extend, split_at_comma_semi
22from ...tags import (
23 alt_of_tags,
24 form_of_tags,
25 head_final_bantu_langs,
26 head_final_bantu_map,
27 head_final_numeric_langs,
28 head_final_other_langs,
29 head_final_other_map,
30 head_final_semitic_langs,
31 head_final_semitic_map,
32 uppercase_tags,
33 valid_tags,
34 xlat_descs_map,
35 xlat_head_map,
36 xlat_tags_map,
37)
38from ...topics import topic_generalize_map, valid_topics
39from ...wxr_context import WiktextractContext
40from .english_words import (
41 english_words,
42 not_english_words,
43 potentially_english_words,
44)
45from .form_descriptions_known_firsts import known_firsts
46from .taxondata import known_species
47from .type_utils import (
48 AltOf,
49 FormData,
50 LinkageData,
51 SenseData,
52 SoundData,
53 TranslationData,
54 WordData,
55)
57# Tokenizer for classify_desc()
58tokenizer = TweetTokenizer()
60# These are ignored as the value of a related form in form head.
61IGNORED_RELATED: set[str] = set(
62 [
63 "-",
64 "־",
65 "᠆",
66 "‐",
67 "‑",
68 "‒",
69 "–",
70 "—",
71 "―",
72 "−",
73 "⸺",
74 "⸻",
75 "﹘",
76 "﹣",
77 "-",
78 "?",
79 "(none)",
80 ]
81)
84# First words of unicodedata.name() that indicate scripts that cannot be
85# accepted in romanizations or english (i.e., should be considered "other"
86# in classify_desc()).
87non_latin_scripts: list[str] = [
88 "ADLAM",
89 "ARABIC",
90 "ARABIC-INDIC",
91 "ARMENIAN",
92 "BALINESE",
93 "BENGALI",
94 "BRAHMI",
95 "BRAILLE",
96 "CANADIAN",
97 "CHAKMA",
98 "CHAM",
99 "CHEROKEE",
100 "CJK",
101 "COPTIC",
102 "COUNTING ROD",
103 "CUNEIFORM",
104 "CYRILLIC",
105 "DOUBLE-STRUCK",
106 "EGYPTIAN",
107 "ETHIOPIC",
108 "EXTENDED ARABIC-INDIC",
109 "GEORGIAN",
110 "GLAGOLITIC",
111 "GOTHIC",
112 "GREEK",
113 "GUJARATI",
114 "GURMUKHI",
115 "HANGUL",
116 "HANIFI ROHINGYA",
117 "HEBREW",
118 "HIRAGANA",
119 "JAVANESE",
120 "KANNADA",
121 "KATAKANA",
122 "KAYAH LI",
123 "KHMER",
124 "KHUDAWADI",
125 "LAO",
126 "LEPCHA",
127 "LIMBU",
128 "MALAYALAM",
129 "MEETEI",
130 "MYANMAR",
131 "NEW TAI LUE",
132 "NKO",
133 "OL CHIKI",
134 "OLD PERSIAN",
135 "OLD SOUTH ARABIAN",
136 "ORIYA",
137 "OSMANYA",
138 "PHOENICIAN",
139 "SAURASHTRA",
140 "SHARADA",
141 "SINHALA",
142 "SUNDANESE",
143 "SYLOTI",
144 "TAI THAM",
145 "TAKRI",
146 "TAMIL",
147 "TELUGU",
148 "THAANA",
149 "THAI",
150 "TIBETAN",
151 "TIFINAGH",
152 "TIRHUTA",
153 "UGARITIC",
154 "WARANG CITI",
155 "YI",
156]
157non_latin_scripts_re = re.compile(
158 r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b"
159)
161# Sanity check xlat_head_map values
162for k, v in xlat_head_map.items():
163 if v.startswith("?"):
164 v = v[1:]
165 for tag in v.split():
166        if tag not in valid_tags:
167 print(
168 "WARNING: xlat_head_map[{}] contains unrecognized tag {}".format(
169 k, tag
170 )
171 )
173# Regexp for finding nested translations from translation items (these are
174# used in, e.g., year/English/Translations/Arabic). This is actually used
175# in page.py.
176nested_translations_re = re.compile(
177 r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format(
178 "|".join(
179 re.escape(x.removeprefix("?"))
180 for x in sorted(xlat_head_map.values(), key=len, reverse=True)
181 if x and not x.startswith("class-")
182 )
183 )
184)
186# Regexp that matches head tag specifiers. Used to match tags from end of
187# translations and linkages
188head_final_re_text = r"( -)?( ({}))+".format(
189 "|".join(
190 re.escape(x)
191 for x in
192 # The sort is to put longer ones first, preferring them in
193 # the regexp match
194 sorted(xlat_head_map.keys(), key=len, reverse=True)
195 )
196)
197head_final_re = re.compile(head_final_re_text + "$")
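# A hedged illustration (not from the source): because head_final_re is
# anchored at the end of the form, a head such as "chat m" would match the
# trailing " m" (assuming "m" is a key of xlat_head_map, as the conventional
# Wiktionary gender codes are), and parse_head_final_tags() below then maps
# that key to its tags while stripping it from the form.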
199# Regexp used to match head tag specifiers at end of a form for certain
200# Bantu languages (particularly Swahili and similar languages).
201head_final_bantu_re_text = r" ({})".format(
202 "|".join(re.escape(x) for x in head_final_bantu_map.keys())
203)
204head_final_bantu_re = re.compile(head_final_bantu_re_text + "$")
206# Regexp used to match head tag specifiers at end of a form for certain
207# Semitic languages (particularly Arabic and similar languages).
208head_final_semitic_re_text = r" ({})".format(
209 "|".join(re.escape(x) for x in head_final_semitic_map.keys())
210)
211head_final_semitic_re = re.compile(head_final_semitic_re_text + "$")
213# Regexp used to match head tag specifiers at end of a form for certain
214# other languages (e.g., Lithuanian, Finnish, French).
215head_final_other_re_text = r" ({})".format(
216 "|".join(re.escape(x) for x in head_final_other_map.keys())
217)
218head_final_other_re = re.compile(head_final_other_re_text + "$")
220# Regexp for splitting heads. See parse_word_head().
221head_split_re_text = (
222 "("
223 + head_final_re_text
224 + "|"
225 + head_final_bantu_re_text
226 + "|"
227 + head_final_semitic_re_text
228 + "|"
229 + head_final_other_re_text
230 + ")?( or |[,;]+)"
231)
232head_split_re = re.compile(head_split_re_text)
233head_split_re_parens = 0
234for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text):
235 head_split_re_parens += m.group(0).count("(")
237# Parenthesized parts that are ignored in translations
238tr_ignored_parens: set[str] = set(
239 [
240 "please verify",
241 "(please verify)",
242 "transliteration needed",
243 "(transliteration needed)",
244 "in words with back vowel harmony",
245 "(in words with back vowel harmony)",
246 "in words with front vowel harmony",
247 "(in words with front vowel harmony)",
248 "see below",
249 "see usage notes below",
250 ]
251)
252tr_ignored_parens_re = re.compile(
253 r"^("
254 + "|".join(re.escape(x) for x in tr_ignored_parens)
255 + ")$"
256 + r"|^(Can we clean up|Can we verify|for other meanings see "
257 r"lit\. )"
258)
260# Translations that are ignored
261ignored_translations: set[str] = set(
262 [
263 "[script needed]",
264 "please add this translation if you can",
265 ]
266)
268# Put english text into the "note" field in a translation if it contains one
269# of these words
270tr_note_re = re.compile(
271 r"(\b(article|definite|indefinite|superlative|comparative|pattern|"
272 r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|"
273 r"postposition|postp|action|actions|articles|"
274 r"adverb|adverbs|noun|nouns|verb|verbs|before|"
275 r"after|placed|prefix|suffix|used with|translated|"
276 r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|"
277 r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|"
278 r"conjugation|declension|class|category|plural|singular|positive|"
279 r"seldom used|formal|informal|familiar|unspoken|spoken|written|"
280 r"indicative|progressive|conditional|potential|"
281 r"accusative|adessive|inessive|superessive|elative|allative|"
282 r"dialect|dialects|object|subject|predicate|movies|recommended|language|"
283 r"locative|continuous|simple|continuousness|gerund|subjunctive|"
284 r"periphrastically|no equivalent|not used|not always used|"
285 r"used only with|not applicable|use the|signifying|wordplay|pronounced|"
286 r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|"
287 r"may be replaced|stricter sense|for nonhumans|"
288 r"sense:|used:|in full:|informally used|followed by|"
289 r"not restricted to|pertaining to|or optionally with|are optional|"
290 r"in conjunction with|in compounds|depending on the relationship|"
291 r"person addressed|one person|multiple persons|may be replaced with|"
292 r"optionally completed with|in the phrase|in response to|"
293 r"before a|before an|preceded by|verbs ending|very common|after a verb|"
294 r"with verb|with uncountable|with the objects|with stative|"
295 r"can be replaced by|often after|used before|used after|"
296 r"used in|clipping of|spoken|somewhat|capitalized|"
297 r"short form|shortening of|shortened form|initialism of|"
298 r"said to|rare:|rarer also|is rarer|negatively connoted|"
299 r"previously mentioned|uncountable noun|countable noun|"
300 r"countable nouns|uncountable nouns|"
301 r"with predicative|with -|with imperfect|with a negated|"
302 r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|"
303 r'"|'
304 r"general term|after a vowel|before a vowel|"
305 r"form|regular|irregular|alternative)"
306 r")($|[) ])|^("
307 # Following are only matched at the beginning of the string
308 r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:|e\.g\.,|cf\.|compare|such as|"
309 r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|"
310 r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|"
311 r"mainly|from|for|also|also:|acronym|"
312 r"\+|with) "
313)
314# \b does not work at the end???
316# Related forms matching this regexp will be considered suspicious if the
317# page title does not also match one of these.
318suspicious_related_re = re.compile(
319 r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)"
320 r"|[][:=<>&#*|]"
321 r"| \d+$"
322)
324# Word forms (head forms, translations, etc) that will be considered ok and
325# silently accepted even if they would otherwise trigger a suspicious
326# form warning.
327ok_suspicious_forms: set[str] = set(
328 [
329 "but en or", # "golden goal"/English/Tr/French
330 "cœur en or", # "heart of gold"/Eng/Tr/French
331 "en or", # golden/Eng/Tr/French
332 "men du", # jet/Etym2/Noun/Tr/Cornish
333 "parachute en or", # "golden parachute"/Eng/Tr/French
334 "vieil or", # "old gold"/Eng/Tr/French
335 # "all that glitters is not gold"/Eng/Tr/French
336 "tout ce qui brille n’est pas or",
337 "μη αποκλειστικό or", # inclusive or/Eng/Tr/Greek
338 "period or full stop",
339 ]
340)
343# Replacements to be done in classify_desc before tokenizing. This is a
344# workaround for shortcomings in TweetTokenizer.
345tokenizer_fixup_map = {
346 r"a.m.": "AM",
347 r"p.m.": "PM",
348}
349tokenizer_fixup_re = re.compile(
350 r"\b("
351 + "|".join(
352 re.escape(x)
353 for x in sorted(
354 tokenizer_fixup_map.keys(), key=lambda x: len(x), reverse=True
355 )
356 )
357 + r")"
358)
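# A small sketch of the fixup described above (the exact call form inside
# classify_desc() is assumed here, not shown in this excerpt):
#   re.sub(tokenizer_fixup_re,
#          lambda m: tokenizer_fixup_map[m.group(0)],
#          "at 9 a.m. or 5 p.m.")
#   -> "at 9 AM or 5 PM"
# which keeps TweetTokenizer from mishandling these abbreviations.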
360# Unknown tags starting with these words will be silently ignored.
361ignored_unknown_starts: set[str] = set(
362 [
363 "originally",
364 "e.g.",
365 "c.f.",
366 "supplanted by",
367 "supplied by",
368 ]
369)
371ignored_unknown_starts_re = re.compile(
372 r"^("
373 + "|".join(
374 re.escape(x)
375 for x in sorted(ignored_unknown_starts, key=lambda x: -len(x))
376 )
377 + ") "
378)
380# If an unknown sequence starts with one of these, it will continue as an
381# unknown sequence until the end, unless it turns out to have a replacement.
382allowed_unknown_starts: set[str] = set(
383 [
384 "Relating",
385 "accompanied",
386 "added",
387 "after",
388 "answering",
389 "as",
390 "based",
391 "before",
392 "conjugated",
393 "conjunction",
394 "construed",
395 "especially",
396 "expression:",
397 "figurative:",
398 "followed",
399 "for",
400 "forms",
401 "from",
402 "governs",
403 "in",
404 "indicating",
405 "modifying",
406 "normally",
407 "not",
408 "of",
409 "preceding",
410 "prefixed",
411 "referring",
412 "relating",
413 "revived",
414 "said",
415 "since",
416 "takes",
417 "used",
418 "with",
419 "With",
420 "without",
421 ]
422)
423# Allow the ignored unknown starts without complaining
424allowed_unknown_starts.update(ignored_unknown_starts)
426# Full unknown tags that will be ignored in decode_tags()
427# XXX this is unused, ask Tatu where the contents are now
428ignored_unknown_tags: set[str] = set([])
430# Head endings that are mapped to tags
431head_end_map = {
432 " 1st conj.": "conjugation-1",
433 " 2nd conj.": "conjugation-2",
434 " 3rd conj.": "conjugation-3",
435 " 4th conj.": "conjugation-4",
436 " 5th conj.": "conjugation-5",
437 " 6th conj.": "conjugation-6",
438 " 7th conj.": "conjugation-7",
439}
440head_end_re = re.compile(
441 r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$"
442)
445# Dictionary of language-specific parenthesized head part starts that
446# either introduce new tags or modify previous tags. The value for each
447# language is a dictionary that maps the first word of the head part to
448# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous
449# tags or a space-separated string of tags to remove, and ``add_tags`` should
450# be a string of tags to add.
451lang_specific_head_map: dict[
452 str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]]
453] = {
454 "Danish": {
455 # prefix: (rem_tags space separate string/True, add_tags s-sep str)
456 "c": ("neuter", "common-gender"),
457 "n": ("common-gender", "neuter"),
458 "pl": ("singular neuter common-gender", "plural"),
459 "sg": ("plural neuter common-gender", "singular"),
460 },
461}
464# Regular expression used to strip additional stuff from the end of alt_of and
465# form_of.
466alt_of_form_of_clean_re = re.compile(
467 r"(?s)("
468 + "|".join(
469 [
470 r":",
471 r'[“"]',
472 r";",
473 r" \(",
474 r" - ",
475 r" ־ ",
476 r" ᠆ ",
477 r" ‐ ",
478 r" ‑ ",
479 r" ‒ ",
480 r" – ",
481 r" — ",
482 r" ― ",
483 r" − ",
484 r" ⸺ ",
485 r" ⸻ ",
486 r" ﹘ ",
487 r" ﹣ ",
488 r" - ",
489 r" \+ ",
490 r" \(with ",
491 r" with -ra/-re",
492 r"\. Used ",
493 r"\. Also ",
494 r"\. Since ",
495 r"\. A ",
496 r"\.\. A ",
497 r"\. An ",
498 r"\.\. An ",
499 r"\. an ",
500 r"\. The ",
501 r"\. Spanish ",
502 r"\. Language ",
503 r"\. former name of ",
504 r"\. AIM",
505 r"\. OT",
506 r"\. Not ",
507 r"\. Now ",
508 r"\. Nowadays ",
509 r"\. Early ",
510 r"\. ASEAN",
511 r"\. UN",
512 r"\. IMF",
513 r"\. WHO",
514 r"\. WIPO",
515 r"\. AC",
516 r"\. DC",
517 r"\. DNA",
518 r"\. RNA",
519 r"\. SOB",
520 r"\. IMO",
521 r"\. Behavior",
522 r"\. Income ",
523 r"\. More ",
524 r"\. Most ",
525 r"\. Only ",
526 r"\. Also ",
527 r"\. From ",
528 r"\. Of ",
529 r"\.\. Of ",
530 r"\. To ",
531 r"\. For ",
532 r"\. If ",
533 r"\. Praenominal ",
534 r"\. This ",
535 r"\. Replaced ",
536 r"\. CHCS is the ",
537 r"\. Equivalent ",
538 r"\. Initialism ",
539 r"\. Note ",
540 r"\. Alternative ",
541 r"\. Compare ",
542 r"\. Cf\. ",
543 r"\. Comparable ",
544 r"\. Involves ",
545 r"\. Sometimes ",
546 r"\. Commonly ",
547 r"\. Often ",
548 r"\. Typically ",
549 r"\. Possibly ",
550 r"\. Although ",
551 r"\. Rare ",
552 r"\. Instead ",
553 r"\. Integrated ",
554 r"\. Distinguished ",
555 r"\. Given ",
556 r"\. Found ",
557 r"\. Was ",
558 r"\. In ",
559 r"\. It ",
560 r"\.\. It ",
561 r"\. One ",
562 r"\. Any ",
563 r"\. They ",
564 r"\. Members ",
565 r"\. Each ",
566 r"\. Original ",
567 r"\. Especially ",
568 r"\. Usually ",
569 r"\. Known ",
570 r"\.\. Known ",
571 r"\. See ",
572 r"\. see ",
573 r"\. target was not ",
574 r"\. Popular ",
575 r"\. Pedantic ",
576 r"\. Positive ",
577 r"\. Society ",
578 r"\. Plan ",
579 r"\. Environmentally ",
580 r"\. Affording ",
581 r"\. Encompasses ",
582 r"\. Expresses ",
583 r"\. Indicates ",
584 r"\. Text ",
585 r"\. Large ",
586 r"\. Sub-sorting ",
587 r"\. Sax",
588 r"\. First-person ",
589 r"\. Second-person ",
590 r"\. Third-person ",
591 r"\. 1st ",
592 r"\. 2nd ",
593 r"\. 3rd ",
594 r"\. Term ",
595 r"\. Northeastern ",
596 r"\. Northwestern ",
597 r"\. Southeast ",
598 r"\. Egyptian ",
599 r"\. English ",
600 r"\. Cape Province was split into ",
601 r"\. Pañcat",
602 r"\. of the ",
603 r"\. is ",
604 r"\. after ",
605 r"\. or ",
606 r"\. chromed",
607 r"\. percussion",
608 r"\. with his ",
609 r"\. a\.k\.a\. ",
610 r"\. comparative form ",
611 r"\. singular ",
612 r"\. plural ",
613 r"\. present ",
614 r"\. his ",
615 r"\. her ",
616 r"\. equivalent ",
617 r"\. measuring ",
618 r"\. used in ",
619 r"\. cutely ",
620 r"\. Protects",
621 r'\. "',
622 r"\.^",
623 r"\. \+ ",
624 r"\., ",
625 r". — ",
626 r", a ",
627 r", an ",
628 r", the ",
629 r", obsolete ",
630 r", possessed", # 'd/English
631 r", imitating", # 1/English
632 r", derived from",
633 r", called ",
634 r", especially ",
635 r", slang for ",
636 r" corresponding to ",
637 r" equivalent to ",
638 r" popularized by ",
639 r" denoting ",
640 r" in its various senses\.",
641 r" used by ",
642 r" but not for ",
643 r" since ",
644 r" i\.e\. ",
645 r" i\. e\. ",
646 r" e\.g\. ",
647 r" eg\. ",
648 r" etc\. ",
649 r"\[http",
650 r" — used as ",
651 r" by K\. Forsyth ",
652 r" by J\. R\. Allen ",
653 r" by S\. Ferguson ",
654 r" by G\. Donaldson ",
655 r" May refer to ",
656 r" An area or region ",
657 ]
658 )
659 + r").*$"
660)
663class ValidNode:
664 """Node in the valid_sequences tree. Each node is part of a chain
665 or chains that form sequences built out of keys in key->tags
666 maps like xlat_tags, etc. The ValidNode's 'word' is the key
667    by which it is referred to in the root dict or a `children` dict,
668 `end` marks that the node is the end-terminus of a sequence (but
669 it can still continue if the sequence is shared by the start of
670 other sequences: "nominative$" and "nominative plural$" for example),
671 `tags` and `topics` are the dicts containing tag and topic strings
672 for terminal nodes (end==True)."""
674 __slots__ = (
675 "end",
676 "tags",
677 "topics",
678 "children",
679 )
681 def __init__(
682 self,
683 end=False,
684 tags: Optional[list[str]] = None,
685 topics: Optional[list[str]] = None,
686 children: Optional[dict[str, "ValidNode"]] = None,
687 ) -> None:
688 self.end = end
689 self.tags: list[str] = tags or []
690 self.topics: list[str] = topics or []
691 self.children: dict[str, "ValidNode"] = children or {}
694def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None:
695 """Helper function for building trees of valid tags/sequences during
696 initialization."""
697 assert isinstance(tree, ValidNode)
698 assert isinstance(desc, str)
699 assert v is None or isinstance(v, str)
700 node = tree
702 # Build the tree structure: each node has children nodes
703 # whose names are denoted by their dict key.
704 for w in desc.split(" "):
705 if w in node.children:
706 node = node.children[w]
707 else:
708 new_node = ValidNode()
709 node.children[w] = new_node
710 node = new_node
711 if not node.end:
712 node.end = True
713 if not v:
714 return None # Terminate early because there are no tags
716 tagslist = []
717 topicslist = []
718 for vv in v.split():
719 if vv in valid_tags:
720 tagslist.append(vv)
721        elif vv in valid_topics:
722 topicslist.append(vv)
723 else:
724 print(
725 "WARNING: tag/topic {!r} maps to unknown {!r}".format(desc, vv)
726 )
727 topics = " ".join(topicslist)
728 tags = " ".join(tagslist)
729 # Changed to "_tags" and "_topics" to avoid possible key-collisions.
730 if topics:
731 node.topics.extend([topics])
732 if tags:
733 node.tags.extend([tags])
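# An illustrative build of the tree (hypothetical call, assuming both words
# are themselves valid tags): add_to_valid_tree(valid_sequences,
# "nominative plural", "nominative plural") creates or reuses the chain
#   valid_sequences.children["nominative"].children["plural"]
# marks the final node with end=True, and appends the joined tag string so
# that node.tags == ["nominative plural"].  Sequences sharing a prefix
# (e.g. plain "nominative") reuse the same intermediate nodes.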
736def add_to_valid_tree1(
737 tree: ValidNode,
738 k: str,
739 v: Union[list[str], tuple[str, ...], str],
740 valid_values: Union[set[str], dict[str, Any]],
741) -> list[str]:
742 assert isinstance(tree, ValidNode)
743 assert isinstance(k, str)
744 assert v is None or isinstance(v, (list, tuple, str))
745 assert isinstance(valid_values, (set, dict))
746    if not v:
747 add_to_valid_tree(valid_sequences, k, None)
748 return []
749 elif isinstance(v, str):
750 v = [v]
751 q = []
752 for vv in v:
753 assert isinstance(vv, str)
754 add_to_valid_tree(valid_sequences, k, vv)
755 vvs = vv.split()
756 for x in vvs:
757 q.append(x)
758 # return each individual tag
759 return q
762def add_to_valid_tree_mapping(
763 tree: ValidNode,
764 mapping: Union[dict[str, Union[list[str], str]], dict[str, str]],
765 valid_values: Union[set[str], dict[str, Any]],
766 recurse: bool,
767) -> None:
768 assert isinstance(tree, ValidNode)
769 assert isinstance(mapping, dict)
770 assert isinstance(valid_values, (set, dict))
771 assert recurse in (True, False)
772 for k, v in mapping.items():
773 assert isinstance(k, str)
774 assert isinstance(v, (list, str))
775 if isinstance(v, str):
776 q = add_to_valid_tree1(tree, k, [v], valid_values)
777 else:
778 q = add_to_valid_tree1(tree, k, v, valid_values)
779 if recurse:
780 visited = set()
781 while q:
782 v = q.pop()
783 if v in visited:
784 continue
785 visited.add(v)
786 if v not in mapping:
787 continue
788 vv = mapping[v]
789 qq = add_to_valid_tree1(tree, k, vv, valid_values)
790 q.extend(qq)
793# Tree of sequences considered to be tags (includes sequences that are
794# mapped to something that becomes one or more valid tags)
795valid_sequences = ValidNode()
796sequences_with_slashes: set[str] = set()
797for tag in valid_tags:
798    # The basic tags used in our tag system; some are a bit weird, but it is
799    # easier to accept a few 'false' positives here than to filter out stuff
800    # that no one else uses.
801 if "/" in tag:
802 sequences_with_slashes.add(tag)
803 add_to_valid_tree(valid_sequences, tag, tag)
804for tag in uppercase_tags:
805 hyphenated = re.sub(r"\s+", "-", tag)
806    if hyphenated in valid_tags:
807 print(
808 "DUPLICATE TAG: {} (from uppercase tag {!r})".format(
809 hyphenated, tag
810 )
811 )
812 assert hyphenated not in valid_tags
813 # Might as well, while we're here: Add hyphenated location tag.
814 valid_tags[hyphenated] = "dialect"
815 add_to_valid_tree(valid_sequences, hyphenated, hyphenated)
816for tag in uppercase_tags:
817 hyphenated = re.sub(r"\s+", "-", tag)
818 # XXX Move to above loop? Or is this here for readability?
819 if "/" in tag:
820 sequences_with_slashes.add(tag)
821 add_to_valid_tree(valid_sequences, tag, hyphenated)
822# xlat_tags_map!
823add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False)
824for k in xlat_tags_map:
825 if "/" in k:
826 sequences_with_slashes.add(k)
827# Add topics to the same table, with all generalized topics also added
828for topic in valid_topics:
829 assert " " not in topic
830    if "/" in topic:
831 sequences_with_slashes.add(topic)
832 add_to_valid_tree(valid_sequences, topic, topic)
833# Let each original topic value stand alone. These are not generally on
834# valid_topics. We add the original topics with spaces replaced by hyphens.
835for topic in topic_generalize_map.keys():
836 hyphenated = topic.replace(" ", "-")
837 valid_topics.add(hyphenated)
838    if "/" in topic:
839        sequences_with_slashes.add(topic)
840 add_to_valid_tree(valid_sequences, topic, hyphenated)
841# Add canonicalized/generalized topic values
842add_to_valid_tree_mapping(
843 valid_sequences, topic_generalize_map, valid_topics, True
844)
846# Regex used to divide a decode candidate into parts that shouldn't
847# have their slashes turned into spaces
848slashes_re = re.compile(
849 r"(" + "|".join((re.escape(s) for s in sequences_with_slashes)) + r")"
850)
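# Note on how slashes_re is used in decode_tags() below: because the whole
# pattern is one capturing group, re.split() keeps the protected
# slash-containing keys at the odd indices of the result, e.g. with a
# hypothetical key "m/f" in sequences_with_slashes:
#   re.split(slashes_re, "archaic m/f dialectal")
#   -> ["archaic ", "m/f", " dialectal"]
# so only the even-indexed chunks get their remaining slashes replaced.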
852# Regexp used to find "words" from word heads and linguistic descriptions
853word_pattern = (
854 r"[^ ,;()\u200e]+|"
855 r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|"
856 r"[\u2800-\u28ff]|" # Braille characters
857 r"\(([^()]|\([^()]*\))*\)"
858)
860word_re_global = re.compile(word_pattern)
863def distw(titleparts: Sequence[str], word: str) -> float:
864 """Computes how distinct ``word`` is from the most similar word in
865 ``titleparts``. Returns 1 if words completely distinct, 0 if
866 identical, or otherwise something in between."""
867 assert isinstance(titleparts, (list, tuple))
868 assert isinstance(word, str)
869 w = min(
870 Levenshtein.distance(word, tw) / max(len(tw), len(word))
871 for tw in titleparts
872 )
873 return w
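# A few illustrative values (hypothetical inputs, not taken from the tests):
#   distw(["superlative"], "superlative")  -> 0.0   (identical)
#   distw(["cat"], "dog")                  -> 1.0   (no letters shared)
#   distw(["walruses"], "walrus")          -> 0.25  (2 edits / max length 8)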
876def map_with(
877 ht: Union[dict[str, Union[str, list[str]]], dict[str, str]],
878 lst: Sequence[str],
879) -> list[str]:
880 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or
881 more alternatives each, and returns a combined list of alternatives."""
882 assert isinstance(ht, dict)
883 assert isinstance(lst, (list, tuple))
884 ret = []
885 for x in lst:
886 assert isinstance(x, str)
887 x = x.strip()
888 x = ht.get(x, x)
889        if isinstance(x, str):
890            if x:
891 ret.append(x)
892 elif isinstance(x, (list, tuple)):
893 ret.extend(x)
894 else:
895 raise RuntimeError("map_with unexpected value: {!r}".format(x))
896 return ret
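# A small sketch (the mapping here is hypothetical): string values replace
# the item, list values splice in all their alternatives, and unmapped items
# pass through unchanged:
#   map_with({"m": ["masculine", "common-gender"], "sg": "singular"},
#            ["m", "sg", "other"])
#   -> ["masculine", "common-gender", "singular", "other"]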
899TagList = list[str]
900PosPathStep = tuple[int, TagList, TagList]
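# Each PosPathStep is a (word_index, tag_strings, topic_strings) triple, e.g.
# (2, ["nominative plural"], []) for a recognized stretch, or
# (0, ["UNKNOWN"], ["error-unknown-tag"]) from check_unknown() below
# (illustrative values).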
903def check_unknown(
904 from_i: int,
905 to_i: int,
906 i: int,
907 wordlst: Sequence[str],
908 allow_any: bool,
909 no_unknown_starts: bool,
910) -> list[PosPathStep]:
911 """Check if the current section from_i->to_i is actually unknown
912 or if it needs some special handling. We already presupposed that
913 this is UNKNOWN; this is just called to see what *kind* of UNKNOWN."""
914 assert isinstance(to_i, int)
915 assert isinstance(from_i, int)
916 assert isinstance(i, int)
917 # Adds unknown tag if needed. Returns new last_i
918 # print("check_unknown to_i={} from_i={} i={}"
919 # .format(to_i, from_i, i))
920 if from_i >= to_i:
921 return []
922 words = wordlst[from_i:to_i]
923 tag = " ".join(words)
924 assert tag
925 # print(f"{tag=}")
926 if re.match(ignored_unknown_starts_re, tag):
927 # Tags with this start are to be ignored
928 return [(from_i, ["UNKNOWN"], [])]
929    if tag in ignored_unknown_tags:
930 return [] # One of the tags listed as to be ignored
931 if tag in ("and", "or"):
932 return []
933 if (
934 not allow_any
935 and not words[0].startswith("~")
936 and (
937 no_unknown_starts
938 or words[0] not in allowed_unknown_starts
939 or len(words) <= 1
940 )
941 ):
942 # print("ERR allow_any={} words={}"
943 # .format(allow_any, words))
944 return [
945 (from_i, ["UNKNOWN"], ["error-unknown-tag"])
946 ] # Add ``tag`` here to include
947 else:
948 return [(from_i, ["UNKNOWN"], [tag])]
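# Hedged examples of the three main outcomes above (indices illustrative):
#   "originally ..." matches ignored_unknown_starts_re
#       -> [(from_i, ["UNKNOWN"], [])]                    (silently ignored)
#   "gibberish here" with allow_any=False
#       -> [(from_i, ["UNKNOWN"], ["error-unknown-tag"])]
#   "used with genitive" (allowed unknown start, more than one word)
#       -> [(from_i, ["UNKNOWN"], ["used with genitive"])]  (kept verbatim)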
951def add_new1(
952 node: ValidNode,
953 i: int,
954 start_i: int,
955 last_i: int,
956 new_paths: list[list[PosPathStep]],
957 new_nodes: list[tuple[ValidNode, int, int]],
958 pos_paths: list[list[list[PosPathStep]]],
959 wordlst: list[str],
960 allow_any: bool,
961 no_unknown_starts: bool,
962 max_last_i: int,
963) -> int:
964 assert isinstance(new_paths, list)
965 # print("add_new: start_i={} last_i={}".format(start_i, last_i))
966 # print("$ {} last_i={} start_i={}"
967 # .format(w, last_i, start_i))
968 max_last_i = max(max_last_i, last_i) # if last_i has grown
969 if (node, start_i, last_i) not in new_nodes:
970 new_nodes.append((node, start_i, last_i))
971 if node.end:
972 # We can see a terminal point in the search tree.
973 u = check_unknown(
974 last_i, start_i, i, wordlst, allow_any, no_unknown_starts
975 )
976 # Create new paths candidates based on different past possible
977 # paths; pos_path[last_i] contains possible paths, so add this
978 # new one at the beginning(?)
979 # The list comprehension inside the parens generates an iterable
980 # of lists, so this is .extend( [(last_i...)], [(last_i...)], ... )
981 # XXX: this is becoming impossible to annotate, nodes might
982 # need to become classed objects and not just dicts, or at least
983 # a TypedDict with a "children" node
984 new_paths.extend(
985 [(last_i, node.tags, node.topics)] + u + x
986 for x in pos_paths[last_i]
987 )
988 max_last_i = i + 1
989 return max_last_i
992@functools.lru_cache(maxsize=65536)
993def decode_tags(
994 src: str,
995 allow_any=False,
996 no_unknown_starts=False,
997) -> tuple[list[tuple[str, ...]], list[str]]:
998 tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts)
999 # print(f"decode_tags: {src=}, {tagsets=}")
1001 # Insert retry-code here that modifies the text source
1002 if (
1003 any(s.startswith("error-") for tagset in tagsets for s in tagset)
1004 # I hate Python's *nested* list comprehension syntax ^
1005 or any(s.startswith("error-") for s in topics)
1006 ):
1007 new_tagsets: list[tuple[str, ...]] = []
1008 new_topics: list[str] = []
1010 if "/" in src:
1011 # slashes_re contains valid key entries with slashes; we're going
1012 # to skip them by splitting the string and skipping handling every
1013 # second entry, which contains the splitting group like "masculine/
1014 # feminine" style keys.
1015 split_parts = re.split(slashes_re, src)
1016 new_parts: list[str] = []
1017 if len(split_parts) > 1:
1018 for i, s in enumerate(split_parts):
1019 if i % 2 == 0:
1020 new_parts.append(s.replace("/", " "))
1021 else:
1022 new_parts.append(s)
1023 new_src = "".join(new_parts)
1024 else:
1025 new_src = src
1026 new_tagsets, new_topics = decode_tags1(
1027 new_src, allow_any, no_unknown_starts
1028 )
1029 elif " or " in src or " and " in src:
1030 # Annoying kludge.
1031 new_src = src.replace(" and ", " ")
1032 new_src = new_src.replace(" or ", " ")
1033 new_tagsets, new_topics = decode_tags1(
1034 new_src, allow_any, no_unknown_starts
1035 )
1036 # print(f"{new_tagsets=}")
1038 if new_tagsets or new_topics:
1039 old_errors = sum(
1040 1 for tagset in tagsets for s in tagset if s.startswith("error")
1041 )
1042 old_errors += sum(1 for s in topics if s.startswith("error"))
1043 new_errors = sum(
1044 1
1045 for new_tagset in new_tagsets
1046 for s in new_tagset
1047 if s.startswith("error")
1048 )
1049 new_errors += sum(1 for s in new_topics if s.startswith("error"))
1051            if new_errors <= old_errors:
1052 return new_tagsets, new_topics
1054 return tagsets, topics
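# A hedged example of the retry logic above, assuming "masculine" and
# "feminine" are plain valid_tags entries and that "masculine/feminine" is
# not itself one of the keys collected in sequences_with_slashes: the first
# pass sees a single unknown token and reports error-unknown-tag, the slash
# is then replaced with a space, and the second pass returns roughly
# ([("feminine", "masculine")], []); since that has fewer errors, the
# retried result is the one returned.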
1057def decode_tags1(
1058 src: str,
1059 allow_any=False,
1060 no_unknown_starts=False,
1061) -> tuple[list[tuple[str, ...]], list[str]]:
1062    """Decodes tags, doing some canonicalizations. This returns a list of
1063    tagsets (tuples of tags) and a list of topics."""
1064 assert isinstance(src, str)
1066 # print("decode_tags: src={!r}".format(src))
1068 pos_paths: list[list[list[PosPathStep]]] = [[[]]]
1069 wordlst: list[str] = []
1070 max_last_i = 0 # pre-initialized here so that it can be used as a ref
1072 add_new = functools.partial(
1073 add_new1, # pre-set parameters and references for function
1074 pos_paths=pos_paths,
1075 wordlst=wordlst,
1076 allow_any=allow_any,
1077 no_unknown_starts=no_unknown_starts,
1078 max_last_i=max_last_i,
1079 )
1080 # First split the tags at commas and semicolons. Their significance is that
1081 # a multi-word sequence cannot continue across them.
1082 parts = split_at_comma_semi(src, extra=[";", ":"])
1084 for part in parts:
1085 max_last_i = len(wordlst) # "how far have we gone?"
1086 lst1 = part.split()
1087 if not lst1:
1088 continue
1089 wordlst.extend(lst1)
1090 cur_nodes: list[tuple[ValidNode, int, int]] = [] # Currently seen
1091 for w in lst1:
1092 i = len(pos_paths) - 1
1093 new_nodes: list[tuple[ValidNode, int, int]] = []
1094 # replacement nodes for next loop
1095 new_paths: list[list[PosPathStep]] = []
1096 # print("ITER i={} w={} max_last_i={} wordlst={}"
1097 # .format(i, w, max_last_i, wordlst))
1098 node: ValidNode
1099 start_i: int
1100 last_i: int
1101 for node, start_i, last_i in cur_nodes:
1102 # ValidNodes are part of a search tree that checks if a
1103 # phrase is found in xlat_tags_map and other text->tags dicts.
1104 if w in node.children:
1105 # the phrase continues down the tree
1106 # print("INC", w)
1107 max_last_i = add_new(
1108 node.children[w],
1109 i,
1110 start_i,
1111 last_i,
1112 new_paths,
1113 new_nodes,
1114 )
1115 if node.end:
1116 # we've hit an end point, the tags and topics have already
1117 # been gathered at some point, don't do anything with the
1118 # old stuff
1119 if w in valid_sequences.children:
1120 # This starts a *new* possible section
1121 max_last_i = add_new(
1122 valid_sequences.children[w], # root->
1123 i,
1124 i,
1125 i,
1126 new_paths,
1127 new_nodes,
1128 )
1129 if w not in node.children and not node.end:
1130 # print("w not in node and $: i={} last_i={} wordlst={}"
1131 # .format(i, last_i, wordlst))
1132 # If i == last_i == 0, for example (beginning)
1133 if (
1134 i == last_i
1135 or no_unknown_starts
1136 or wordlst[last_i] not in allowed_unknown_starts
1137 ):
1138 # print("NEW", w)
1139 if w in valid_sequences.children:
1140 # Start new sequences here
1141 max_last_i = add_new(
1142 valid_sequences.children[w],
1143 i,
1144 i,
1145 last_i,
1146 new_paths,
1147 new_nodes,
1148 )
1149 if not new_nodes:
1150 # This is run at the start when i == max_last_i == 0,
1151 # which is what populates the first node in new_nodes.
1152 # Some initial words cause the rest to be interpreted as unknown
1153 # print("not new nodes: i={} last_i={} wordlst={}"
1154 # .format(i, max_last_i, wordlst))
1155 if (
1156 i == max_last_i
1157 or no_unknown_starts
1158 or wordlst[max_last_i] not in allowed_unknown_starts
1159 ):
1160 # print("RECOVER w={} i={} max_last_i={} wordlst={}"
1161 # .format(w, i, max_last_i, wordlst))
1162 if w in valid_sequences.children:
1163 max_last_i = add_new(
1164 # new sequence from root
1165 valid_sequences.children[w],
1166 i,
1167 i,
1168 max_last_i,
1169 new_paths,
1170 new_nodes,
1171 )
1172 cur_nodes = new_nodes # Completely replace nodes!
1173 # 2023-08-18, fix to improve performance
1174 # Decode tags does a big search of the best-shortest matching
1175 # sequences of tags, but the original algorithm didn't have
1176 # any culling happen during operation, so in a case with
1177 # a lot of tags (for example, big blocks of text inserted
1178 # somewhere by mistake that is processed by decode_tags),
1179 # it would lead to exponential growth of new_paths contents.
1180 # This culling, using the same weighting algorithm code as
1181 # in the original is just applied to new_paths before it is
1182 # added to pos_paths. Basically it's "take the 10 best paths".
1183 # This *can* cause bugs if it gets stuck in a local minimum
1184 # or something, but this whole process is one-dimensional
1185 # and not that complex, so hopefully it works out...
1186 pw = []
1187 path: list[PosPathStep]
1188 for path in new_paths:
1189 weight = len(path)
1190 if any(x[1] == ["UNKNOWN"] for x in path):
1191 weight += 100 # Penalize unknown paths
1192 pw.append((weight, path))
1193 new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]]
1194 pos_paths.append(new_paths)
1196 # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}"
1197 # .format(max_last_i, len(wordlst), len(pos_paths)))
1199 if cur_nodes:
1200 # print("END HAVE_NODES")
1201 for node, start_i, last_i in cur_nodes:
1202 if node.end:
1203 # print("$ END start_i={} last_i={}"
1204 # .format(start_i, last_i))
1205 for path in pos_paths[start_i]:
1206 pos_paths[-1].append(
1207 [(last_i, node.tags, node.topics)] + path
1208 )
1209 else:
1210 # print("UNK END start_i={} last_i={} wordlst={}"
1211 # .format(start_i, last_i, wordlst))
1212 u = check_unknown(
1213 last_i,
1214 len(wordlst),
1215 len(wordlst),
1216 wordlst,
1217 allow_any,
1218 no_unknown_starts,
1219 )
1220 if pos_paths[start_i]:
1221 for path in pos_paths[start_i]:
1222 pos_paths[-1].append(u + path)
1223 else:
1224 pos_paths[-1].append(u)
1225 else:
1226 # Check for a final unknown tag
1227 # print("NO END NODES max_last_i={}".format(max_last_i))
1228 paths = pos_paths[max_last_i] or [[]]
1229 u = check_unknown(
1230 max_last_i,
1231 len(wordlst),
1232 len(wordlst),
1233 wordlst,
1234 allow_any,
1235 no_unknown_starts,
1236 )
1237            if u:
1238 # print("end max_last_i={}".format(max_last_i))
1239 for path in list(paths): # Copy in case it is the last pos
1240 pos_paths[-1].append(u + path)
1242 # import json
1243 # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True))
1245    if not pos_paths[-1]:
1246 # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src))
1247 return [], []
1249 # Find the best path
1250 pw = []
1251 for path in pos_paths[-1]:
1252 weight = len(path)
1253 if any(x[1] == ["UNKNOWN"] for x in path):
1254 weight += 100 # Penalize unknown paths
1255 pw.append((weight, path))
1256 path = min(pw)[1]
1258 # Convert the best path to tagsets and topics
1259 tagsets: list[list[str]] = [[]]
1260 topics: list[str] = []
1261 for i, tagspec, topicspec in path:
1262        if len(tagsets or "") > 16:
1263 # ctx.error("Too many tagsets! This is probably exponential",
1264 # sortid="form_descriptions/20230818")
1265 return [("error-unknown-tag", "error-exponential-tagsets")], []
1266 if tagspec == ["UNKNOWN"]:
1267 new_tagsets = []
1268 for x in tagsets:
1269 new_tagsets.append(x + topicspec)
1270 tagsets = new_tagsets
1271 continue
1272 if tagspec:
1273 new_tagsets = []
1274 for x in tagsets:
1275 for t in tagspec:
1276                    if t:
1277 new_tags = list(x)
1278 for tag in t.split():
1279 if tag not in new_tags:
1280 new_tags.append(tag)
1281 new_tagsets.append(new_tags)
1282 else:
1283 new_tagsets.append(x)
1284 tagsets = new_tagsets
1285 if topicspec:
1286 for t in topicspec:
1287 for topic in t.split():
1288 if topic not in topics:
1289 topics.append(topic)
1291 # print("unsorted tagsets:", tagsets)
1292 ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets))
1293 # topics = list(sorted(set(topics))) XXX tests expect not sorted
1294 # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics))
1295 # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST
1296 # of tags. Turning topics into a tuple breaks tests, turning the tuples
1297 # inside tagsets into lists breaks tests, I'm leaving them mismatched
1298 # for now. XXX
1299 return ret_tagsets, topics
1302def parse_head_final_tags(
1303 wxr: WiktextractContext, lang: str, form: str
1304) -> tuple[str, list[str]]:
1305 """Parses tags that are allowed at the end of a form head from the end
1306 of the form. This can also be used for parsing the final gender etc tags
1307 from translations and linkages."""
1308 assert isinstance(wxr, WiktextractContext)
1309 assert isinstance(lang, str) # Should be language that "form" is for
1310 assert isinstance(form, str)
1312 # print("parse_head_final_tags: lang={} form={!r}".format(lang, form))
1314 # Make sure there are no double spaces in the form as this code does not
1315 # handle them otherwise.
1316 form = re.sub(r"\s+", " ", form.strip())
1317 if not form:
1318 return form, []
1320 origform = form
1322 tags = []
1324 # If parsing for certain Bantu languages (e.g., Swahili), handle
1325 # some extra head-final tags first
1326 if lang in head_final_bantu_langs:
1327 m = re.search(head_final_bantu_re, form)
1328 if m is not None:
1329 tagkeys = m.group(1)
1330            if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr]
1331 form = form[: m.start()]
1332 v = head_final_bantu_map[tagkeys]
1333                if v.startswith("?"):
1334 v = v[1:]
1335 wxr.wtp.debug(
1336 "suspicious suffix {!r} in language {}: {}".format(
1337 tagkeys, lang, origform
1338 ),
1339 sortid="form_descriptions/1028",
1340 )
1341 tags.extend(v.split())
1343 # If parsing for certain Semitic languages (e.g., Arabic), handle
1344 # some extra head-final tags first
1345 if lang in head_final_semitic_langs:
1346 m = re.search(head_final_semitic_re, form)
1347 if m is not None:
1348 tagkeys = m.group(1)
1349            if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr]
1350 form = form[: m.start()]
1351 v = head_final_semitic_map[tagkeys]
1352                if v.startswith("?"):
1353 v = v[1:]
1354 wxr.wtp.debug(
1355 "suspicious suffix {!r} in language {}: {}".format(
1356 tagkeys, lang, origform
1357 ),
1358 sortid="form_descriptions/1043",
1359 )
1360 tags.extend(v.split())
1362 # If parsing for certain other languages (e.g., Lithuanian,
1363 # French, Finnish), handle some extra head-final tags first
1364 if lang in head_final_other_langs:
1365 m = re.search(head_final_other_re, form)
1366 if m is not None:
1367 tagkeys = m.group(1)
1368            if not wxr.wtp.title.endswith(tagkeys): # type:ignore[union-attr]
1369 form = form[: m.start()]
1370 tags.extend(head_final_other_map[tagkeys].split(" "))
1372 # Handle normal head-final tags
1373 m = re.search(head_final_re, form)
1374 if m is not None:
1375 tagkeys = m.group(3)
1376 # Only replace tags ending with numbers in languages that have
1377 # head-final numeric tags (e.g., Bantu classes); also, don't replace
1378 # tags if the main title ends with them (then presume they are part
1379 # of the word)
1380 # print("head_final_tags form={!r} tagkeys={!r} lang={}"
1381 # .format(form, tagkeys, lang))
1382 tagkeys_contains_digit = re.search(r"\d", tagkeys)
1383 if (
1384 (not tagkeys_contains_digit or lang in head_final_numeric_langs)
1385 and not wxr.wtp.title.endswith(" " + tagkeys) # type:ignore[union-attr]
1386 and
1387 # XXX the above test does not capture when the whole word is a
1388 # xlat_head_map key, so I added the below test to complement
1389 # it; does this break anything?
1390 not wxr.wtp.title == tagkeys
1391 ): # defunct/English,
1392 # "more defunct" -> "more" ["archaic"]
1393            if not tagkeys_contains_digit or lang in head_final_numeric_langs:
1394 form = form[: m.start()]
1395 v = xlat_head_map[tagkeys]
1396                if v.startswith("?"):
1397 v = v[1:]
1398 wxr.wtp.debug(
1399 "suspicious suffix {!r} in language {}: {}".format(
1400 tagkeys, lang, origform
1401 ),
1402 sortid="form_descriptions/1077",
1403 )
1404 tags.extend(v.split())
1406 # Generate warnings about words ending in " or" after processing
1407 if (
1408 (form.endswith(" or") and not origform.endswith(" or"))
1409 or re.search(
1410 r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|"
1411 r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)"
1412 r"($|/| (f|m|sg|pl|anim|inan))",
1413 form,
1414 )
1415 or form.endswith(" du")
1416 ):
1417 if form not in ok_suspicious_forms:
1418 wxr.wtp.debug(
1419 "suspicious unhandled suffix in {}: {!r}, originally {!r}".format(
1420 lang, form, origform
1421 ),
1422 sortid="form_descriptions/1089",
1423 )
1425 # print("parse_head_final_tags: form={!r} tags={}".format(form, tags))
1426 return form, tags
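# A hedged usage sketch (hypothetical page, and assuming "m" maps to
# "masculine" in xlat_head_map, as the conventional Wiktionary code does):
# on a page titled "gato", parse_head_final_tags(wxr, "Spanish", "gato m")
# would strip the trailing code and return ("gato", ["masculine"]).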
1429def quote_kept_parens(s: str) -> str:
1430 """Changes certain parenthesized expressions so that they won't be
1431 interpreted as parentheses. This is used for parts that are kept as
1432    part of the word, such as "rear admiral (upper half)"."""
1433 return re.sub(
1434 r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|"
1435 r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)",
1436 r"__lpar__\1__rpar__",
1437 s,
1438 )
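# For example (derived directly from the pattern above):
#   quote_kept_parens("rear admiral (upper half)")
#   -> "rear admiral __lpar__upper half__rpar__"
# unquote_kept_parens() further below restores the parentheses afterwards.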
1441def quote_kept_ruby(
1442 wxr: WiktextractContext,
1443 ruby_tuples: list[
1444 tuple[
1445 str,
1446 str,
1447 ]
1448 ],
1449 s: str,
1450) -> str:
1451 if len(ruby_tuples) < 1:
1452 wxr.wtp.debug(
1453 "quote_kept_ruby called with no ruby",
1454 sortid="form_description/1114/20230517",
1455 )
1456 return s
1457 ks = []
1458 rs = []
1459 for k, r in ruby_tuples:
1460 ks.append(re.escape(k))
1461 rs.append(re.escape(r))
1462 if not (ks and rs):
1463 wxr.wtp.debug(
1464 f"empty column in ruby_tuples: {ruby_tuples}",
1465 sortid="form_description/1124/20230606",
1466 )
1467 return s
1468 newm = re.compile(
1469 r"({})\s*\(\s*({})\s*\)".format("|".join(ks), "|".join(rs))
1470 )
1471 rub_re = re.compile(
1472 r"({})".format(
1473 r"|".join(
1474 r"{}\(*{}\)*".format(
1475 re.escape(k),
1476 re.escape(r),
1477 )
1478 for k, r in ruby_tuples
1479 )
1480 )
1481 )
1483 def paren_replace(m: re.Match) -> str:
1484 return re.sub(newm, r"\1__lrub__\2__rrub__", m.group(0))
1486 return re.sub(rub_re, paren_replace, s)
1489def unquote_kept_parens(s: str) -> str:
1490    """Converts the quoted parentheses back to normal parentheses."""
1491 return re.sub(r"__lpar__(.*?)__rpar__", r"(\1)", s)
1494def add_romanization(
1495 wxr: WiktextractContext,
1496 data: WordData,
1497 roman: str,
1498 text: str,
1499 is_reconstruction: bool,
1500 head_group: Optional[int],
1501 ruby: Sequence[tuple[str, str]],
1502) -> None:
1503 tags_lst = ["romanization"]
1504 m = re.match(r"([^:]+):(.+)", roman)
1505 # This function's purpose is to intercept broken romanizations,
1506 # like "Yale: hēnpyeng" style tags. Most romanization styles
1507 # are already present as tags, so we can use decode_tags to find
1508 # them.
1509    if m:
1510 tagsets, topics = decode_tags(m.group(1))
1511 if tagsets:
1512 for tags in tagsets:
1513 tags_lst.extend(tags)
1514 roman = m.group(2)
1515 add_related(
1516 wxr,
1517 data,
1518 tags_lst,
1519 [roman],
1520 text,
1521 True,
1522 is_reconstruction,
1523 head_group,
1524 ruby,
1525 )
1528def add_related(
1529 wxr: WiktextractContext,
1530 data: WordData,
1531 tags_lst: Union[list[str], tuple[str, ...]],
1532 related_list: list[str],
1533 origtext: str,
1534 add_all_canonicals: bool,
1535 is_reconstruction: bool,
1536 head_group: Optional[int],
1537 ruby_data: Optional[Sequence[tuple[str, str]]] = None,
1538) -> Optional[list[tuple[str, ...]]]:
1539    """Internal helper for post-processing entries for related
1540    forms (e.g., in word head). Returns a list of tagsets (tuples of tags) to
1541    be added to following related forms, or None (cf. walrus/English word head,
1542 parenthesized part starting with "both")."""
1543 assert isinstance(wxr, WiktextractContext)
1544 assert isinstance(tags_lst, (list, tuple))
1545 for x in tags_lst:
1546 assert isinstance(x, str)
1547 assert isinstance(related_list, (list, tuple))
1548 assert isinstance(origtext, str)
1549 assert add_all_canonicals in (True, False)
1550 assert isinstance(ruby_data, (list, tuple)) or ruby_data is None
1551    if ruby_data is None:
1552 ruby_data = []
1553 # print("add_related: tags_lst={} related={}".format(tags_lst, related))
1554 related = " ".join(related_list)
1555    if related == "[please provide]":
1556 return None
1557    if related in IGNORED_RELATED:
1558 return None
1559 if is_reconstruction and related.startswith("*") and len(related) > 1:
1560 related = related[1:]
1562 # Get title word, with any reconstruction prefix removed
1563 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title) # type:ignore[arg-type]
1565 def check_related(related: str) -> None:
1566 # Warn about some suspicious related forms
1567 m = re.search(suspicious_related_re, related)
1568 if (m and m.group(0) not in titleword) or (
1569 related in ("f", "m", "n", "c") and len(titleword) >= 3
1570 ):
1571            if "eumhun" in tags_lst:
1572 return
1573            if "cangjie-input" in tags_lst:
1574 return
1575            if "class" in tags_lst:
1576 return
1577            if wxr.wtp.section == "Korean" and re.search(
1578 r"^\s*\w*>\w*\s*$", related
1579 ):
1580 # ignore Korean "i>ni" / "라>나" values
1581 return
1582            if (
1583 wxr.wtp.section == "Burmese"
1584 and "romanization" in tags_lst
1585 and re.search(r":", related)
1586 ):
1587            # ignore Burmese with ":", which is used in the Burmese
1588            # transliteration of "း", the high-tone visarga.
1589 return
1590 wxr.wtp.debug(
1591 "suspicious related form tags {}: {!r} in {!r}".format(
1592 tags_lst, related, origtext
1593 ),
1594 sortid="form_descriptions/1147",
1595 )
1597 following_tagsets = None # Tagsets to add to following related forms
1598 roman = None
1599 tagsets1: list[tuple[str, ...]] = [tuple()]
1600 topics1: list[str] = []
1602 m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related)
1603 if m:
1604 paren = m.group(1)
1605 related = related[m.end() :]
1606 m = re.match(r"^(all|both) (.*)", paren)
1607        if m:
1608 tagsets1, topics1 = decode_tags(m.group(2))
1609 following_tagsets = tagsets1
1610 else:
1611 tagsets1, topics1 = decode_tags(paren)
1612 else:
1613 m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related)
1614 if m:
1615 paren = m.group(1)
1616            if paren.startswith("U+"):
1617 related = related[: m.start()]
1618 else:
1619 cls = classify_desc(paren)
1620                if (
1621 cls in ("romanization", "english")
1622 and classify_desc(related[: m.start()]) == "other"
1623 ):
1624 roman = paren
1625 related = related[: m.start()]
1626 else:
1627 related = related[: m.start()]
1628 tagsets1, topics1 = decode_tags(paren)
1629    if related and related.startswith("{{"):
1630 wxr.wtp.debug(
1631 "{{ in word head form - possible Wiktionary error: {!r}".format(
1632 related
1633 ),
1634 sortid="form_descriptions/1177",
1635 )
1636 return None # Likely Wiktionary coding error
1637 related = unquote_kept_parens(related)
1638    # Split related by "/" (e.g., the superlative in the grande/Spanish head)
1639 # Do not split if / in word title, see π//Japanese
1640 if len(related) > 5 and "/" not in wxr.wtp.title: # type:ignore[operator]
1641 alts = split_at_comma_semi(related, separators=["/"])
1642 else:
1643 alts = [related]
1644    if ruby_data:
1645 # prepare some regex stuff in advance
1646 ks, rs = [], []
1647 for k, r in ruby_data:
1648 ks.append(re.escape(k))
1649 rs.append(re.escape(r))
1650 splitter = r"((?:{})__lrub__(?:{})__rrub__)".format(
1651 "|".join(ks), "|".join(rs)
1652 )
1653 for related in alts:
1654 ruby: list[tuple[str, str]] = []
1655        if ruby_data:
1656 new_related = []
1657 rub_split = re.split(splitter, related)
1658 for s in rub_split:
1659 m = re.match(r"(.+)__lrub__(.+)__rrub__", s)
1660 if m:
1661 # add ruby with (\1, \2)
1662 ruby.append((m.group(1), m.group(2)))
1663 new_related.append(m.group(1))
1664 else:
1665 new_related.append(s)
1666 related = "".join(new_related)
1667 tagsets2, topics2 = decode_tags(" ".join(tags_lst))
1668 for tags1 in tagsets1:
1669 assert isinstance(tags1, (list, tuple))
1670 for tags2 in tagsets2:
1671 assert isinstance(tags1, (list, tuple))
1672 dt: LinkageData = {"word": related}
1673 if roman:
1674 dt["roman"] = roman
1675                if ruby:
1676 dt["ruby"] = ruby
1677                if "alt-of" in tags2:
1678 check_related(related)
1679 data_extend(data, "tags", tags1)
1680 data_extend(data, "tags", tags2)
1681 data_extend(data, "topics", topics1)
1682 data_extend(data, "topics", topics2)
1683 data_append(data, "alt_of", dt)
1684                elif "form-of" in tags2:
1685 check_related(related)
1686 data_extend(data, "tags", tags1)
1687 data_extend(data, "tags", tags2)
1688 data_extend(data, "topics", topics1)
1689 data_extend(data, "topics", topics2)
1690 data_append(data, "form_of", dt)
1691                elif "compound-of" in tags2:
1692 check_related(related)
1693 data_extend(data, "tags", tags1)
1694 data_extend(data, "tags", tags2)
1695 data_extend(data, "topics", topics1)
1696 data_extend(data, "topics", topics2)
1697 data_append(data, "compound", related)
1698 else:
1699 lang = wxr.wtp.section or "LANG_MISSING"
1700 related, final_tags = parse_head_final_tags(
1701 wxr, lang, related
1702 )
1703 # print("add_related: related={!r} tags1={!r} tags2={!r} "
1704 # "final_tags={!r}"
1705 # .format(related, tags1, tags2, final_tags))
1706 tags = list(tags1) + list(tags2) + list(final_tags)
1707 check_related(related)
1708 form: FormData = {"form": related}
1709 if head_group:
1710 form["head_nr"] = head_group
1711 if roman:
1712 form["roman"] = roman
1713                    if ruby:
1714 form["ruby"] = ruby
1715 data_extend(form, "topics", topics1)
1716 data_extend(form, "topics", topics2)
1717                    if topics1 or topics2:
1718 wxr.wtp.debug(
1719 "word head form has topics: {}".format(form),
1720 sortid="form_descriptions/1233",
1721 )
1722 # Add tags from canonical form into the main entry
1723 if "canonical" in tags:
1724                        if related in ("m", "f") and len(titleword) > 1:
1725 wxr.wtp.debug(
1726 "probably incorrect canonical form "
1727 "{!r} ignored (probably tag combination "
1728 "missing from xlat_head_map)".format(related),
1729 sortid="form_descriptions/1241",
1730 )
1731 continue
1732 if (
1733 related != titleword
1734 or add_all_canonicals
1735 or topics1
1736 or topics2
1737 or ruby
1738 ):
1739 data_extend(form, "tags", list(sorted(set(tags))))
1740 else:
1741 # We won't add canonical form here
1742 filtered_tags = list(
1743 x for x in tags if x != "canonical"
1744 )
1745 data_extend(data, "tags", filtered_tags)
1746 continue
1747 else:
1748 data_extend(form, "tags", list(sorted(set(tags))))
1749 # Only insert if the form is not already there
1750 for old in data.get("forms", ()):
1751                    if form == old:
1752 break
1753 else:
1754 data_append(data, "forms", form)
1756 # If this form had pre-tags that started with "both" or "all", add those
1757 # tags also to following related forms that don't have their own tags
1758 # specified.
1759 return following_tagsets
1762def parse_word_head(
1763 wxr: WiktextractContext,
1764 pos: str,
1765 text: str,
1766 data: WordData,
1767 is_reconstruction: bool,
1768 head_group: Optional[int],
1769 ruby=None,
1770 links=None,
1771) -> None:
1772    """Parses the head line for a word in a particular language and
1773 part-of-speech, extracting tags and related forms."""
1774 assert isinstance(wxr, WiktextractContext)
1775 assert isinstance(pos, str)
1776 assert isinstance(text, str)
1777 assert isinstance(data, dict)
1778 assert isinstance(ruby, (list, tuple)) or ruby is None
1779 if ruby is None:
1780 ruby = []
1781 assert is_reconstruction in (True, False)
1782 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text))
1783 # print(f"PARSE_WORD_HEAD: {data=}")
1784 if links is None:
1785 links = []
1787 if len(links) > 0:
1788 # If we have link data (that is, links containing things like commas
1789 # and spaces), replace word_re with a modified local-scope pattern.
1790 word_re = re.compile(
1791 r"|".join(
1792 sorted((re.escape(s) for s in links), key=lambda x: -len(x))
1793 )
1794 + r"|"
1795 + word_pattern
1796 )
1797 else:
1798 word_re = word_re_global
1800 if "Lua execution error" in text or "Lua timeout error" in text: 1800 ↛ 1801line 1800 didn't jump to line 1801 because the condition on line 1800 was never true
1801 return
1803 # In Aug 2021, some words had spurious Template:en at the end of head forms
1804 # due to a Wiktionary error.
1805 text = re.sub(r"\s+Template:[-a-zA-Z]+\s*$", "", text)
1807 # Fix words with "superlative:" or "comparative:" at end of head
1808 # e.g. grande/Spanish/Adj
1809 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text)
1811 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb
1812 m = re.search(r", non-past ([^)]+ \([^)]+\))", text)
1813 if m:
1814 add_related(
1815 wxr,
1816 data,
1817 ["non-past"],
1818 [m.group(1)],
1819 text,
1820 True,
1821 is_reconstruction,
1822 head_group,
1823 ruby,
1824 )
1825 text = text[: m.start()] + text[m.end() :]
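# Illustrative (hypothetical text): a head ending in ", non-past يفعل (yafʿalu)"
# would be captured above and stored as a "non-past" related form before the
# rest of the head is parsed.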
1827 language = wxr.wtp.section
1828 titleword = re.sub(
1829 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE"
1830 )
1831 titleparts = list(
1832 m.group(0)
1833 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE")
1834 )
1835 if not titleparts: 1835 ↛ 1836line 1835 didn't jump to line 1836 because the condition on line 1835 was never true
1836 return
1838 # Remove " or" from the end to prevent weird canonical forms
1839 if text.endswith(" or"):
1840 for tp in titleparts:
1841 if text.endswith(tp): 1841 ↛ 1842line 1841 didn't jump to line 1842 because the condition on line 1841 was never true
1842 break
1843 else:
1844 text = text.removesuffix(" or").rstrip()
1846 # Handle the part of the head that is not in parentheses. However, certain
1847 # parenthesized parts are part of the word, and those must be handled
1848 # specially here.
1849 if ruby: 1849 ↛ 1850line 1849 didn't jump to line 1850 because the condition on line 1849 was never true
1850 text = quote_kept_ruby(wxr, ruby, text)
1851 base = text
1852 base = quote_kept_parens(base)
1853 base = remove_text_in_parentheses(base)
1854 base = base.replace("?", "") # Removes uncertain articles etc
1855 base = re.sub(r"\s+", " ", base)
1856 base = re.sub(r" ([,;])", r"\1", base)
1857 base = re.sub(r"(.*) •.*", r"\1", base)
1858 # Many languages use • as a punctuation mark separating the base
1859 # from the rest of the head. στάδιος/Ancient Greek, issue #176
1860 base = base.strip()
1862 # Check for certain endings in head (mostly for compatibility with weird
1863 # heads, e.g. rata/Romanian "1st conj." at end)
1864 m = re.search(head_end_re, base)
1865 tags: Union[tuple[str, ...], list[str]] = []
1866 if m: 1866 ↛ 1867line 1866 didn't jump to line 1867 because the condition on line 1866 was never true
1867 tags = head_end_map[m.group(1).lower()].split()
1868 data_extend(data, "tags", tags)
1869 base = base[: m.start()]
1871 # Special case: handle Hán Nôm readings for Vietnamese characters
1872 m = re.match(
1873 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base
1874 )
1875 if m: 1875 ↛ 1876line 1875 didn't jump to line 1876 because the condition on line 1875 was never true
1876 tag, readings = m.groups()
1877 tag = re.sub(r"\s+", "-", tag)
1878 for reading in split_at_comma_semi(readings, skipped=links):
1879 add_related(
1880 wxr,
1881 data,
1882 [tag],
1883 [reading],
1884 text,
1885 True,
1886 is_reconstruction,
1887 head_group,
1888 ruby,
1889 )
1890 return
1892 # Special case: Hebrew " [pattern: nnn]" ending
1893 m = re.search(r"\s+\[pattern: ([^]]+)\]", base)
1894 if m: 1894 ↛ 1895line 1894 didn't jump to line 1895 because the condition on line 1894 was never true
1895 add_related(
1896 wxr,
1897 data,
1898 ["class"],
1899 [m.group(1)],
1900 text,
1901 True,
1902 is_reconstruction,
1903 head_group,
1904 ruby,
1905 )
1906 base = base[: m.start()] + base[m.end() :]
1908 # Clean away some messy "Upload an image" template text used in
1909 # American Sign Language:
1910 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp
1911 m = re.search(r"Upload .+ gif image.", base)
1912 if m: 1912 ↛ 1913line 1912 didn't jump to line 1913 because the condition on line 1912 was never true
1913 base = base[: m.start()] + base[m.end() :]
1915 # Split the head into alternatives. This is a complicated task, as
1916 # we do not want to split on "or" or "," when immediately followed by more
1917 # head-final tags, but otherwise do want to split by them.
1918 # 20230907 added "or" to this to handle 'true or false', titles with 'or'
1919 if wxr.wtp.title and ("," in wxr.wtp.title or " or " in wxr.wtp.title):
1920 # A kludge to handle article titles/phrases with commas.
1921 # Preprocess splits to first capture the title, then handle
1922 # all the others as usual.
1923 presplits = re.split(r"({})".format(wxr.wtp.title), base)
1924 splits = []
1925 for psplit in presplits:
1926 if psplit == wxr.wtp.title:
1927 splits.append(psplit)
1928 else:
1929 splits.extend(re.split(head_split_re, psplit))
1930 else:
1931 # Do the normal split; previous only-behavior.
1932 splits = re.split(head_split_re, base)
1933 # print("SPLITS:", splits)
1934 alts: list[str] = []
1935 # print("parse_word_head: splits:", splits,
1936 # "head_split_re_parens:", head_split_re_parens)
1937 for i in range(
1938 0, len(splits) - head_split_re_parens, head_split_re_parens + 1
1939 ):
1940 v = splits[i]
1941 ending = splits[i + 1] or "" # XXX is this correct???
1942 # print("parse_word_head alts v={!r} ending={!r} alts={}"
1943 # .format(v, ending, alts))
1944 if alts and (v == "" and ending):
1945 assert ending[0] == " "
1946 alts[-1] += " or" + ending # endings starts with space
1947 elif v or ending: 1947 ↛ 1937line 1947 didn't jump to line 1937 because the condition on line 1947 was always true
1948 alts.append((v or "") + (ending or ""))
1949 last = splits[-1].strip()
1950 conn = "" if len(splits) < 3 else splits[-2]
1951 # print("parse_word_head alts last={!r} conn={!r} alts={}"
1952 # .format(last, conn, alts))
1953 if (
1954 alts
1955 and last
1956 and (
1957 last.split()[0] in xlat_head_map
1958 or (
1959 conn == " or "
1960 and (alts[-1] + " or " + last).strip() in xlat_head_map
1961 )
1962 )
1963 ):
1964 alts[-1] += " or " + last
1965 elif last:
1966 alts.append(last)
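# Illustrative (hypothetical): for a page titled "true or false", the
# title-aware presplit above keeps "true or false" as one alternative
# instead of splitting it at " or ", while other commas and " or "
# separators in the base still act as split points.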
1968 # print("parse_word_head alts: {}".format(alts))
1969 # print(f"{base=}")
1971 # Process the head alternatives
1972 canonicals: list[tuple[list[str], list[str]]] = []
1973 mode: Optional[str] = None
1974 for alt_i, alt in enumerate(alts):
1975 alt = alt.strip()
1976 if alt.startswith("compound form:"): 1976 ↛ 1977line 1976 didn't jump to line 1977 because the condition on line 1976 was never true
1977 mode = "compound-form"
1978 alt = alt[14:].strip()
1979 if mode == "compound-form": 1979 ↛ 1980line 1979 didn't jump to line 1980 because the condition on line 1979 was never true
1980 add_related(
1981 wxr,
1982 data,
1983 ["in-compounds"],
1984 [alt],
1985 text,
1986 True,
1987 is_reconstruction,
1988 head_group,
1989 ruby,
1990 )
1991 continue
1992 # For non-first parts, see if the part can be treated as tags-only
1993 if alt_i == 0:
1994 expanded_alts = [alt]
1995 else:
1996 expanded_alts = map_with(xlat_descs_map, [alt])
1997 # print("EXPANDED_ALTS:", expanded_alts)
1998 tagsets: Optional[list[tuple[str, ...]]]
1999 for alt in expanded_alts:
2000 baseparts = list(m.group(0) for m in re.finditer(word_re, alt))
2001 if alt_i > 0:
2002 tagsets, topics = decode_tags(" ".join(baseparts))
2003 if not any("error-unknown-tag" in x for x in tagsets):
2004 data_extend(data, "topics", topics)
2005 for tags1 in tagsets:
2006 data_extend(data, "tags", tags1)
2007 continue
2009 alt, tags = parse_head_final_tags(
2010 wxr, language or "MISSING_LANG", alt
2011 )
2012 tags = list(tags) # Make sure we don't modify anything cached
2013 tags.append("canonical")
2014 if alt_i == 0 and "," in wxr.wtp.title: # type:ignore[operator]
2015 # Kludge to handle article titles/phrases with commas.
2016 # The word_re used to build baseparts strips commas, which leads to a
2017 # canonical form that is the title phrase without a comma.
2018 # baseparts in add_related is almost immediately joined with
2019 # spaces anyhow. XXX not exactly sure why it's
2020 # canonicals.append((tags, baseparts)) and not (tags, [alt])
2021 baseparts = [alt]
2022 canonicals.append((tags, baseparts))
2023 for tags, baseparts in canonicals:
2024 add_related(
2025 wxr,
2026 data,
2027 tags,
2028 baseparts,
2029 text,
2030 len(canonicals) > 1,
2031 is_reconstruction,
2032 head_group,
2033 ruby,
2034 )
2036 # Handle parenthesized descriptors for the word form and links to
2037 # related words
2038 text = quote_kept_parens(text)
2039 parens = list(
2040 m.group(2)
2041 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text)
2042 )
2043 parens.extend(
2044 m.group(1)
2045 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text)
2046 )
2047 have_romanization = False
2048 have_ruby = False
2049 hiragana = ""
2050 katakana = ""
2051 for paren in parens:
2052 paren = paren.strip()
2053 if not paren: 2053 ↛ 2054line 2053 didn't jump to line 2054 because the condition on line 2053 was never true
2054 continue
2055 if paren.startswith("see "):
2056 continue
2057 if paren.startswith("U+"): 2057 ↛ 2058line 2057 didn't jump to line 2058 because the condition on line 2057 was never true
2058 continue
2059 # In some rare cases, strip a word that inflects from the form
2060 # description, e.g. "look through rose-tinted glasses"/English.
2061 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren)
2063 # If it starts with hiragana or katakana, treat it as such a form. Note
2064 # that each hiragana/katakana character is in separate parentheses,
2065 # so we must concatenate them.
2066 try:
2067 un = unicodedata.name(paren[0]).split()[0]
2068 except ValueError:
2069 un = "INVALID"
2070 if un == "KATAKANA": 2070 ↛ 2071line 2070 didn't jump to line 2071 because the condition on line 2070 was never true
2071 katakana += paren
2072 have_ruby = True
2073 continue
2074 if un == "HIRAGANA": 2074 ↛ 2075line 2074 didn't jump to line 2075 because the condition on line 2074 was never true
2075 hiragana += paren
2076 have_ruby = True
2077 continue
2079 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes,"
2080 # in the middle of the parenthesized expression, e.g. 薄
2081 def strokes_repl(m: re.Match) -> str:
2082 strokes1, tags1, strokes2, tags2 = m.groups()
2083 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]:
2084 tags = tags.split(", ")
2085 tags = list(
2086 "Mainland China" if t == "Mainland" else t for t in tags
2087 )
2088 tags.append("strokes")
2089 add_related(
2090 wxr,
2091 data,
2092 tags,
2093 [strokes],
2094 text,
2095 True,
2096 is_reconstruction,
2097 head_group,
2098 ruby,
2099 )
2100 return ", "
2102 paren = re.sub(
2103 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ",
2104 strokes_repl,
2105 paren,
2106 )
2108 descriptors = map_with(xlat_descs_map, [paren])
2109 new_desc = []
2110 for desc in descriptors:
2111 new_desc.extend(
2112 map_with(
2113 xlat_tags_map,
2114 split_at_comma_semi(desc, extra=[", or "], skipped=links),
2115 )
2116 )
2117 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None
2118 following_tags = None # Added to prev_tags from previous parenthesized
2119 # part, e.g. walrus/English
2120 # "(both nonstandard, proscribed, uncommon)"
2121 for desc_i, desc in enumerate(new_desc):
2122 # print("HEAD DESC: {!r}".format(desc))
2124 # Abort on certain descriptors (assume remaining values are
2125 # examples or uninteresting, cf. gaan/Navajo, horior/Latin)
2126 if re.match(r"^(per |e\.g\.$)", desc): 2126 ↛ 2127line 2126 didn't jump to line 2127 because the condition on line 2126 was never true
2127 break
2129 # If it all consists of CJK characters, add it with the
2130 # CJK tag. This is used at least for some Vietnamese
2131 # words (e.g., ba/Vietnamese)
2132 try:
2133 if all(unicodedata.name(x).startswith("CJK ") for x in desc): 2133 ↛ 2134line 2133 didn't jump to line 2134 because the condition on line 2133 was never true
2134 add_related(
2135 wxr,
2136 data,
2137 ["CJK"],
2138 [desc],
2139 text,
2140 True,
2141 is_reconstruction,
2142 head_group,
2143 ruby,
2144 )
2145 continue
2146 except ValueError:
2147 pass
2149 # Handle some special cases
2150 splitdesc = desc.split()
2151 if ( 2151 ↛ 2160line 2151 didn't jump to line 2160 because the condition on line 2151 was never true
2152 len(splitdesc) >= 3
2153 and splitdesc[1] == "superlative"
2154 and classify_desc(splitdesc[0]) != "tags"
2155 and prev_tags
2156 ):
2157 # Handle the special case of second comparative after comma,
2158 # followed by superlative without comma. E.g.
2159 # mal/Portuguese/Adv
2160 for ts in prev_tags:
2161 add_related(
2162 wxr,
2163 data,
2164 ts,
2165 [splitdesc[0]],
2166 text,
2167 True,
2168 is_reconstruction,
2169 head_group,
2170 ruby,
2171 )
2172 desc = " ".join(splitdesc[1:])
2173 elif ( 2173 ↛ 2181line 2173 didn't jump to line 2181 because the condition on line 2173 was never true
2174 len(splitdesc) == 2
2175 and splitdesc[0] in ("also", "and")
2176 and prev_tags
2177 and classify_desc(splitdesc[1]) != "tags"
2178 ):
2179 # Sometimes alternative forms are prefixed with "also" or
2180 # "and"
2181 for ts in prev_tags:
2182 add_related(
2183 wxr,
2184 data,
2185 ts,
2186 [splitdesc[1]],
2187 text,
2188 True,
2189 is_reconstruction,
2190 head_group,
2191 ruby,
2192 )
2193 continue
2194 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2194 ↛ 2195line 2194 didn't jump to line 2195 because the condition on line 2194 was never true
2195 continue
2197 # If only one word, assume it is comma-separated alternative
2198 # to the previous one
2199 if " " not in desc:
2200 cls = classify_desc(desc)
2201 if cls != "tags":
2202 if prev_tags: 2202 ↛ 2204line 2202 didn't jump to line 2204 because the condition on line 2202 was never true
2203 # Assume comma-separated alternative to previous one
2204 for ts in prev_tags:
2205 add_related(
2206 wxr,
2207 data,
2208 ts,
2209 [desc],
2210 text,
2211 True,
2212 is_reconstruction,
2213 head_group,
2214 ruby,
2215 )
2216 continue
2217 elif distw(titleparts, desc) <= 0.5: 2217 ↛ 2220line 2217 didn't jump to line 2220 because the condition on line 2217 was never true
2218 # Similar to head word, assume a dialectal variation to
2219 # the base form. Cf. go/Alemannic German/Verb
2220 add_related(
2221 wxr,
2222 data,
2223 ["alternative"],
2224 [desc],
2225 text,
2226 True,
2227 is_reconstruction,
2228 head_group,
2229 ruby,
2230 )
2231 continue
2232 elif ( 2232 ↛ 2253line 2232 didn't jump to line 2253 because the condition on line 2232 was always true
2233 cls in ("romanization", "english")
2234 and not have_romanization
2235 and classify_desc(titleword) == "other"
2236 and not (
2237 "categories" in data and desc in data["categories"]
2238 )
2239 ):
2240 # Assume it to be a romanization
2241 add_romanization(
2242 wxr,
2243 data,
2244 desc,
2245 text,
2246 is_reconstruction,
2247 head_group,
2248 ruby,
2249 )
2250 have_romanization = True
2251 continue
2253 m = re.match(r"^(\d+) strokes?$", desc)
2254 if m:
2255 # Special case, used to give #strokes for Han characters
2256 add_related(
2257 wxr,
2258 data,
2259 ["strokes"],
2260 [m.group(1)],
2261 text,
2262 True,
2263 is_reconstruction,
2264 head_group,
2265 ruby,
2266 )
2267 continue
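# Illustrative (hypothetical): a parenthesized descriptor "12 strokes"
# would be stored above as the form "12" with a "strokes" tag.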
2269 # See if it is radical+strokes
2270 m = re.match(
2271 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF"
2272 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)"
2273 r"( in (Japanese|Chinese|traditional Chinese|"
2274 r"simplified Chinese))?$",
2275 desc,
2276 )
2277 if m: 2277 ↛ 2280line 2277 didn't jump to line 2280 because the condition on line 2277 was never true
2278 # Special case, used to give radical + strokes for Han
2279 # characters
2280 radical_strokes = m.group(1)
2281 lang = m.group(3)
2282 t = ["radical+strokes"]
2283 if lang:
2284 t.extend(lang.split())
2285 add_related(
2286 wxr,
2287 data,
2288 t,
2289 [radical_strokes],
2290 text,
2291 True,
2292 is_reconstruction,
2293 head_group,
2294 ruby,
2295 )
2296 prev_tags = None
2297 following_tags = None
2298 continue
2300 # See if it indicates historical katakana orthography (←) or
2301 # otherwise just a katakana/hiragana form
2302 m = re.match(r"←\s*|kana\s+", desc)
2303 if m: 2303 ↛ 2304line 2303 didn't jump to line 2304 because the condition on line 2303 was never true
2304 if desc.startswith("←"):
2305 t1 = "historical "
2306 else:
2307 t1 = ""
2308 x = desc[m.end() :]
2309 if x.endswith("?"):
2310 x = x[:-1]
2311 # XXX should we add a tag indicating uncertainty?
2312 if x:
2313 name = unicodedata.name(x[0])
2314 if name.startswith("HIRAGANA "):
2315 desc = t1 + "hiragana " + x
2316 elif name.startswith("KATAKANA "):
2317 desc = t1 + "katakana " + x
2319 # See if it is "n strokes in Chinese" or similar
2320 m = re.match(
2321 r"(\d+) strokes in (Chinese|Japanese|"
2322 r"traditional Chinese|simplified Chinese)$",
2323 desc,
2324 )
2325 if m: 2325 ↛ 2327line 2325 didn't jump to line 2327 because the condition on line 2325 was never true
2326 # Special case, used to give just strokes for some Han chars
2327 strokes = m.group(1)
2328 lang = m.group(2)
2329 t = ["strokes"]
2330 t.extend(lang.split())
2331 add_related(
2332 wxr,
2333 data,
2334 t,
2335 [strokes],
2336 text,
2337 True,
2338 is_reconstruction,
2339 head_group,
2340 ruby,
2341 )
2342 prev_tags = None
2343 following_tags = None
2344 continue
2346 # American Sign Language has images (or requests for an image)
2347 # as heads, followed by this ASL gloss.
2348 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text)
2349 if m2: 2349 ↛ 2350line 2349 didn't jump to line 2350 because the condition on line 2349 was never true
2350 add_related(
2351 wxr,
2352 data,
2353 ["ASL-gloss"],
2354 [m2.group(1)],
2355 text,
2356 True,
2357 is_reconstruction,
2358 head_group,
2359 ruby,
2360 )
2361 continue
2363 parts = list(m.group(0) for m in re.finditer(word_re, desc))
2364 if not parts: 2364 ↛ 2365line 2364 didn't jump to line 2365 because the condition on line 2364 was never true
2365 prev_tags = None
2366 following_tags = None
2367 continue
2369 # Check for certain language-specific header part starts that
2370 # modify the tags applied to the following word
2371 if len(parts) == 2 and language in lang_specific_head_map: 2371 ↛ 2372line 2371 didn't jump to line 2372 because the condition on line 2371 was never true
2372 ht = lang_specific_head_map[language]
2373 if parts[0] in ht:
2374 rem_tags, add_tags = ht[parts[0]]
2375 new_prev_tags1: list[list[str]] = []
2376 tags2: Union[tuple[str, ...], list[str]]
2377 for tags2 in prev_tags or [()]:
2378 if rem_tags is True: # Remove all old tags
2379 tsets = set()
2380 else:
2381 tsets = set(tags2) - set(rem_tags.split())
2382 tsets = tsets | set(add_tags.split())
2383 tags = list(sorted(tsets))
2384 add_related(
2385 wxr,
2386 data,
2387 tags,
2388 [parts[1]],
2389 text,
2390 True,
2391 is_reconstruction,
2392 head_group,
2393 ruby,
2394 )
2395 new_prev_tags1.append(tags)
2396 prev_tags = new_prev_tags1
2397 following_tags = None
2398 continue
2400 # Handle the special case of descriptors that are parenthesized,
2401 # e.g., (archaic or Scotland)
2402 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc)
2403 if m is not None and classify_desc(m.group(1)) == "tags": 2403 ↛ 2404line 2403 didn't jump to line 2404 because the condition on line 2403 was never true
2404 tagpart = m.group(1)
2405 related = [m.group(2)]
2406 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True)
2407 if topics:
2408 wxr.wtp.debug(
2409 "parenthized head part {!r} contains topics: {}".format(
2410 tagpart, topics
2411 ),
2412 sortid="form_descriptions/1647",
2413 )
2414 elif m is not None and re.match(r"in the sense ", m.group(1)): 2414 ↛ 2417line 2414 didn't jump to line 2417 because the condition on line 2414 was never true
2415 # Handle certain ignored cases
2416 # e.g. bord/Danish: in the sense "plank"
2417 related = [m.group(2)]
2418 tagsets = [()]
2419 else:
2420 # Normal parsing of the descriptor
2421 alt_related = None
2422 alt_tagsets = None
2423 tagsets = None
2424 for i in range(len(parts), 0, -1): 2424 ↛ 2469line 2424 didn't jump to line 2469 because the loop on line 2424 didn't complete
2425 related = parts[i:]
2426 tagparts = parts[:i]
2427 # print(" i={} related={} tagparts={}"
2428 # .format(i, related, tagparts))
2429 tagsets, topics = decode_tags(
2430 " ".join(tagparts), no_unknown_starts=True
2431 )
2432 # print("tagparts={!r} tagsets={} topics={} related={} "
2433 # "alt_related={} distw={:.2f}"
2434 # .format(tagparts, tagsets, topics, related,
2435 # alt_related,
2436 # distw(titleparts, parts[i - 1])))
2437 if (
2438 topics
2439 or not tagsets
2440 or any("error-unknown-tag" in x for x in tagsets)
2441 ):
2442 if alt_related is not None: 2442 ↛ 2444line 2442 didn't jump to line 2444 because the condition on line 2442 was never true
2443 # We already had a good division, so let's stop.
2444 break
2445 # Bad division, try deeper
2446 continue
2447 if ( 2447 ↛ 2462line 2447 didn't jump to line 2462 because the condition on line 2447 was never true
2448 i > 1
2449 and len(parts[i - 1]) >= 4
2450 and distw(titleparts, parts[i - 1]) <= 0.4
2451 # Fixes wiktextract #983, where "participle"
2452 # was too close to "Martinize" and so this accepted
2453 # ["participle", "Martinize"] as matching; this
2454 # kludge prevents this from happening if titleparts
2455 # is shorter than what would be 'related'.
2456 # This breaks if we want to detect stuff that
2457 # actually gets an extra space-separated word when
2458 # 'inflected'.
2459 and len(titleparts) >= len(parts[i - 1:])
2460 ):
2461 # print(f"Reached; {parts=}, {parts[i-1]=}")
2462 alt_related = related
2463 alt_tagsets = tagsets
2464 continue
2465 alt_related = None
2466 alt_tagsets = None
2467 break
2468 else:
2469 if alt_related is None:
2470 # Check if the parenthesized part is likely a
2471 # romanization
2472 if (
2473 (have_ruby or classify_desc(base) == "other")
2474 and classify_desc(paren) == "romanization"
2475 and not (
2476 "categories" in data
2477 and desc in data["categories"]
2478 )
2479 ):
2480 for r in split_at_comma_semi(
2481 paren, extra=[" or "], skipped=links
2482 ):
2483 add_romanization(
2484 wxr,
2485 data,
2486 r,
2487 text,
2488 is_reconstruction,
2489 head_group,
2490 ruby,
2491 )
2492 have_romanization = True
2493 continue
2494 tagsets = [("error-unrecognized-head-form",)]
2495 wxr.wtp.debug(
2496 "unrecognized head form: {}".format(desc),
2497 sortid="form_descriptions/1698",
2498 )
2499 continue
2501 if alt_related is not None: 2501 ↛ 2502line 2501 didn't jump to line 2502 because the condition on line 2501 was never true
2502 related = alt_related
2503 tagsets = alt_tagsets
2505 # print("FORM END: tagsets={} related={}".format(tagsets, related))
2506 # print("==================")
2507 if not tagsets: 2507 ↛ 2508line 2507 didn't jump to line 2508 because the condition on line 2507 was never true
2508 continue
2510 assert isinstance(related, (list, tuple))
2511 related_str = " ".join(related)
2512 if "or" in titleparts:
2513 alts = [related_str]
2514 else:
2515 alts = split_at_comma_semi(
2516 related_str, separators=[" or "], skipped=links
2517 )
2518 if not alts:
2519 alts = [""]
2520 for related_str in alts:
2521 if related_str:
2522 if prev_tags and (
2523 all(
2524 all(
2525 t in ["nonstandard", "dialectal"]
2526 or valid_tags[t] == "dialect"
2527 for t in ts
2528 )
2529 for ts in tagsets
2530 )
2531 or (
2532 any("participle" in ts for ts in prev_tags)
2533 and all(
2534 "attributive" in ts
2535 or any(valid_tags[t] == "gender" for t in ts)
2536 for ts in tagsets
2537 )
2538 )
2539 ):
2540 # Merged with previous tags. Don't update previous
2541 # tags here; cf. burn/English/Verb
2542 for tags_l in tagsets:
2543 for ts in prev_tags:
2544 tags_l1 = list(sorted(set(tags_l) | set(ts)))
2545 add_related(
2546 wxr,
2547 data,
2548 tags_l1,
2549 [related_str],
2550 text,
2551 True,
2552 is_reconstruction,
2553 head_group,
2554 ruby,
2555 )
2556 else:
2557 # Not merged with previous tags
2558 for tags_l in tagsets:
2559 if following_tags is not None: 2559 ↛ 2560line 2559 didn't jump to line 2560 because the condition on line 2559 was never true
2560 for ts in following_tags:
2561 tags_l1 = list(
2562 sorted(set(tags_l) | set(ts))
2563 )
2564 add_related(
2565 wxr,
2566 data,
2567 tags_l1,
2568 [related_str],
2569 text,
2570 True,
2571 is_reconstruction,
2572 head_group,
2573 ruby,
2574 )
2575 else:
2576 ret = add_related(
2577 wxr,
2578 data,
2579 tags_l,
2580 [related_str],
2581 text,
2582 True,
2583 is_reconstruction,
2584 head_group,
2585 ruby,
2586 )
2587 if ret is not None: 2587 ↛ 2588line 2587 didn't jump to line 2588 because the condition on line 2587 was never true
2588 following_tags = ret
2589 prev_tags = tagsets
2590 else:
2591 if desc_i < len(new_desc) - 1 and all( 2591 ↛ 2598line 2591 didn't jump to line 2598 because the condition on line 2591 was never true
2592 "participle" in ts or "infinitive" in ts
2593 for ts in tagsets
2594 ):
2595 # Interpret it as a standalone form description
2596 # in the middle, probably followed by forms or
2597 # language-specific descriptors. cf. drikke/Danish
2598 new_prev_tags2 = []
2599 for ts1 in prev_tags or [()]:
2600 for ts2 in tagsets:
2601 ts = tuple(sorted(set(ts1) | set(ts2)))
2602 new_prev_tags2.append(ts)
2603 prev_tags = new_prev_tags2
2604 continue
2605 for tags in tagsets:
2606 data_extend(data, "tags", tags)
2607 prev_tags = tagsets
2608 following_tags = None
2610 # Finally, if we collected hiragana/katakana, add them now
2611 if hiragana: 2611 ↛ 2612line 2611 didn't jump to line 2612 because the condition on line 2611 was never true
2612 add_related(
2613 wxr,
2614 data,
2615 ["hiragana"],
2616 [hiragana],
2617 text,
2618 True,
2619 is_reconstruction,
2620 head_group,
2621 ruby,
2622 )
2623 if katakana: 2623 ↛ 2624line 2623 didn't jump to line 2624 because the condition on line 2623 was never true
2624 add_related(
2625 wxr,
2626 data,
2627 ["katakana"],
2628 [katakana],
2629 text,
2630 True,
2631 is_reconstruction,
2632 head_group,
2633 ruby,
2634 )
2636 # XXX check if this is actually relevant; tags in word root data
2637 # are extremely rare (not sure where they slip through).
2638 tags = data.get("tags", []) # type:ignore
2639 if len(tags) > 0:
2640 # wxr.wtp.debug(
2641 # f"Tags appear in word root data: {data['tags']=}", # type:ignore
2642 # sortid="form_descriptions/2620/20240606",
2643 # ) # Messes up tests.
2644 data["tags"] = list(sorted(set(tags))) # type:ignore
2647def parse_sense_qualifier(
2648 wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData]
2649) -> None:
2650 """Parses tags or topics for a sense or some other data. The values are
2651 added into the dictionary ``data``."""
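# Illustrative sketch (hypothetical input; actual tag names depend on the
# tag tables): a qualifier string like "archaic, of a bird" would typically
# add "archaic" to data["tags"] and keep "of a bird" (classified as
# English) in data["qualifier"].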
2652 assert isinstance(wxr, WiktextractContext)
2653 assert isinstance(text, str)
2654 assert isinstance(data, dict)
2655 # print("parse_sense_qualifier:", text)
2656 if re.match(r"\([^()]+\)$", text): 2656 ↛ 2657line 2656 didn't jump to line 2657 because the condition on line 2656 was never true
2657 text = text[1:-1]
2658 if re.match(r'"[^"]+"$', text): 2658 ↛ 2659line 2658 didn't jump to line 2659 because the condition on line 2658 was never true
2659 text = text[1:-1]
2660 lst = map_with(xlat_descs_map, [text])
2661 sense_tags: list[str] = []
2662 for text in lst:
2663 for semi in split_at_comma_semi(text):
2664 if not semi: 2664 ↛ 2665line 2664 didn't jump to line 2665 because the condition on line 2664 was never true
2665 continue
2666 orig_semi = semi
2667 idx = semi.find(":")
2668 if idx >= 0: 2668 ↛ 2669line 2668 didn't jump to line 2669 because the condition on line 2668 was never true
2669 semi = semi[:idx]
2670 cls = classify_desc(semi, allow_unknown_tags=True)
2671 # print("parse_sense_qualifier: classify_desc: {} -> {}"
2672 # .format(semi, cls))
2673 if cls == "tags": 2673 ↛ 2682line 2673 didn't jump to line 2682 because the condition on line 2673 was always true
2674 tagsets, topics = decode_tags(semi)
2675 data_extend(data, "topics", topics)
2676 # XXX should think how to handle distinct options better,
2677 # e.g., "singular and plural genitive"; that can't really be
2678 # done without changing the calling convention of this function.
2679 # Should split sense if more than one category of tags differs.
2680 for tags in tagsets:
2681 sense_tags.extend(tags)
2682 elif cls == "taxonomic":
2683 if re.match(r"×[A-Z]", semi):
2684 sense_tags.append("extinct")
2685 semi = semi[1:]
2686 data["taxonomic"] = semi
2687 elif cls == "english":
2688 if "qualifier" in data and data["qualifier"] != orig_semi:
2689 data["qualifier"] += "; " + orig_semi
2690 else:
2691 data["qualifier"] = orig_semi
2692 else:
2693 wxr.wtp.debug(
2694 "unrecognized sense qualifier: {}".format(text),
2695 sortid="form_descriptions/1831",
2696 )
2697 sense_tags = list(sorted(set(sense_tags)))
2698 data_extend(data, "tags", sense_tags)
2701def parse_pronunciation_tags(
2702 wxr: WiktextractContext, text: str, data: SoundData
2703) -> None:
2704 assert isinstance(wxr, WiktextractContext)
2705 assert isinstance(text, str)
2706 assert isinstance(data, dict)
2707 text = text.strip()
2708 if not text: 2708 ↛ 2709line 2708 didn't jump to line 2709 because the condition on line 2708 was never true
2709 return
2710 cls = classify_desc(text)
2711 notes = []
2712 if cls == "tags":
2713 tagsets, topics = decode_tags(text)
2714 data_extend(data, "topics", topics)
2715 for tagset in tagsets:
2716 for t in tagset:
2717 if " " in t: 2717 ↛ 2718line 2717 didn't jump to line 2718 because the condition on line 2717 was never true
2718 notes.append(t)
2719 else:
2720 data_append(data, "tags", t)
2721 else:
2722 notes.append(text)
2723 if notes:
2724 data["note"] = "; ".join(notes)
2727def parse_translation_desc(
2728 wxr: WiktextractContext, lang: str, text: str, tr: TranslationData
2729) -> None:
2730 assert isinstance(wxr, WiktextractContext)
2731 assert isinstance(lang, str) # The language of ``text``
2732 assert isinstance(text, str)
2733 assert isinstance(tr, dict)
2734 # print("parse_translation_desc:", text)
2736 # Process all parenthesized parts from the translation item
2737 note = None
2738 restore_beginning = ""
2739 restore_end = ""
2740 while True:
2741 beginning = False
2742 # See if we can find a parenthesized expression at the end
2743 m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text)
2744 if m:
2745 par = m.group(1)
2746 text = text[: m.start()]
2747 if par.startswith(("literally ", "lit.")): 2747 ↛ 2748line 2747 didn't jump to line 2748 because the condition on line 2747 was never true
2748 continue # Not useful for disambiguation in many idioms
2749 else:
2750 # See if we can find a parenthesized expression at the start
2751 m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text)
2752 if m: 2752 ↛ 2753line 2752 didn't jump to line 2753 because the condition on line 2752 was never true
2753 par = m.group(1)
2754 text = text[m.end() :]
2755 beginning = True
2756 if re.match(r"^(\d|\s|,| or | and )+$", par):
2757 # Looks like this beginning parenthesized expression only
2758 # contains digits or their combinations. We assume such
2759 # to be sense descriptions if no sense has been selected,
2760 # or otherwise just ignore them.
2761 if not tr.get("sense"):
2762 tr["sense"] = par
2763 continue
2764 else:
2765 # See if we can find a parenthesized expression in the middle.
2766 # Romanizations are sometimes between word and gender marker,
2767 # e.g. wife/English/Tr/Yiddish.
2768 m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text)
2769 if m: 2769 ↛ 2770line 2769 didn't jump to line 2770 because the condition on line 2769 was never true
2770 par = m.group(1)
2771 text = text[: m.start()] + text[m.end() :]
2772 else:
2773 # No more parenthesized expressions - break out of the loop
2774 break
2776 # Some cleanup of artifacts that may result from skipping some templates
2777 # in earlier stages
2778 if par.startswith(": "): 2778 ↛ 2779line 2778 didn't jump to line 2779 because the condition on line 2778 was never true
2779 par = par[2:]
2780 if par.endswith(","): 2780 ↛ 2781line 2780 didn't jump to line 2781 because the condition on line 2780 was never true
2781 par = par[:-1]
2782 if re.match(r'^[“"]([^“”"]*)[“”"]$', par): 2782 ↛ 2783line 2782 didn't jump to line 2783 because the condition on line 2782 was never true
2783 par = par[1:-1]
2784 par = par.strip()
2786 # Check for special script pronunciation followed by romanization,
2787 # used in many Asian languages.
2788 lst = par.split(", ")
2789 if len(lst) == 2: 2789 ↛ 2790line 2789 didn't jump to line 2790 because the condition on line 2789 was never true
2790 a, r = lst
2791 if classify_desc(a) == "other":
2792 cls = classify_desc(r)
2793 # print("parse_translation_desc: r={} cls={}".format(r, cls))
2794 if cls == "romanization" or (
2795 cls == "english" and len(r.split()) == 1 and r[0].islower()
2796 ):
2797 if tr.get("alt") and tr.get("alt") != a:
2798 wxr.wtp.debug(
2799 'more than one value in "alt": {} vs. {}'.format(
2800 tr["alt"], a
2801 ),
2802 sortid="form_descriptions/1930",
2803 )
2804 tr["alt"] = a
2805 if tr.get("roman") and tr.get("roman") != r:
2806 wxr.wtp.debug(
2807 'more than one value in "roman": '
2808 "{} vs. {}".format(tr["roman"], r),
2809 sortid="form_descriptions/1936",
2810 )
2811 tr["roman"] = r
2812 continue
2814 # Check for certain comma-separated tags combined with English text
2815 # at the beginning or end of a comma-separated parenthesized list
2816 while len(lst) > 1: 2816 ↛ 2817line 2816 didn't jump to line 2817 because the condition on line 2816 was never true
2817 cls = classify_desc(lst[0])
2818 if cls == "tags":
2819 tagsets, topics = decode_tags(lst[0])
2820 for t in tagsets:
2821 data_extend(tr, "tags", t)
2822 data_extend(tr, "topics", topics)
2823 lst = lst[1:]
2824 continue
2825 cls = classify_desc(lst[-1])
2826 if cls == "tags":
2827 tagsets, topics = decode_tags(lst[-1])
2828 for t in tagsets:
2829 data_extend(tr, "tags", t)
2830 data_extend(tr, "topics", topics)
2831 lst = lst[:-1]
2832 continue
2833 break
2834 par = ", ".join(lst)
2836 if not par: 2836 ↛ 2837line 2836 didn't jump to line 2837 because the condition on line 2836 was never true
2837 continue
2838 if re.search(tr_ignored_parens_re, par): 2838 ↛ 2839line 2838 didn't jump to line 2839 because the condition on line 2838 was never true
2839 continue
2840 if par.startswith("numeral:"): 2840 ↛ 2841line 2840 didn't jump to line 2841 because the condition on line 2840 was never true
2841 par = par[8:].strip()
2843 # Classify the parenthesized part and process it accordingly
2844 cls = classify_desc(par)
2845 # print("parse_translation_desc classify: {!r} -> {}"
2846 # .format(par, cls))
2847 if par == text: 2847 ↛ 2848line 2847 didn't jump to line 2848 because the condition on line 2847 was never true
2848 pass
2849 if par == "f": 2849 ↛ 2850line 2849 didn't jump to line 2850 because the condition on line 2849 was never true
2850 data_append(tr, "tags", "feminine")
2851 elif par == "m": 2851 ↛ 2852line 2851 didn't jump to line 2852 because the condition on line 2851 was never true
2852 data_append(tr, "tags", "masculine")
2853 elif cls == "tags": 2853 ↛ 2854line 2853 didn't jump to line 2854 because the condition on line 2853 was never true
2854 tagsets, topics = decode_tags(par)
2855 for tags in tagsets:
2856 data_extend(tr, "tags", tags)
2857 data_extend(tr, "topics", topics)
2858 elif cls == "english":
2859 # If the text contains any of certain grammatical words, treat it
2860 # as a "note" instead of "english"
2861 if re.search(tr_note_re, par): 2861 ↛ 2862line 2861 didn't jump to line 2862 because the condition on line 2861 was never true
2862 if par.endswith(":"):
2863 par = par[:-1]
2864 if par not in ("see entry for forms",):
2865 if note:
2866 note = note + ";" + par
2867 else:
2868 note = par
2869 else:
2870 # There can be more than one parenthesized english item, see
2871 # e.g. Aunt/English/Translations/Tamil
2872 if tr.get("english"): 2872 ↛ 2873line 2872 didn't jump to line 2873 because the condition on line 2872 was never true
2873 tr["english"] += "; " + par
2874 else:
2875 tr["english"] = par
2876 elif cls == "romanization": 2876 ↛ 2896line 2876 didn't jump to line 2896 because the condition on line 2876 was always true
2877 # print("roman text={!r} text cls={}"
2878 # .format(text, classify_desc(text)))
2879 if classify_desc(text) in ( 2879 ↛ 2883line 2879 didn't jump to line 2883 because the condition on line 2879 was never true
2880 "english",
2881 "romanization",
2882 ) and lang not in ("Egyptian",):
2883 if beginning:
2884 restore_beginning += "({}) ".format(par)
2885 else:
2886 restore_end = " ({})".format(par) + restore_end
2887 else:
2888 if tr.get("roman"): 2888 ↛ 2889line 2888 didn't jump to line 2889 because the condition on line 2888 was never true
2889 wxr.wtp.debug(
2890 'more than one value in "roman": {} vs. {}'.format(
2891 tr["roman"], par
2892 ),
2893 sortid="form_descriptions/2013",
2894 )
2895 tr["roman"] = par
2896 elif cls == "taxonomic":
2897 if tr.get("taxonomic"):
2898 wxr.wtp.debug(
2899 'more than one value in "taxonomic": {} vs. {}'.format(
2900 tr["taxonomic"], par
2901 ),
2902 sortid="form_descriptions/2019",
2903 )
2904 if re.match(r"×[A-Z]", par):
2905 data_append(tr, "tags", "extinct")
2906 par = par[1:]
2907 tr["taxonomic"] = par
2908 elif cls == "other":
2909 if tr.get("alt"):
2910 wxr.wtp.debug(
2911 'more than one value in "alt": {} vs. {}'.format(
2912 tr["alt"], par
2913 ),
2914 sortid="form_descriptions/2028",
2915 )
2916 tr["alt"] = par
2917 else:
2918 wxr.wtp.debug(
2919 "parse_translation_desc unimplemented cls {}: {}".format(
2920 cls, par
2921 ),
2922 sortid="form_descriptions/2033",
2923 )
2925 # Check for gender indications in suffix
2926 text, final_tags = parse_head_final_tags(wxr, lang, text)
2927 data_extend(tr, "tags", final_tags)
2929 # Restore those parts that we did not want to remove (they are often
2930 # optional words or words that are always used with the given translation)
2931 text = restore_beginning + text + restore_end
2933 if note: 2933 ↛ 2934line 2933 didn't jump to line 2934 because the condition on line 2933 was never true
2934 tr["note"] = note.strip()
2935 if text and text not in ignored_translations: 2935 ↛ 2940line 2935 didn't jump to line 2940 because the condition on line 2935 was always true
2936 tr["word"] = text.strip()
2938 # Sometimes gender seems to be at the end of "roman" field, see e.g.
2939 # fire/English/Noun/Translations/Egyptian (for "oxidation reaction")
2940 roman = tr.get("roman")
2941 if roman:
2942 if roman.endswith(" f"): 2942 ↛ 2943line 2942 didn't jump to line 2943 because the condition on line 2942 was never true
2943 data_append(tr, "tags", "feminine")
2944 tr["roman"] = roman[:-2].strip()
2945 elif roman.endswith(" m"): 2945 ↛ 2946line 2945 didn't jump to line 2946 because the condition on line 2945 was never true
2946 data_append(tr, "tags", "masculine")
2947 tr["roman"] = roman[:-2].strip()
2949 # If the word now has "english" field but no "roman" field, and
2950 # the word would be classified "other" (generally non-latin
2951 # characters), and the value in "english" is only one lowercase
2952 # word, move it to "roman". This happens semi-frequently when the
2953 # translation is transliterated the same as some English word.
2954 roman = tr.get("roman")
2955 english = tr.get("english")
2956 if english and not roman and "word" in tr:
2957 cls = classify_desc(tr["word"])
2958 if cls == "other" and " " not in english and english[0].islower(): 2958 ↛ 2965line 2958 didn't jump to line 2965 because the condition on line 2958 was always true
2959 del tr["english"]
2960 tr["roman"] = english
2962 # If the entry now has both tr["roman"] and tr["word"] and they have
2963 # the same value, delete tr["roman"] (e.g., man/English/Translations
2964 # Evenki)
2965 if tr.get("word") and tr.get("roman") == tr.get("word"): 2965 ↛ 2966line 2965 didn't jump to line 2966 because the condition on line 2965 was never true
2966 del tr["roman"]
2969def parse_alt_or_inflection_of(
2970 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
2971) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
2972 """Tries to parse an inflection-of or alt-of description. If successful,
2973 this returns (tags, alt-of/inflection-of-dict). If the description cannot
2974 be parsed, this returns None. This may also return (tags, None) when the
2975 gloss describes a form (or some other tags were extracted from it), but
2976 there was no alt-of/form-of/synonym-of word."""
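# Illustrative sketch (hypothetical gloss; the exact tag set depends on the
# tag tables): a gloss like "genitive plural of talo" would typically parse
# to roughly (["form-of", "genitive", "plural"], [{"word": "talo"}]).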
2977 # print("parse_alt_or_inflection_of: {!r}".format(gloss))
2978 # Occasionally inflection_of/alt_of have "A(n) " etc. at the beginning.
2980 # Never interpret a gloss that is equal to the word itself as a tag
2981 # (e.g., instrumental/Romanian, instrumental/Spanish).
2982 if gloss.lower() == wxr.wtp.title.lower() or ( # type:ignore[union-attr] 2982 ↛ 2985line 2982 didn't jump to line 2985 because the condition on line 2982 was never true
2983 len(gloss) >= 5 and distw([gloss.lower()], wxr.wtp.title.lower()) < 0.2 # type:ignore[union-attr]
2984 ):
2985 return None
2987 # First try parsing it as-is
2988 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
2989 if parsed is not None:
2990 return parsed
2992 # Next try parsing it with the first character converted to lowercase if
2993 # it was previously uppercase.
2994 if gloss and gloss[0].isupper():
2995 gloss = gloss[0].lower() + gloss[1:]
2996 parsed = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
2997 if parsed is not None: 2997 ↛ 2998line 2997 didn't jump to line 2998 because the condition on line 2997 was never true
2998 return parsed
3000 return None
3003# These tags are not allowed in alt-or-inflection-of parsing
3004alt_infl_disallowed: set[str] = set(
3005 [
3006 "error-unknown-tag",
3007 "place", # Not in inflected forms and causes problems e.g. house/English
3008 ]
3009)
3012def parse_alt_or_inflection_of1(
3013 wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
3014) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
3015 """Helper function for parse_alt_or_inflection_of. This handles a single
3016 capitalization."""
3017 if not gloss or not gloss.strip(): 3017 ↛ 3018line 3017 didn't jump to line 3018 because the condition on line 3017 was never true
3018 return None
3020 # Prevent some common errors where we would parse something we shouldn't
3021 if re.search(r"(?i)form of address ", gloss): 3021 ↛ 3022line 3021 didn't jump to line 3022 because the condition on line 3021 was never true
3022 return None
3024 gloss = re.sub(r"only used in [^,]+, ", "", gloss)
3026 # First try all formats ending with "of" (or other known last words that
3027 # can end a form description)
3028 matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss))
3029 m: Optional[re.Match]
3030 for m in reversed(matches):
3031 desc = gloss[: m.end()].strip()
3032 base = gloss[m.end() :].strip()
3033 tagsets, topics = decode_tags(desc, no_unknown_starts=True)
3034 if not topics and any(
3035 not (alt_infl_disallowed & set(ts)) for ts in tagsets
3036 ):
3037 # Successfully parsed, including "of" etc.
3038 tags: list[str] = []
3039 # If you have ("Western-Armenian", ..., "form-of") as your
3040 # tag set, it's most probable that it's something like
3041 # "Western Armenian form of խոսել (xosel)", which should
3042 # get "alt-of" instead of "form-of" (inflection).
3043 # խօսիլ/Armenian
3044 for ts_t in tagsets:
3045 if "form-of" in ts_t and any(
3046 valid_tags.get(tk) == "dialect" for tk in ts_t
3047 ):
3048 ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"}
3049 else:
3050 ts_s = set(ts_t)
3051 if not (alt_infl_disallowed & ts_s): 3051 ↛ 3044line 3051 didn't jump to line 3044 because the condition on line 3051 was always true
3052 tags.extend(ts_s)
3053 if (
3054 "alt-of" in tags
3055 or "form-of" in tags
3056 or "synonym-of" in tags
3057 or "compound-of" in tags
3058 ):
3059 break
3060 if m.group(1) == "of":
3061 # Try parsing without the final "of". This is commonly used in
3062 # various form-of expressions.
3063 desc = gloss[: m.start()]
3064 base = gloss[m.end() :]
3065 tagsets, topics = decode_tags(desc, no_unknown_starts=True)
3066 # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}"
3067 # .format(desc, base, tagsets, topics))
3068 if not topics and any(
3069 not (alt_infl_disallowed & set(t)) for t in tagsets
3070 ):
3071 tags = []
3072 for t in tagsets:
3073 if not (alt_infl_disallowed & set(t)): 3073 ↛ 3072line 3073 didn't jump to line 3072 because the condition on line 3073 was always true
3074 tags.extend(t)
3075 # It must have at least one tag from form_of_tags
3076 if set(tags) & form_of_tags: 3076 ↛ 3080line 3076 didn't jump to line 3080 because the condition on line 3076 was always true
3077 # Accept this as form-of
3078 tags.append("form-of")
3079 break
3080 if set(tags) & alt_of_tags:
3081 # Accept this as alt-of
3082 tags.append("alt-of")
3083 break
3085 else:
3086 # Did not find a form description based on last word; see if the
3087 # whole description is tags
3088 tagsets, topics = decode_tags(gloss, no_unknown_starts=True)
3089 if not topics and any(
3090 not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts)
3091 for ts in tagsets
3092 ):
3093 tags = []
3094 for ts in tagsets:
3095 if not (alt_infl_disallowed & set(ts)) and form_of_tags & set( 3095 ↛ 3094line 3095 didn't jump to line 3094 because the condition on line 3095 was always true
3096 ts
3097 ):
3098 tags.extend(ts)
3099 base = ""
3100 else:
3101 return None
3103 # kludge for Spanish (again): 'x of [word] combined with [clitic]'
3104 m = re.search(r"combined with \w+$", base)
3105 if m: 3105 ↛ 3106line 3105 didn't jump to line 3106 because the condition on line 3105 was never true
3106 tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True)
3107 if not topics:
3108 for ts in tagsets:
3109 tags.extend(ts)
3110 base = base[: m.start()]
3112 # It is fairly common for form_of glosses to end with something like
3113 # "ablative case" or "in instructive case". Parse that ending.
3114 base = base.strip()
3115 lst = base.split()
3116 # print("parse_alt_or_inflection_of: lst={}".format(lst))
3117 if len(lst) >= 3 and lst[-1] in ("case", "case."): 3117 ↛ 3118line 3117 didn't jump to line 3118 because the condition on line 3117 was never true
3118 node = valid_sequences.children.get(lst[-2])
3119 if node and node.end:
3120 for s in node.tags:
3121 tags.extend(s.split(" "))
3122 lst = lst[:-2]
3123 if lst[-1] == "in" and len(lst) > 1:
3124 lst = lst[:-1]
3126 # Eliminate empty and duplicate tags
3127 tags = list(sorted(set(t for t in tags if t)))
3129 # Clean up some extra stuff from the linked word, separating the text
3130 # into ``base`` (the linked word) and ``extra`` (additional information,
3131 # such as English translation or clarifying word sense information).
3132 orig_base = base
3133 base = re.sub(alt_of_form_of_clean_re, "", orig_base)
3134 base = re.sub(r" [(⟨][^()]*[)⟩]", "", base) # Remove all (...) groups
3135 extra = orig_base[len(base) :]
3136 extra = re.sub(r"^[- :;.,,—]+", "", extra)
3137 if extra.endswith(".") and extra.count(".") == 1: 3137 ↛ 3138line 3137 didn't jump to line 3138 because the condition on line 3137 was never true
3138 extra = extra[:-1].strip()
3139 m = re.match(r"^\(([^()]*)\)$", extra)
3140 if m: 3140 ↛ 3141line 3140 didn't jump to line 3141 because the condition on line 3140 was never true
3141 extra = m.group(1)
3142 else:
3143 # These weird brackets are used in "slash mark"
3144 m = re.match(r"^⟨([^()]*)⟩$", extra)
3145 if m: 3145 ↛ 3146line 3145 didn't jump to line 3146 because the condition on line 3145 was never true
3146 extra = m.group(1)
3147 m = re.match(r'^[“"]([^"“”]*)["”]$', extra)
3148 if m: 3148 ↛ 3149line 3148 didn't jump to line 3149 because the condition on line 3148 was never true
3149 extra = m.group(1)
3150 # Note: base might still contain comma-separated values and values
3151 # separated by "and"
3152 base = base.strip()
3153 if base.endswith(",") and len(base) > 2: 3153 ↛ 3154line 3153 didn't jump to line 3154 because the condition on line 3153 was never true
3154 base = base[:-1].strip()
3155 while (
3156 base.endswith(".")
3157 and not wxr.wtp.page_exists(base)
3158 and base not in gloss_template_args
3159 ):
3160 base = base[:-1].strip()
3161 if base.endswith('(\u201cconjecture")'): 3161 ↛ 3162line 3161 didn't jump to line 3162 because the condition on line 3161 was never true
3162 base = base[:-14].strip()
3163 tags.append("conjecture")
3164 while ( 3164 ↛ 3169line 3164 didn't jump to line 3169 because the condition on line 3164 was never true
3165 base.endswith(".")
3166 and not wxr.wtp.page_exists(base)
3167 and base not in gloss_template_args
3168 ):
3169 base = base[:-1].strip()
3170 if ( 3170 ↛ 3175line 3170 didn't jump to line 3175 because the condition on line 3170 was never true
3171 base.endswith(".")
3172 and base not in gloss_template_args
3173 and base[:-1] in gloss_template_args
3174 ):
3175 base = base[:-1]
3176 base = base.strip()
3177 if not base:
3178 return tags, None
3180 # Kludge: Spanish verb forms seem to have a dot added at the end.
3181 # Remove it; we know of no Spanish verbs ending with a dot.
3182 language = wxr.wtp.section
3183 pos = wxr.wtp.subsection
3184 # print("language={} pos={} base={}".format(language, pos, base))
3185 if ( 3185 ↛ 3191line 3185 didn't jump to line 3191 because the condition on line 3185 was never true
3186 base.endswith(".")
3187 and len(base) > 1
3188 and base[-2].isalpha()
3189 and (language == "Spanish" and pos == "Verb")
3190 ):
3191 base = base[:-1]
3193 # Split base into alternatives when multiple alternatives are provided
3194 parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "])
3195 titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "")
3196 if ( 3196 ↛ 3205line 3196 didn't jump to line 3205 because the condition on line 3196 was always true
3197 len(parts) <= 1
3198 or base.startswith("/")
3199 or base.endswith("/")
3200 or "/" in titleword
3201 ):
3202 parts = [base]
3203 # Split base into alternatives when it has the form "a or b" and "a" and "b" are
3204 # similar (generally spelling variants of the same word or similar words)
3205 if len(parts) == 1: 3205 ↛ 3211line 3205 didn't jump to line 3211 because the condition on line 3205 was always true
3206 pp = base.split()
3207 if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4:
3208 parts = [pp[0], pp[2]]
3210 # Create form-of/alt-of entries based on the extracted data
3211 dt_lst: list[AltOf] = []
3212 for p in parts:
3213 # Check for some suspicious base forms
3214 m = re.search(r"[.,] |[{}()]", p)
3215 if m and not wxr.wtp.page_exists(p): 3215 ↛ 3216line 3215 didn't jump to line 3216 because the condition on line 3215 was never true
3216 wxr.wtp.debug(
3217 "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p),
3218 sortid="form_descriptions/2278",
3219 )
3220 if p.startswith("*") and len(p) >= 3 and p[1].isalpha(): 3220 ↛ 3221line 3220 didn't jump to line 3221 because the condition on line 3220 was never true
3221 p = p[1:]
3222 dt: AltOf = {"word": p}
3223 if extra:
3224 dt["extra"] = extra
3225 dt_lst.append(dt)
3226 # print("alt_or_infl_of returning tags={} lst={} base={!r}"
3227 # .format(tags, lst, base))
3228 return tags, dt_lst
3231@functools.lru_cache(maxsize=65536)
3232def classify_desc(
3233 desc: str,
3234 allow_unknown_tags=False,
3235 no_unknown_starts=False,
3236 accepted: Union[tuple[str, ...], frozenset[str]] = tuple(),
3237) -> str:
3238 """Determines whether the given description is most likely tags, english,
3239 a romanization, or something else. Returns one of: "tags", "english",
3240 "romanization", or "other". If ``allow_unknown_tags`` is True, then
3241 allow "tags" classification even when the only tags are those starting
3242 with a word in allowed_unknown_starts."""
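# Illustrative classifications (hypothetical inputs; actual results depend
# on the tag tables and word lists):
#   "feminine plural"  -> likely "tags"
#   "a type of dance"  -> likely "english"
#   "kniha"            -> likely "romanization"
#   "слово"            -> likely "other" (non-Latin script)
#   "Homo sapiens"     -> likely "taxonomic"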
3243 assert isinstance(desc, str)
3244 # Empty and whitespace-only strings are treated as "other"
3245 desc = desc.strip()
3246 if not desc:
3247 return "other"
3249 normalized_desc = unicodedata.normalize("NFKD", desc)
3251 # If it can be fully decoded as tags without errors, treat as tags
3252 tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts)
3253 for tagset in tagsets:
3254 assert isinstance(tagset, (list, tuple, set))
3255 if "error-unknown-tag" not in tagset and (
3256 topics or allow_unknown_tags or any(" " not in x for x in tagset)
3257 ):
3258 return "tags"
3260 # Check if it looks like the taxonomic name of a species
3261 if desc in known_species:
3262 return "taxonomic"
3263 desc1 = re.sub(r"^×([A-Z])", r"\1", desc)
3264 desc1 = re.sub(r"\s*×.*", "", desc1)
3265 lst = desc1.split()
3266 if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts:
3267 have_non_english = 1 if lst[0].lower() not in english_words else 0
3268 for x in lst[1:]:
3269 if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"):
3270 continue
3271 if x[0].isupper():
3272 break
3273 if x not in english_words:
3274 have_non_english += 1
3275 else:
3276 # Starts with known taxonomic term, does not contain uppercase
3277 # words (except allowed letters) and at least one word is not
3278 # English
3279 if have_non_english >= len(lst) - 1 and have_non_english > 0: 3279 ↛ 3285line 3279 didn't jump to line 3285 because the condition on line 3279 was always true
3280 return "taxonomic"
3282 # If all words are in our English dictionary, interpret as English.
3283 # [ -~] is regex black magic, "ALL CHARACTERS from space to tilde"
3284 # in ASCII. Took me a while to figure out.
3285 if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1:
3286 if desc in english_words and desc[0].isalpha():
3287 return "english" # Handles ones containing whitespace
3288 desc1 = re.sub(
3289 tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc
3290 )
3291 tokens = tokenizer.tokenize(desc1)
3292 if not tokens: 3292 ↛ 3293line 3292 didn't jump to line 3293 because the condition on line 3292 was never true
3293 return "other"
3294 lst_bool = list(
3295 x not in not_english_words
3296 and
3297 # not x.isdigit() and
3298 (
3299 x in english_words
3300 or x.lower() in english_words
3301 or x in known_firsts
3302 or x[0].isdigit()
3303 or x in accepted
3304 or
3305 # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or
3306 (
3307 x.endswith("s") and len(x) >= 4 and x[:-1] in english_words
3308 ) # Plural
3309 or (
3310 x.endswith("ies")
3311 and len(x) >= 5
3312 and x[:-3] + "y" in english_words
3313 ) # E.g. lily - lilies
3314 or (
3315 x.endswith("ing")
3316 and len(x) >= 5
3317 and x[:-3] in english_words
3318 ) # E.g. bring - bringing
3319 or (
3320 x.endswith("ing")
3321 and len(x) >= 5
3322 and x[:-3] + "e" in english_words
3323 ) # E.g., tone - toning
3324 or (
3325 x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words
3326 ) # E.g. hang - hanged
3327 or (
3328 x.endswith("ed")
3329 and len(x) >= 5
3330 and x[:-2] + "e" in english_words
3331 ) # E.g. atone - atoned
3332 or (x.endswith("'s") and x[:-2] in english_words)
3333 or (x.endswith("s'") and x[:-2] in english_words)
3334 or (
3335 x.endswith("ise")
3336 and len(x) >= 5
3337 and x[:-3] + "ize" in english_words
3338 )
3339 or (
3340 x.endswith("ised")
3341 and len(x) >= 6
3342 and x[:-4] + "ized" in english_words
3343 )
3344 or (
3345 x.endswith("ising")
3346 and len(x) >= 7
3347 and x[:-5] + "izing" in english_words
3348 )
3349 or (
3350 re.search(r"[-/]", x)
3351 and all(
3352 ((y in english_words and len(y) > 2) or not y)
3353 for y in re.split(r"[-/]", x)
3354 )
3355 )
3356 )
3357 for x in tokens
3358 )
3359 cnt = lst_bool.count(True)
3360 rejected_words = tuple(
3361 x for i, x in enumerate(tokens) if not lst_bool[i]
3362 )
3363 if (
3364 any(
3365 lst_bool[i] and x[0].isalpha() and len(x) > 1
3366 for i, x in enumerate(tokens)
3367 )
3368 and not desc.startswith("-")
3369 and not desc.endswith("-")
3370 and re.search(r"\w+", desc)
3371 and (
3372 cnt == len(lst_bool)
3373 or (
3374 any(
3375 lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens)
3376 )
3377 and cnt >= len(lst_bool) - 1
3378 )
3379 or cnt / len(lst_bool) >= 0.8
3380 or (
3381 all(x in potentially_english_words for x in rejected_words)
3382 and cnt / len(lst_bool) >= 0.50
3383 )
3384 )
3385 ):
3386 return "english"
3387 # Some translations have apparent pronunciation descriptions in /.../
3388 # which we'll put in the romanization field (even though they probably are
3389 # not exactly romanizations).
3390 if desc.startswith("/") and desc.endswith("/"):
3391 return "romanization"
3392 # If all characters are in classes that could occur in romanizations,
3393 # treat as romanization
3394 classes = list(
3395 unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK"
3396 for x in normalized_desc
3397 )
3398 classes1 = []
3399 num_latin = 0
3400 num_greek = 0
3401 # part = ""
3402 # for ch, cl in zip(normalized_desc, classes):
3403 # part += f"{ch}({cl})"
3404 # print(part)
3405 for ch, cl in zip(normalized_desc, classes):
3406 if ch in (
3407 "'", # ' in Arabic, / in IPA-like parenthesized forms
3408 ".", # e.g., "..." in translations
3409 ";",
3410 ":",
3411 "!",
3412 "‘",
3413 "’",
3414 '"',
3415 "“",
3416 "”",
3417 "/",
3418 "?",
3419 "…", # alternative to "..."
3420 "⁉", # 見る/Japanese automatic transcriptions...
3421 "?",
3422 "!",
3423 "⁻", # superscript -, used in some Cantonese roman, e.g. "we"
3424 "ʔ",
3425 "ʼ",
3426 "ʾ",
3427 "ʹ",
3428 ): # ʹ e.g. in understand/English/verb Russian transl
3429 classes1.append("OK")
3430 continue
3431 if cl not in ("Ll", "Lu"):
3432 classes1.append(cl)
3433 continue
3434 try:
3435 name = unicodedata.name(ch)
3436 first = name.split()[0]
3437 if first == "LATIN":
3438 num_latin += 1
3439 elif first == "GREEK":
3440 num_greek += 1
3441 elif first == "COMBINING": # Combining diacritic 3441 ↛ 3442line 3441 didn't jump to line 3442 because the condition on line 3441 was never true
3442 cl = "OK"
3443 elif re.match(non_latin_scripts_re, name): 3443 ↛ 3447line 3443 didn't jump to line 3447 because the condition on line 3443 was always true
3444 cl = "NO" # Not acceptable in romanizations
3445 except ValueError:
3446 cl = "NO" # Not acceptable in romanizations
3447 classes1.append(cl)
3448 # print("classify_desc: {!r} classes1: {}".format(desc, classes1))
3449 # print(set(classes1) )
3450 if all(
3451 x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK")
3452 for x in classes1
3453 ):
3454 if (
3455 (num_latin >= num_greek + 2 or num_greek == 0)
3456 and classes1.count("OK") < len(classes1)
3457 and classes1.count("Nd") < len(classes1)
3458 ):
3459 return "romanization"
3460 # Otherwise it is something else, such as a hanji version of the word
3461 return "other"
3464def remove_text_in_parentheses(text: str) -> str:
3465 parentheses = 0
3466 new_text = ""
3467 for c in text:
3468 if c == "(":
3469 parentheses += 1
3470 elif c == ")":
3471 parentheses -= 1
3472 elif parentheses == 0:
3473 new_text += c
3474 return new_text
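# Illustrative: remove_text_in_parentheses("foo (bar) baz") returns
# "foo  baz"; the parentheses and their contents are dropped and the
# surrounding whitespace is kept as-is.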