Coverage for src/wiktextract/extractor/en/form_descriptions.py: 79% (1344 statements)
« prev ^ index » next — coverage.py v7.13.4, created at 2026-03-06 11:18 +0000
1# Code for parsing linguistic form descriptions and tags for word senses
2# (both the word entry head - initial part and parenthesized parts -
3# and tags at the beginning of word senses)
4#
5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
7import functools
8import re
9import unicodedata
10from typing import (
11 Any,
12 Literal,
13 Optional,
14 Sequence,
15 Union,
16)
18import Levenshtein
19from nltk import TweetTokenizer # type:ignore[import-untyped]
21from ...datautils import data_append, data_extend, split_at_comma_semi
22from ...tags import (
23 alt_of_tags,
24 form_of_tags,
25 head_final_bantu_langs,
26 head_final_bantu_map,
27 head_final_numeric_langs,
28 head_final_other_langs,
29 head_final_other_map,
30 head_final_semitic_langs,
31 head_final_semitic_map,
32 uppercase_tags,
33 valid_tags,
34 xlat_descs_map,
35 xlat_head_map,
36 xlat_tags_map,
37)
38from ...topics import topic_generalize_map, valid_topics
39from ...wxr_context import WiktextractContext
40from .english_words import (
41 english_words,
42 not_english_words,
43 potentially_english_words,
44)
45from .form_descriptions_known_firsts import known_firsts
46from .taxondata import known_species
47from .type_utils import (
48 AltOf,
49 FormData,
50 LinkageData,
51 SenseData,
52 SoundData,
53 TranslationData,
54 WordData,
55)
# Tokenizer for classify_desc()
tokenizer = TweetTokenizer()

# These are ignored as the value of a related form in form head.
# Mostly various Unicode dash/hyphen codepoints plus placeholder values.
IGNORED_RELATED: set[str] = set(
    [
        "-",
        "־",
        "᠆",
        "‐",
        "‑",
        "‒",
        "–",
        "—",
        "―",
        "−",
        "⸺",
        "⸻",
        "﹘",
        "﹣",
        "-",
        "?",
        "(none)",
    ]
)
# First words of unicodedata.name() that indicate scripts that cannot be
# accepted in romanizations or english (i.e., should be considered "other"
# in classify_desc()).
non_latin_scripts: list[str] = [
    "ADLAM",
    "ARABIC",
    "ARABIC-INDIC",
    "ARMENIAN",
    "BALINESE",
    "BENGALI",
    "BRAHMI",
    "BRAILLE",
    "CANADIAN",
    "CHAKMA",
    "CHAM",
    "CHEROKEE",
    "CJK",
    "COPTIC",
    "COUNTING ROD",
    "CUNEIFORM",
    "CYRILLIC",
    "DOUBLE-STRUCK",
    "EGYPTIAN",
    "ETHIOPIC",
    "EXTENDED ARABIC-INDIC",
    "GEORGIAN",
    "GLAGOLITIC",
    "GOTHIC",
    "GREEK",
    "GUJARATI",
    "GURMUKHI",
    "HANGUL",
    "HANIFI ROHINGYA",
    "HEBREW",
    "HIRAGANA",
    "JAVANESE",
    "KANNADA",
    "KATAKANA",
    "KAYAH LI",
    "KHMER",
    "KHUDAWADI",
    "LAO",
    "LEPCHA",
    "LIMBU",
    "MALAYALAM",
    "MEETEI",
    "MYANMAR",
    "NEW TAI LUE",
    "NKO",
    "OL CHIKI",
    "OLD PERSIAN",
    "OLD SOUTH ARABIAN",
    "ORIYA",
    "OSMANYA",
    "PHOENICIAN",
    "SAURASHTRA",
    "SHARADA",
    "SINHALA",
    "SUNDANESE",
    "SYLOTI",
    "TAI THAM",
    "TAKRI",
    "TAMIL",
    "TELUGU",
    "THAANA",
    "THAI",
    "TIBETAN",
    "TIFINAGH",
    "TIRHUTA",
    "UGARITIC",
    "WARANG CITI",
    "YI",
]
# Matches any of the above script-name prefixes at a word boundary.
non_latin_scripts_re = re.compile(
    r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b"
)
# Sanity check xlat_head_map values: warn about any tag that is not in
# valid_tags. A leading "?" marks an uncertain mapping and is stripped
# before checking. (Coverage-report artifact removed from the condition
# line; logic unchanged.)
for k, v in xlat_head_map.items():
    if v.startswith("?"):
        v = v[1:]
    for tag in v.split():
        if tag not in valid_tags:
            print(
                "WARNING: xlat_head_map[{}] contains unrecognized tag {}".format(
                    k, tag
                )
            )
# Regexp for finding nested translations from translation items (these are
# used in, e.g., year/English/Translations/Arabic). This is actually used
# in page.py.
# Values are sorted longest-first so the regexp prefers longer matches;
# a leading "?" (uncertain mapping marker) is stripped, and "class-*"
# values are excluded.
nested_translations_re = re.compile(
    r"\s+\((({}): ([^()]|\([^()]+\))+)\)".format(
        "|".join(
            re.escape(x.removeprefix("?"))
            for x in sorted(xlat_head_map.values(), key=len, reverse=True)
            if x and not x.startswith("class-")
        )
    )
)
# Regexp that matches head tag specifiers. Used to match tags from end of
# translations and linkages
head_final_re_text = r"( -)?( ({}))+".format(
    "|".join(
        re.escape(x)
        for x in
        # The sort is to put longer ones first, preferring them in
        # the regexp match
        sorted(xlat_head_map.keys(), key=len, reverse=True)
    )
)
head_final_re = re.compile(head_final_re_text + r"$")

# Regexp used to match head tag specifiers at end of a form for certain
# Bantu languages (particularly Swahili and similar languages).
head_final_bantu_re_text = r" ({})".format(
    "|".join(re.escape(x) for x in head_final_bantu_map.keys())
)
head_final_bantu_re = re.compile(head_final_bantu_re_text + "$")

# Regexp used to match head tag specifiers at end of a form for certain
# Semitic languages (particularly Arabic and similar languages).
head_final_semitic_re_text = r" ({})".format(
    "|".join(re.escape(x) for x in head_final_semitic_map.keys())
)
head_final_semitic_re = re.compile(head_final_semitic_re_text + "$")

# Regexp used to match head tag specifiers at end of a form for certain
# other languages (e.g., Lithuanian, Finnish, French).
head_final_other_re_text = r" ({})".format(
    "|".join(re.escape(x) for x in head_final_other_map.keys())
)
head_final_other_re = re.compile(head_final_other_re_text + "$")

# Regexp for splitting heads. See parse_word_head().
# Combines all of the head-final alternatives above into one group.
head_split_re_text_part_1 = (
    "("
    + head_final_re_text
    + "|"
    + head_final_bantu_re_text
    + "|"
    + head_final_semitic_re_text
    + "|"
    + head_final_other_re_text
)

head_split_re_text = head_split_re_text_part_1 + ")?( or |[,;]+| *$)"

head_split_re_text_no_semicolon = head_split_re_text_part_1 + ")?( or |,+| *$)"

head_split_re = re.compile(head_split_re_text)
head_split_no_semicolon_re = re.compile(head_split_re_text_no_semicolon)

# Count unescaped "(" characters in head_split_re_text — presumably used
# to compute match-group offsets after a head_split_re match; confirm at
# the use site.
head_split_re_parens = 0
for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text):
    head_split_re_parens += m.group(0).count("(")
# Parenthesized parts that are ignored in translations
tr_ignored_parens: set[str] = set(
    [
        "please verify",
        "(please verify)",
        "transliteration needed",
        "(transliteration needed)",
        "in words with back vowel harmony",
        "(in words with back vowel harmony)",
        "in words with front vowel harmony",
        "(in words with front vowel harmony)",
        "see below",
        "see usage notes below",
    ]
)
# Matches either an exact entry from tr_ignored_parens, or a parenthesized
# part starting with one of the listed cleanup/reference phrases.
tr_ignored_parens_re = re.compile(
    r"^("
    + "|".join(re.escape(x) for x in tr_ignored_parens)
    + ")$"
    + r"|^(Can we clean up|Can we verify|for other meanings see "
    r"lit\. )"
)

# Translations that are ignored
ignored_translations: set[str] = set(
    [
        "[script needed]",
        "please add this translation if you can",
    ]
)
# Put english text into the "note" field in a translation if it contains one
# of these words
tr_note_re = re.compile(
    r"(\b(article|definite|indefinite|superlative|comparative|pattern|"
    r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|"
    r"postposition|postp|action|actions|articles|"
    r"adverb|adverbs|noun|nouns|verb|verbs|before|"
    r"after|placed|prefix|suffix|used with|translated|"
    r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|"
    r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|"
    r"conjugation|declension|class|category|plural|singular|positive|"
    r"seldom used|formal|informal|familiar|unspoken|spoken|written|"
    r"indicative|progressive|conditional|potential|"
    r"accusative|adessive|inessive|superessive|elative|allative|"
    r"dialect|dialects|object|subject|predicate|movies|recommended|language|"
    r"locative|continuous|simple|continuousness|gerund|subjunctive|"
    r"periphrastically|no equivalent|not used|not always used|"
    r"used only with|not applicable|use the|signifying|wordplay|pronounced|"
    r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|"
    r"may be replaced|stricter sense|for nonhumans|"
    r"sense:|used:|in full:|informally used|followed by|"
    r"not restricted to|pertaining to|or optionally with|are optional|"
    r"in conjunction with|in compounds|depending on the relationship|"
    r"person addressed|one person|multiple persons|may be replaced with|"
    r"optionally completed with|in the phrase|in response to|"
    r"before a|before an|preceded by|verbs ending|very common|after a verb|"
    r"with verb|with uncountable|with the objects|with stative|"
    r"can be replaced by|often after|used before|used after|"
    r"used in|clipping of|spoken|somewhat|capitalized|"
    r"short form|shortening of|shortened form|initialism of|"
    r"said to|rare:|rarer also|is rarer|negatively connoted|"
    r"previously mentioned|uncountable noun|countable noun|"
    r"countable nouns|uncountable nouns|"
    r"with predicative|with -|with imperfect|with a negated|"
    r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|"
    r'"|'
    r"general term|after a vowel|before a vowel|"
    r"form|regular|irregular|alternative)"
    r")($|[) ])|^("
    # Following are only matched at the beginning of the string
    r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:|e\.g\.,|cf\.|compare|such as|"
    r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|"
    r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|"
    r"mainly|from|for|also|also:|acronym|"
    r"\+|with) "
)
# NOTE (original author): \b does not seem to match at the end of the
# string, hence ($|[) ]) is used above instead of a trailing \b.

# Related forms matching this regexp will be considered suspicious if the
# page title does not also match one of these.
suspicious_related_re = re.compile(
    r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)"
    r"|[][:=<>&#*|]"
    r"| \d+$"
)
# Word forms (head forms, translations, etc) that will be considered ok and
# silently accepted even if they would otherwise trigger a suspicious
# form warning.
ok_suspicious_forms: set[str] = set(
    [
        "but en or",  # "golden goal"/English/Tr/French
        "cœur en or",  # "heart of gold"/Eng/Tr/French
        "en or",  # golden/Eng/Tr/French
        "men du",  # jet/Etym2/Noun/Tr/Cornish
        "parachute en or",  # "golden parachute"/Eng/Tr/French
        "vieil or",  # "old gold"/Eng/Tr/French
        # "all that glitters is not gold"/Eng/Tr/French
        "tout ce qui brille n’est pas or",
        "μη αποκλειστικό or",  # inclusive or/Eng/Tr/Greek
        "period or full stop",
    ]
)

# Replacements to be done in classify_desc before tokenizing. This is a
# workaround for shortcomings in TweetTokenizer.
tokenizer_fixup_map = {
    r"a.m.": "AM",
    r"p.m.": "PM",
}
# Longest-first sort so longer keys win when alternatives overlap.
tokenizer_fixup_re = re.compile(
    r"\b("
    + "|".join(
        re.escape(x)
        for x in sorted(
            tokenizer_fixup_map.keys(), key=lambda x: len(x), reverse=True
        )
    )
    + r")"
)
# Unknown tags starting with these words will be silently ignored.
ignored_unknown_starts: set[str] = set(
    [
        "originally",
        "e.g.",
        "c.f.",
        "supplanted by",
        "supplied by",
    ]
)

# Anchored at string start; longest-first so longer prefixes win.
ignored_unknown_starts_re = re.compile(
    r"^("
    + "|".join(
        re.escape(x)
        for x in sorted(ignored_unknown_starts, key=lambda x: -len(x))
    )
    + ") "
)

# If an unknown sequence starts with one of these, it will continue as an
# unknown sequence until the end, unless it turns out to have a replacement.
allowed_unknown_starts: set[str] = set(
    [
        "Relating",
        "accompanied",
        "added",
        "after",
        "answering",
        "as",
        "based",
        "before",
        "conjugated",
        "conjunction",
        "construed",
        "especially",
        "expression:",
        "figurative:",
        "followed",
        "for",
        "forms",
        "from",
        "governs",
        "in",
        "indicating",
        "modifying",
        "normally",
        "not",
        "of",
        "preceding",
        "prefixed",
        "referring",
        "relating",
        "revived",
        "said",
        "since",
        "takes",
        "used",
        "with",
        "With",
        "without",
    ]
)
# Allow the ignored unknown starts without complaining
allowed_unknown_starts.update(ignored_unknown_starts)

# Full unknown tags that will be ignored in decode_tags()
# XXX this is unused, ask Tatu where the contents is now
ignored_unknown_tags: set[str] = set([])
# Head endings that are mapped to tags
head_end_map = {
    " 1st conj.": "conjugation-1",
    " 2nd conj.": "conjugation-2",
    " 3rd conj.": "conjugation-3",
    " 4th conj.": "conjugation-4",
    " 5th conj.": "conjugation-5",
    " 6th conj.": "conjugation-6",
    " 7th conj.": "conjugation-7",
}
# Matches any of the endings above at the end of the head string.
head_end_re = re.compile(
    r"(" + "|".join(re.escape(x) for x in head_end_map.keys()) + r")$"
)

# Dictionary of language-specific parenthesized head part starts that
# either introduce new tags or modify previous tags. The value for each
# language is a dictionary that maps the first word of the head part to
# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous
# tags or a space-separated string of tags to remove, and ``add_tags`` should
# be a string of tags to add.
lang_specific_head_map: dict[
    str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]]
] = {
    "Danish": {
        # prefix: (rem_tags space separate string/True, add_tags s-sep str)
        "c": ("neuter", "common-gender"),
        "n": ("common-gender", "neuter"),
        "pl": ("singular neuter common-gender", "plural"),
        "sg": ("plural neuter common-gender", "singular"),
    },
}
# Regular expression used to strip additional stuff from the end of alt_of and
# form_of. The (?s) flag makes "." match newlines; everything from the first
# matching separator/phrase to the end of the string is removed.
alt_of_form_of_clean_re = re.compile(
    r"(?s)("
    + "|".join(
        [
            r":",
            r'[“"]',
            r";",
            r" \(",
            r" - ",
            r" ־ ",
            r" ᠆ ",
            r" ‐ ",
            r" ‑ ",
            r" ‒ ",
            r" – ",
            r" — ",
            r" ― ",
            r" − ",
            r" ⸺ ",
            r" ⸻ ",
            r" ﹘ ",
            r" ﹣ ",
            r" - ",
            r" \+ ",
            r" \(with ",
            r" with -ra/-re",
            r"\. Used ",
            r"\. Also ",
            r"\. Since ",
            r"\. A ",
            r"\.\. A ",
            r"\. An ",
            r"\.\. An ",
            r"\. an ",
            r"\. The ",
            r"\. Spanish ",
            r"\. Language ",
            r"\. former name of ",
            r"\. AIM",
            r"\. OT",
            r"\. Not ",
            r"\. Now ",
            r"\. Nowadays ",
            r"\. Early ",
            r"\. ASEAN",
            r"\. UN",
            r"\. IMF",
            r"\. WHO",
            r"\. WIPO",
            r"\. AC",
            r"\. DC",
            r"\. DNA",
            r"\. RNA",
            r"\. SOB",
            r"\. IMO",
            r"\. Behavior",
            r"\. Income ",
            r"\. More ",
            r"\. Most ",
            r"\. Only ",
            r"\. Also ",
            r"\. From ",
            r"\. Of ",
            r"\.\. Of ",
            r"\. To ",
            r"\. For ",
            r"\. If ",
            r"\. Praenominal ",
            r"\. This ",
            r"\. Replaced ",
            r"\. CHCS is the ",
            r"\. Equivalent ",
            r"\. Initialism ",
            r"\. Note ",
            r"\. Alternative ",
            r"\. Compare ",
            r"\. Cf\. ",
            r"\. Comparable ",
            r"\. Involves ",
            r"\. Sometimes ",
            r"\. Commonly ",
            r"\. Often ",
            r"\. Typically ",
            r"\. Possibly ",
            r"\. Although ",
            r"\. Rare ",
            r"\. Instead ",
            r"\. Integrated ",
            r"\. Distinguished ",
            r"\. Given ",
            r"\. Found ",
            r"\. Was ",
            r"\. In ",
            r"\. It ",
            r"\.\. It ",
            r"\. One ",
            r"\. Any ",
            r"\. They ",
            r"\. Members ",
            r"\. Each ",
            r"\. Original ",
            r"\. Especially ",
            r"\. Usually ",
            r"\. Known ",
            r"\.\. Known ",
            r"\. See ",
            r"\. see ",
            r"\. target was not ",
            r"\. Popular ",
            r"\. Pedantic ",
            r"\. Positive ",
            r"\. Society ",
            r"\. Plan ",
            r"\. Environmentally ",
            r"\. Affording ",
            r"\. Encompasses ",
            r"\. Expresses ",
            r"\. Indicates ",
            r"\. Text ",
            r"\. Large ",
            r"\. Sub-sorting ",
            r"\. Sax",
            r"\. First-person ",
            r"\. Second-person ",
            r"\. Third-person ",
            r"\. 1st ",
            r"\. 2nd ",
            r"\. 3rd ",
            r"\. Term ",
            r"\. Northeastern ",
            r"\. Northwestern ",
            r"\. Southeast ",
            r"\. Egyptian ",
            r"\. English ",
            r"\. Cape Province was split into ",
            r"\. Pañcat",
            r"\. of the ",
            r"\. is ",
            r"\. after ",
            r"\. or ",
            r"\. chromed",
            r"\. percussion",
            r"\. with his ",
            r"\. a\.k\.a\. ",
            r"\. comparative form ",
            r"\. singular ",
            r"\. plural ",
            r"\. present ",
            r"\. his ",
            r"\. her ",
            r"\. equivalent ",
            r"\. measuring ",
            r"\. used in ",
            r"\. cutely ",
            r"\. Protects",
            r'\. "',
            r"\.^",
            r"\. \+ ",
            r"\., ",
            r". — ",
            r", a ",
            r", an ",
            r", the ",
            r", obsolete ",
            r", possessed",  # 'd/English
            r", imitating",  # 1/English
            r", derived from",
            r", called ",
            r", especially ",
            r", slang for ",
            r", used to",  # c/o /English
            r", commonly",  # b/w /English
            r" corresponding to ",
            r" equivalent to ",
            r" popularized by ",
            r" denoting ",
            r" in its various senses\.",
            r" used by ",
            r" but not for ",
            r" since ",
            r" i\.e\. ",
            r" i\. e\. ",
            r" e\.g\. ",
            r" eg\. ",
            r" etc\. ",
            r"\[http",
            r" — used as ",
            r" by K\. Forsyth ",
            r" by J\. R\. Allen ",
            r" by S\. Ferguson ",
            r" by G\. Donaldson ",
            r" May refer to ",
            r" An area or region ",
        ]
    )
    + r").*$"
)
class ValidNode:
    """A node in the valid_sequences search tree.

    Nodes chain together to form the sequences built from keys of
    key->tags maps such as xlat_tags. A node is addressed by its word,
    which is the key under which it is stored in the root dict or in a
    parent's ``children`` dict. ``end`` marks the node as the terminus
    of a known sequence (the chain may still continue when one sequence
    is a prefix of another, e.g. "nominative$" and "nominative plural$").
    ``tags`` and ``topics`` hold the tag/topic strings attached to
    terminal nodes (those with end==True).
    """

    __slots__ = (
        "end",
        "tags",
        "topics",
        "children",
    )

    def __init__(
        self,
        end=False,
        tags: Optional[list[str]] = None,
        topics: Optional[list[str]] = None,
        children: Optional[dict[str, "ValidNode"]] = None,
    ) -> None:
        # Falsy arguments (None or empty) are replaced by fresh empty
        # containers so instances never share mutable defaults.
        self.end = end
        self.tags: list[str] = tags if tags else []
        self.topics: list[str] = topics if topics else []
        self.children: dict[str, "ValidNode"] = children if children else {}
def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None:
    """Helper function for building trees of valid tags/sequences during
    initialization.

    ``desc`` is a space-separated key phrase; each word becomes one node
    on a path down ``tree``. The final node is marked as a sequence
    terminus (end=True). ``v`` is a space-separated string of tags and/or
    topics to attach to that terminal node; words that are neither in
    valid_tags nor valid_topics trigger a warning print. (Coverage-report
    artifact removed from the elif line; logic unchanged.)
    """
    assert isinstance(tree, ValidNode)
    assert isinstance(desc, str)
    assert v is None or isinstance(v, str)
    node = tree

    # Build the tree structure: each node has children nodes
    # whose names are denoted by their dict key.
    for w in desc.split(" "):
        if w in node.children:
            node = node.children[w]
        else:
            new_node = ValidNode()
            node.children[w] = new_node
            node = new_node
    if not node.end:
        node.end = True
    if not v:
        return None  # Terminate early because there are no tags

    tagslist = []
    topicslist = []
    for vv in v.split():
        if vv in valid_tags:
            tagslist.append(vv)
        elif vv in valid_topics:
            topicslist.append(vv)
        else:
            print(
                "WARNING: tag/topic {!r} maps to unknown {!r}".format(desc, vv)
            )
    topics = " ".join(topicslist)
    tags = " ".join(tagslist)
    # A terminal node may accumulate several space-joined tag/topic
    # strings, one per mapping that terminates at this node.
    if topics:
        node.topics.extend([topics])
    if tags:
        node.tags.extend([tags])
def add_to_valid_tree1(
    tree: ValidNode,
    k: str,
    v: Union[list[str], tuple[str, ...], str],
    valid_values: Union[set[str], dict[str, Any]],
) -> list[str]:
    """Add key ``k`` with value(s) ``v`` to the search tree ``tree``.

    Returns the individual whitespace-separated tags contained in the
    values; the caller uses these for recursive expansion.

    Fixes: removed embedded coverage-report artifact; the empty-``v``
    branch now uses the ``tree`` parameter instead of the module-level
    ``valid_sequences`` global (all visible callers pass valid_sequences,
    so behavior is unchanged for them, but the function now honors its
    own parameter).
    """
    assert isinstance(tree, ValidNode)
    assert isinstance(k, str)
    assert v is None or isinstance(v, (list, tuple, str))
    assert isinstance(valid_values, (set, dict))
    if not v:
        add_to_valid_tree(tree, k, None)
        return []
    elif isinstance(v, str):
        v = [v]
    q = []
    for vv in v:
        assert isinstance(vv, str)
        add_to_valid_tree(tree, k, vv)
        # Collect each individual tag from the value string.
        q.extend(vv.split())
    return q
def add_to_valid_tree_mapping(
    tree: ValidNode,
    mapping: Union[dict[str, Union[list[str], str]], dict[str, str]],
    valid_values: Union[set[str], dict[str, Any]],
    recurse: bool,
) -> None:
    """Insert every key of ``mapping`` into the search tree ``tree``.

    Each key maps to one or more tag strings. When ``recurse`` is true,
    tags produced by a value are themselves looked up in ``mapping`` and
    expanded transitively, visiting each intermediate tag at most once.
    """
    assert isinstance(tree, ValidNode)
    assert isinstance(mapping, dict)
    assert isinstance(valid_values, (set, dict))
    assert recurse in (True, False)
    for key, value in mapping.items():
        assert isinstance(key, str)
        assert isinstance(value, (list, str))
        values = [value] if isinstance(value, str) else value
        pending = add_to_valid_tree1(tree, key, values, valid_values)
        if not recurse:
            continue
        seen: set[str] = set()
        # Depth-first expansion: pop from the end, mirroring list.pop().
        while pending:
            candidate = pending.pop()
            if candidate in seen:
                continue
            seen.add(candidate)
            if candidate not in mapping:
                continue
            pending.extend(
                add_to_valid_tree1(tree, key, mapping[candidate], valid_values)
            )
# Tree of sequences considered to be tags (includes sequences that are
# mapped to something that becomes one or more valid tags).
# (Coverage-report artifacts removed from two "/" conditions below;
# logic unchanged.)
valid_sequences = ValidNode()
# Keys/values containing a slash must not have their slashes split when
# decoding; they are collected here and used to build slashes_re below.
sequences_with_slashes: set[str] = set()
for tag in valid_tags:
    # The basic tags used in our tag system; some are a bit weird, but easier
    # to implement this with 'false' positives than filter out stuff no one
    # else uses.
    if "/" in tag:
        sequences_with_slashes.add(tag)
    add_to_valid_tree(valid_sequences, tag, tag)
for tag in uppercase_tags:
    hyphenated = re.sub(r"\s+", "-", tag)
    if "/" in tag:
        sequences_with_slashes.add(tag)
    add_to_valid_tree(valid_sequences, tag, hyphenated)

# xlat_tags_map!
add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False)
for k in xlat_tags_map:
    if "/" in k:
        sequences_with_slashes.add(k)
# Add topics to the same table, with all generalized topics also added
for topic in valid_topics:
    assert " " not in topic
    if "/" in topic:
        sequences_with_slashes.add(topic)
    add_to_valid_tree(valid_sequences, topic, topic)
# Let each original topic value stand alone. These are not generally on
# valid_topics. We add the original topics with spaces replaced by hyphens.
for topic in topic_generalize_map.keys():
    hyphenated = re.sub(r"\s+", "-", topic)
    if "/" in topic:
        sequences_with_slashes.add(topic)
    add_to_valid_tree(valid_sequences, topic, hyphenated)
# Add canonicalized/generalized topic values
add_to_valid_tree_mapping(
    valid_sequences, topic_generalize_map, valid_topics, True
)
# Regexp used to find "words" from word heads and linguistic descriptions.
# Alternatives, in order: a plain token without delimiters; a token with a
# leading parenthesized part; a single Braille character; a fully
# parenthesized group (one level of nesting allowed).
word_pattern = (
    r"[^ ,;()\u200e]+|"
    r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|"
    r"[\u2800-\u28ff]|"  # Braille characters
    r"\(([^()]|\([^()]*\))*\)"
)

word_re_global = re.compile(word_pattern)
def distw(titleparts: Sequence[str], word: str) -> float:
    """Computes how distinct ``word`` is from the most similar word in
    ``titleparts``. Returns 1 if words completely distinct, 0 if
    identical, or otherwise something in between.

    Robustness fixes: an empty ``titleparts`` now returns 1.0 (completely
    distinct) instead of min() raising ValueError; the divisor is clamped
    to at least 1 so comparing two empty strings cannot divide by zero.
    """
    assert isinstance(titleparts, (list, tuple))
    assert isinstance(word, str)
    if not titleparts:
        return 1.0
    return min(
        Levenshtein.distance(word, tw) / max(len(tw), len(word), 1)
        for tw in titleparts
    )
871def map_with(
872 ht: dict[str, str | list[str]] | dict[str, str],
873 lst: Sequence[str],
874) -> list[str]:
875 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or
876 more alternatives each, and returns a combined list of alternatives."""
877 assert isinstance(ht, dict)
878 assert isinstance(lst, (list, tuple))
879 ret = []
880 for x in lst:
881 assert isinstance(x, str)
882 x = x.strip()
883 x = ht.get(x, x)
884 if isinstance(x, str): 884 ↛ 887line 884 didn't jump to line 887 because the condition on line 884 was always true
885 if x: 885 ↛ 880line 885 didn't jump to line 880 because the condition on line 885 was always true
886 ret.append(x)
887 elif isinstance(x, (list, tuple)):
888 ret.extend(x)
889 else:
890 raise RuntimeError("map_with unexpected value: {!r}".format(x))
891 return ret
# Type aliases for the tag-decoding search below. A PosPathStep records
# one step of a candidate parse path as (position, tags, topics).
TagList = list[str]
PosPathStep = tuple[int, TagList, TagList]
def check_unknown(
    from_i: int,
    to_i: int,
    i: int,
    wordlst: Sequence[str],
    allow_any: bool,
    no_unknown_starts: bool,
) -> list[PosPathStep]:
    """Check if the current section from_i->to_i is actually unknown
    or if it needs some special handling. We already presupposed that
    this is UNKNOWN; this is just called to see what *kind* of UNKNOWN.

    Returns a (possibly empty) list of PosPathStep entries: ignorable
    sections yield an UNKNOWN step with no tags, disallowed unknown
    sections yield an "error-unknown-tag" step, and otherwise the raw
    text is kept as the tag. (Coverage-report artifact removed from the
    ignored_unknown_tags condition; logic unchanged.)
    """
    assert isinstance(to_i, int)
    assert isinstance(from_i, int)
    assert isinstance(i, int)
    # Adds unknown tag if needed. Returns new last_i
    if from_i >= to_i:
        return []
    words = wordlst[from_i:to_i]
    tag = " ".join(words)
    assert tag
    if re.match(ignored_unknown_starts_re, tag):
        # Tags with this start are to be ignored
        return [(from_i, ["UNKNOWN"], [])]
    if tag in ignored_unknown_tags:
        return []  # One of the tags listed as to be ignored
    if tag in ("and", "or"):
        return []
    if (
        not allow_any
        and not words[0].startswith("~")
        and (
            no_unknown_starts
            or words[0] not in allowed_unknown_starts
            or len(words) <= 1
        )
    ):
        return [
            (from_i, ["UNKNOWN"], ["error-unknown-tag"])
        ]  # Add ``tag`` here to include
    else:
        return [(from_i, ["UNKNOWN"], [tag])]
def add_new1(
    node: ValidNode,
    i: int,
    start_i: int,
    last_i: int,
    new_paths: list[list[PosPathStep]],
    new_nodes: list[tuple[ValidNode, int, int]],
    pos_paths: list[list[list[PosPathStep]]],
    wordlst: list[str],
    allow_any: bool,
    no_unknown_starts: bool,
    max_last_i: int,
) -> int:
    """Advance the tag-sequence search to ``node``.

    Registers (node, start_i, last_i) as a live search state in
    ``new_nodes`` and, when ``node`` terminates a known sequence, extends
    ``new_paths`` with candidate paths built from all previously possible
    paths at ``last_i``. Returns the (possibly updated) ``max_last_i``.
    NOTE: mutates ``new_paths`` and ``new_nodes`` in place; callers rely
    on this.
    """
    assert isinstance(new_paths, list)
    # print("add_new: start_i={} last_i={}".format(start_i, last_i))
    # print("$ {} last_i={} start_i={}"
    # .format(w, last_i, start_i))
    max_last_i = max(max_last_i, last_i)  # if last_i has grown
    if (node, start_i, last_i) not in new_nodes:
        new_nodes.append((node, start_i, last_i))
    if node.end:
        # We can see a terminal point in the search tree.
        u = check_unknown(
            last_i, start_i, i, wordlst, allow_any, no_unknown_starts
        )
        # Create new paths candidates based on different past possible
        # paths; pos_path[last_i] contains possible paths, so add this
        # new one at the beginning(?)
        # The list comprehension inside the parens generates an iterable
        # of lists, so this is .extend( [(last_i...)], [(last_i...)], ... )
        # XXX: this is becoming impossible to annotate, nodes might
        # need to become classed objects and not just dicts, or at least
        # a TypedDict with a "children" node
        new_paths.extend(
            [(last_i, node.tags, node.topics)] + u + x
            for x in pos_paths[last_i]
        )
        max_last_i = i + 1
    return max_last_i
@functools.lru_cache(maxsize=65536)
def decode_tags(
    src: str,
    allow_any=False,
    no_unknown_starts=False,
) -> tuple[list[tuple[str, ...]], list[str]]:
    """Decode tags from ``src`` and return (tagsets, topics).

    First parses ``src`` directly with decode_tags1. If the result
    contains any "error-*" tags or topics, retries with a modified
    source text (slashes outside known slash-containing keys replaced
    with spaces, or " and "/" or " connectives removed) and keeps the
    retry result when it has no more errors than the first attempt.
    (Coverage-report artifact removed from the final comparison; logic
    unchanged.)
    """
    tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts)

    # Retry with a modified source text if the first parse had errors.
    if (
        any(s.startswith("error-") for tagset in tagsets for s in tagset)
        or any(s.startswith("error-") for s in topics)
    ):
        new_tagsets: list[tuple[str, ...]] = []
        new_topics: list[str] = []

        if "/" in src:
            # slashes_re contains valid key entries with slashes; we're going
            # to skip them by splitting the string and skipping handling every
            # second entry, which contains the splitting group like "masculine/
            # feminine" style keys.
            split_parts = re.split(slashes_re, src)
            new_parts: list[str] = []
            if len(split_parts) > 1:
                for i, s in enumerate(split_parts):
                    if i % 2 == 0:
                        # Outside a known slash-key: slashes become spaces.
                        new_parts.append(s.replace("/", " "))
                    else:
                        # A known key like "masculine/feminine": keep as-is.
                        new_parts.append(s)
                new_src = "".join(new_parts)
            else:
                new_src = src
            new_tagsets, new_topics = decode_tags1(
                new_src, allow_any, no_unknown_starts
            )
        elif " or " in src or " and " in src:
            # Annoying kludge: drop the connectives and reparse.
            new_src = src.replace(" and ", " ")
            new_src = new_src.replace(" or ", " ")
            new_tagsets, new_topics = decode_tags1(
                new_src, allow_any, no_unknown_starts
            )

        if new_tagsets or new_topics:
            # Keep the retry only if it did not introduce more errors.
            old_errors = sum(
                1 for tagset in tagsets for s in tagset if s.startswith("error")
            )
            old_errors += sum(1 for s in topics if s.startswith("error"))
            new_errors = sum(
                1
                for new_tagset in new_tagsets
                for s in new_tagset
                if s.startswith("error")
            )
            new_errors += sum(1 for s in new_topics if s.startswith("error"))

            if new_errors <= old_errors:
                return new_tagsets, new_topics

    return tagsets, topics
def decode_tags1(
    src: str,
    allow_any: bool = False,
    no_unknown_starts: bool = False,
) -> tuple[list[tuple[str, ...]], list[str]]:
    """Decodes tags, doing some canonicalizations.  This returns a list of
    lists of tags and a list of topics.

    The decoding is a dynamic-programming search: ``src`` is split into
    words, and word sequences are matched against the ``valid_sequences``
    prefix tree (built from xlat_tags_map and other text->tags dicts).
    All overlapping candidate matches are kept as alternative paths in
    ``pos_paths``, and at the end the shortest path (fewest steps, with a
    heavy penalty for "UNKNOWN" steps) is selected.

    src -- raw tag/description text to decode
    allow_any -- if true, unrecognized words may be accepted (see
        check_unknown)
    no_unknown_starts -- if true, new sequences may not begin after an
        unrecognized word unless it is in allowed_unknown_starts
    """
    assert isinstance(src, str)
    # print("decode_tags: src={!r}".format(src))

    # pos_paths[i] holds the candidate paths (lists of PosPathStep) that
    # explain the first i words; index 0 holds the single empty path.
    pos_paths: list[list[list[PosPathStep]]] = [[[]]]
    wordlst: list[str] = []
    max_last_i = 0  # pre-initialized here so that it can be used as a ref

    # add_new1 mutates pos_paths/wordlst through these pre-bound references.
    add_new = functools.partial(
        add_new1,  # pre-set parameters and references for function
        pos_paths=pos_paths,
        wordlst=wordlst,
        allow_any=allow_any,
        no_unknown_starts=no_unknown_starts,
        max_last_i=max_last_i,
    )
    # First split the tags at commas and semicolons. Their significance is that
    # a multi-word sequence cannot continue across them.
    parts = split_at_comma_semi(src, extra=[";", ":"])

    for part in parts:
        max_last_i = len(wordlst)  # "how far have we gone?"
        lst1 = part.split()
        if not lst1:
            continue
        wordlst.extend(lst1)
        # Active search states: (tree node, start word index, last word index)
        cur_nodes: list[tuple[ValidNode, int, int]] = []  # Currently seen
        for w in lst1:
            i = len(pos_paths) - 1
            new_nodes: list[tuple[ValidNode, int, int]] = []
            # replacement nodes for next loop
            new_paths: list[list[PosPathStep]] = []
            # print("ITER i={} w={} max_last_i={} wordlst={}"
            #       .format(i, w, max_last_i, wordlst))
            node: ValidNode
            start_i: int
            last_i: int
            for node, start_i, last_i in cur_nodes:
                # ValidNodes are part of a search tree that checks if a
                # phrase is found in xlat_tags_map and other text->tags dicts.
                if w in node.children:
                    # the phrase continues down the tree
                    # print("INC", w)
                    max_last_i = add_new(
                        node.children[w],
                        i,
                        start_i,
                        last_i,
                        new_paths,
                        new_nodes,
                    )
                if node.end:
                    # we've hit an end point, the tags and topics have already
                    # been gathered at some point, don't do anything with the
                    # old stuff
                    if w in valid_sequences.children:
                        # This starts a *new* possible section
                        max_last_i = add_new(
                            valid_sequences.children[w],  # root->
                            i,
                            i,
                            i,
                            new_paths,
                            new_nodes,
                        )
                if w not in node.children and not node.end:
                    # print("w not in node and $: i={} last_i={} wordlst={}"
                    #       .format(i, last_i, wordlst))
                    # If i == last_i == 0, for example (beginning)
                    if (
                        i == last_i
                        or no_unknown_starts
                        or wordlst[last_i] not in allowed_unknown_starts
                    ):
                        # print("NEW", w)
                        if w in valid_sequences.children:
                            # Start new sequences here
                            max_last_i = add_new(
                                valid_sequences.children[w],
                                i,
                                i,
                                last_i,
                                new_paths,
                                new_nodes,
                            )
            if not new_nodes:
                # This is run at the start when i == max_last_i == 0,
                # which is what populates the first node in new_nodes.
                # Some initial words cause the rest to be interpreted as unknown
                # print("not new nodes: i={} last_i={} wordlst={}"
                #       .format(i, max_last_i, wordlst))
                if (
                    i == max_last_i
                    or no_unknown_starts
                    or wordlst[max_last_i] not in allowed_unknown_starts
                ):
                    # print("RECOVER w={} i={} max_last_i={} wordlst={}"
                    #       .format(w, i, max_last_i, wordlst))
                    if w in valid_sequences.children:
                        max_last_i = add_new(
                            # new sequence from root
                            valid_sequences.children[w],
                            i,
                            i,
                            max_last_i,
                            new_paths,
                            new_nodes,
                        )
            cur_nodes = new_nodes  # Completely replace nodes!
            # 2023-08-18, fix to improve performance
            # Decode tags does a big search of the best-shortest matching
            # sequences of tags, but the original algorithm didn't have
            # any culling happen during operation, so in a case with
            # a lot of tags (for example, big blocks of text inserted
            # somewhere by mistake that is processed by decode_tags),
            # it would lead to exponential growth of new_paths contents.
            # This culling, using the same weighting algorithm code as
            # in the original is just applied to new_paths before it is
            # added to pos_paths. Basically it's "take the 10 best paths".
            # This *can* cause bugs if it gets stuck in a local minimum
            # or something, but this whole process is one-dimensional
            # and not that complex, so hopefully it works out...
            pw = []
            path: list[PosPathStep]
            for path in new_paths:
                weight = len(path)
                if any(x[1] == ["UNKNOWN"] for x in path):
                    weight += 100  # Penalize unknown paths
                pw.append((weight, path))
            new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]]
            pos_paths.append(new_paths)

    # print("END max_last_i={} len(wordlst)={} len(pos_paths)={}"
    #       .format(max_last_i, len(wordlst), len(pos_paths)))

    # NOTE(review): cur_nodes is only bound if at least one part was
    # non-empty; an src consisting solely of separators would raise
    # NameError here — TODO confirm split_at_comma_semi never yields that.
    if cur_nodes:
        # print("END HAVE_NODES")
        for node, start_i, last_i in cur_nodes:
            if node.end:
                # The active state is a complete phrase: close it off by
                # prepending its step to every path that covers the prefix.
                # print("$ END start_i={} last_i={}"
                #       .format(start_i, last_i))
                for path in pos_paths[start_i]:
                    pos_paths[-1].append(
                        [(last_i, node.tags, node.topics)] + path
                    )
            else:
                # Incomplete phrase at end of input: emit UNKNOWN steps for
                # the trailing words (subject to allow_any etc.).
                # print("UNK END start_i={} last_i={} wordlst={}"
                #       .format(start_i, last_i, wordlst))
                u = check_unknown(
                    last_i,
                    len(wordlst),
                    len(wordlst),
                    wordlst,
                    allow_any,
                    no_unknown_starts,
                )
                if pos_paths[start_i]:
                    for path in pos_paths[start_i]:
                        pos_paths[-1].append(u + path)
                else:
                    pos_paths[-1].append(u)
    else:
        # Check for a final unknown tag
        # print("NO END NODES max_last_i={}".format(max_last_i))
        paths = pos_paths[max_last_i] or [[]]
        u = check_unknown(
            max_last_i,
            len(wordlst),
            len(wordlst),
            wordlst,
            allow_any,
            no_unknown_starts,
        )
        if u:
            # print("end max_last_i={}".format(max_last_i))
            for path in list(paths):  # Copy in case it is the last pos
                pos_paths[-1].append(u + path)

    # import json
    # print("POS_PATHS:", json.dumps(pos_paths, indent=2, sort_keys=True))

    if not pos_paths[-1]:
        # print("decode_tags: {}: EMPTY POS_PATHS[-1]".format(src))
        return [], []

    # Find the best (lowest-weight, i.e. shortest) path covering all words.
    pw = []
    for path in pos_paths[-1]:
        weight = len(path)
        if any(x[1] == ["UNKNOWN"] for x in path):
            weight += 100  # Penalize unknown paths
        pw.append((weight, path))
    path = min(pw)[1]

    # Convert the best path to tagsets and topics
    tagsets: list[list[str]] = [[]]
    topics: list[str] = []
    for i, tagspec, topicspec in path:
        if len(tagsets or "") > 16:
            # Safety valve against combinatorial blow-up of alternatives.
            # ctx.error("Too many tagsets! This is probably exponential",
            #           sortid="form_descriptions/20230818")
            return [("error-unknown-tag", "error-exponential-tagsets")], []
        if tagspec == ["UNKNOWN"]:
            # For UNKNOWN steps the payload arrives in topicspec and is
            # appended verbatim to every tagset.
            new_tagsets = []
            for x in tagsets:
                new_tagsets.append(x + topicspec)
            tagsets = new_tagsets
            continue
        if tagspec:
            # Each alternative in tagspec multiplies out against every
            # existing tagset; individual entries may contain several
            # space-separated tags.
            new_tagsets = []
            for x in tagsets:
                for t in tagspec:
                    if t:
                        new_tags = list(x)
                        for tag in t.split():
                            if tag not in new_tags:
                                new_tags.append(tag)
                        new_tagsets.append(new_tags)
                    else:
                        new_tagsets.append(x)
            tagsets = new_tagsets
        if topicspec:
            for t in topicspec:
                for topic in t.split():
                    if topic not in topics:
                        topics.append(topic)

    # print("unsorted tagsets:", tagsets)
    ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets))
    # topics = list(sorted(set(topics))) XXX tests expect not sorted
    # print("decode_tags: {} -> {} topics {}".format(src, tagsets, topics))
    # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST
    # of tags. Turning topics into a tuple breaks tests, turning the tuples
    # inside tagsets into lists breaks tests, I'm leaving them mismatched
    # for now. XXX
    return ret_tagsets, topics
def parse_head_final_tags(
    wxr: WiktextractContext, lang: str, form: str
) -> tuple[str, list[str]]:
    """Parses tags that are allowed at the end of a form head from the end
    of the form. This can also be used for parsing the final gender etc tags
    from translations and linkages.

    Arguments:
        wxr -- extraction context (used for the page title and debug logging)
        lang -- language the form is in; selects which head-final maps
            (Bantu, Semitic, other, numeric) are consulted
        form -- raw form text

    Returns (form, tags): the form with recognized head-final tag words
    stripped from its end, and the list of tags decoded from them.

    Fix: removed a stray debug print() of the regex match object that ran
    on every head-final match and polluted stdout.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(lang, str)  # Should be language that "form" is for
    assert isinstance(form, str)

    # print("parse_head_final_tags: lang={} form={!r}".format(lang, form))

    # Make sure there are no double spaces in the form as this code does not
    # handle them otherwise.
    form = re.sub(r"\s+", " ", form.strip())
    if not form:
        return form, []

    origform = form  # kept intact for debug messages after form is trimmed
    tags = []

    # If parsing for certain Bantu languages (e.g., Swahili), handle
    # some extra head-final tags first
    if lang in head_final_bantu_langs:
        m = re.search(head_final_bantu_re, form)
        if m is not None:
            tagkeys = m.group(1)
            # Don't strip the suffix if the page title itself ends with it
            # (then it is presumably part of the word).
            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
                form = form[: m.start()]
                v = head_final_bantu_map[tagkeys]
                if v.startswith("?"):
                    # "?"-prefixed map values mark suspicious suffixes
                    v = v[1:]
                    wxr.wtp.debug(
                        "suspicious suffix {!r} in language {}: {}".format(
                            tagkeys, lang, origform
                        ),
                        sortid="form_descriptions/1028",
                    )
                tags.extend(v.split())

    # If parsing for certain Semitic languages (e.g., Arabic), handle
    # some extra head-final tags first
    if lang in head_final_semitic_langs:
        m = re.search(head_final_semitic_re, form)
        if m is not None:
            tagkeys = m.group(1)
            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
                form = form[: m.start()]
                v = head_final_semitic_map[tagkeys]
                if v.startswith("?"):
                    v = v[1:]
                    wxr.wtp.debug(
                        "suspicious suffix {!r} in language {}: {}".format(
                            tagkeys, lang, origform
                        ),
                        sortid="form_descriptions/1043",
                    )
                tags.extend(v.split())

    # If parsing for certain other languages (e.g., Lithuanian,
    # French, Finnish), handle some extra head-final tags first
    if lang in head_final_other_langs:
        m = re.search(head_final_other_re, form)
        if m is not None:
            tagkeys = m.group(1)
            if not wxr.wtp.title.endswith(tagkeys):  # type:ignore[union-attr]
                form = form[: m.start()]
                tags.extend(head_final_other_map[tagkeys].split(" "))

    # Handle normal head-final tags
    # Loop this until nothing is found
    while True:
        prev_form = form
        m = re.search(head_final_re, form)
        if m is not None:
            tagkeys = m.group(3)
            # Only replace tags ending with numbers in languages that have
            # head-final numeric tags (e.g., Bantu classes); also, don't replace
            # tags if the main title ends with them (then presume they are part
            # of the word)
            # print("head_final_tags form={!r} tagkeys={!r} lang={}"
            #       .format(form, tagkeys, lang))
            tagkeys_contains_digit = re.search(r"\d", tagkeys)
            if (
                (not tagkeys_contains_digit or lang in head_final_numeric_langs)
                and not wxr.wtp.title.endswith(" " + tagkeys)  # type:ignore[union-attr]
                and
                # XXX the above test does not capture when the whole word is a
                # xlat_head_map key, so I added the below test to complement
                # it; does this break anything?
                not wxr.wtp.title == tagkeys
            ):  # defunct/English,
                # "more defunct" -> "more" ["archaic"]
                if (
                    not tagkeys_contains_digit
                    or lang in head_final_numeric_langs
                ):
                    # m.start(3) gets the start of what is in m.group(3), handy
                    form = form[: m.start(3)].strip()
                    v = xlat_head_map[tagkeys]
                    if v.startswith("?"):
                        v = v[1:]
                        wxr.wtp.debug(
                            "suspicious suffix {!r} in language {}: {}".format(
                                tagkeys, lang, origform
                            ),
                            sortid="form_descriptions/1077",
                        )
                    tags.extend(v.split())
            else:
                # Title ends with the matched tag words; stop stripping.
                break
        if prev_form == form:
            break

    # Generate warnings about words ending in " or" after processing
    if (
        (form.endswith(" or") and not origform.endswith(" or"))
        or re.search(
            r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|"
            r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)"
            r"($|/| (f|m|sg|pl|anim|inan))",
            form,
        )
        or form.endswith(" du")
    ):
        if form not in ok_suspicious_forms:
            wxr.wtp.debug(
                "suspicious unhandled suffix in {}: {!r}, originally {!r}".format(
                    lang, form, origform
                ),
                sortid="form_descriptions/1089",
            )

    # print("parse_head_final_tags: form={!r} tags={}".format(form, tags))
    return form, tags
def quote_kept_parens(s: str) -> str:
    """Protect certain parenthesized expressions from being treated as
    parentheses by rewriting "(...)" into "__lpar__...__rpar__".  Used
    for parts that are kept as part of the word, such as
    "rear admiral (upper half)"."""
    kept_pattern = (
        r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|"
        r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)"
    )
    replacement = r"__lpar__\1__rpar__"
    return re.sub(kept_pattern, replacement, s)
def quote_kept_ruby(
    wxr: WiktextractContext,
    ruby_tuples: list[
        tuple[
            str,
            str,
        ]
    ],
    s: str,
) -> str:
    """Quote ruby reading parentheses in ``s`` so they survive later
    parenthesis handling: each "kanji(reading)" occurrence from
    ``ruby_tuples`` becomes "kanji__lrub__reading__rrub__".  Returns the
    text unchanged (with a debug message) if no usable ruby was given."""
    if not ruby_tuples:
        wxr.wtp.debug(
            "quote_kept_ruby called with no ruby",
            sortid="form_description/1114/20230517",
        )
        return s
    kanji_pats = [re.escape(kanji) for kanji, _ in ruby_tuples]
    reading_pats = [re.escape(reading) for _, reading in ruby_tuples]
    if not (kanji_pats and reading_pats):
        wxr.wtp.debug(
            f"empty column in ruby_tuples: {ruby_tuples}",
            sortid="form_description/1124/20230606",
        )
        return s
    # Matches "kanji ( reading )" with optional whitespace, capturing both.
    pair_re = re.compile(
        r"({})\s*\(\s*({})\s*\)".format(
            "|".join(kanji_pats), "|".join(reading_pats)
        )
    )
    # Coarse matcher locating candidate kanji+reading runs in the text.
    candidate_re = re.compile(
        r"({})".format(
            r"|".join(
                r"{}\(*{}\)*".format(re.escape(kanji), re.escape(reading))
                for kanji, reading in ruby_tuples
            )
        )
    )

    def quote_match(match: re.Match) -> str:
        # Rewrite the parentheses inside each candidate run.
        return pair_re.sub(r"\1__lrub__\2__rrub__", match.group(0))

    return candidate_re.sub(quote_match, s)
def unquote_kept_parens(s: str) -> str:
    """Converts the quoted parentheses back to normal parentheses."""
    quoted = r"__lpar__(.*?)__rpar__"
    return re.sub(quoted, r"(\1)", s)
def add_romanization(
    wxr: WiktextractContext,
    data: WordData,
    roman: str,
    text: str,
    is_reconstruction: bool,
    head_group: Optional[int],
    ruby: Sequence[tuple[str, str]],
) -> None:
    """Record ``roman`` as a "romanization"-tagged related form of ``data``.

    Intercepts broken "Style: value" romanizations (e.g. "Yale: hēnpyeng"):
    when the text before a colon decodes to known tags via decode_tags,
    those tags are added and only the part after the colon is kept as the
    romanization."""
    tags_lst = ["romanization"]
    prefixed = re.match(r"([^:]+):(.+)", roman)
    if prefixed:
        # Most romanization style names already exist as tags, so
        # decode_tags can recognize the prefix.
        tagsets, _topics = decode_tags(prefixed.group(1))
        if tagsets:
            for tagset in tagsets:
                tags_lst.extend(tagset)
            roman = prefixed.group(2)
    add_related(
        wxr,
        data,
        tags_lst,
        [roman],
        text,
        True,
        is_reconstruction,
        head_group,
        ruby,
    )
def add_related(
    wxr: WiktextractContext,
    data: WordData,
    tags_lst: Union[list[str], tuple[str, ...]],
    related_list: list[str],
    origtext: str,
    add_all_canonicals: bool,
    is_reconstruction: bool,
    head_group: Optional[int],
    ruby_data: Optional[Sequence[tuple[str, str]]] = None,
) -> Optional[list[tuple[str, ...]]]:
    """Internal helper function for some post-processing entries for related
    forms (e.g., in word head). This returns a list of list of tags to be
    added to following related forms or None (cf. walrus/English word head,
    parenthesized part starting with "both").

    wxr -- extraction context (page title, section, debug logging)
    data -- word entry dict that receives forms/tags/topics/alt_of/etc.
    tags_lst -- textual tag descriptions to decode and attach to the form
    related_list -- words making up the related form (joined with spaces)
    origtext -- original head text, used only in debug messages
    add_all_canonicals -- if true, record the canonical form even when it
        equals the page title
    is_reconstruction -- if true, a leading "*" is stripped from the form
    head_group -- optional head number stored as "head_nr" on the form
    ruby_data -- (kanji, reading) pairs used to re-extract quoted ruby
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(tags_lst, (list, tuple))
    for x in tags_lst:
        assert isinstance(x, str)
    assert isinstance(related_list, (list, tuple))
    assert isinstance(origtext, str)
    assert add_all_canonicals in (True, False)
    assert isinstance(ruby_data, (list, tuple)) or ruby_data is None
    if ruby_data is None:
        ruby_data = []
    related = " ".join(related_list)
    # print("add_related: tags_lst={} related={}".format(tags_lst, related))
    if related == "[please provide]":
        return None
    if related in IGNORED_RELATED:
        return None
    if is_reconstruction and related.startswith("*") and len(related) > 1:
        related = related[1:]

    # Get title word, with any reconstruction prefix removed
    titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title)  # type:ignore[arg-type]

    def check_related(related: str) -> None:
        # Warn about some suspicious related forms (single gender letters,
        # or matches of suspicious_related_re not present in the title),
        # except in a few known-benign situations listed below.
        m = re.search(suspicious_related_re, related)
        if (m and m.group(0) not in titleword) or (
            related in ("f", "m", "n", "c") and len(titleword) >= 3
        ):
            if "eumhun" in tags_lst:
                return
            if "cangjie-input" in tags_lst:
                return
            if "class" in tags_lst:
                return
            if wxr.wtp.section == "Korean" and re.search(
                r"^\s*\w*>\w*\s*$", related
            ):
                # ignore Korean "i>ni" / "라>나" values
                return
            if (
                wxr.wtp.section == "Burmese"
                and "romanization" in tags_lst
                and re.search(r":", related)
            ):
                # ignore Burmese with ":", that is used in Burmese
                # translitteration of "း", the high-tone visarga.
                return
            wxr.wtp.debug(
                "suspicious related form tags {}: {!r} in {!r}".format(
                    tags_lst, related, origtext
                ),
                sortid="form_descriptions/1147",
            )

    following_tagsets = None  # Tagsets to add to following related forms
    roman = None  # romanization extracted from a trailing parenthesis
    tagsets1: list[tuple[str, ...]] = [tuple()]
    topics1: list[str] = []

    # A leading parenthesized part holds tags for this form; "all"/"both"
    # makes those tags also apply to the following related forms.
    m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related)
    if m:
        paren = m.group(1)
        related = related[m.end() :]
        m = re.match(r"^(all|both) (.*)", paren)
        if m:
            tagsets1, topics1 = decode_tags(m.group(2))
            following_tagsets = tagsets1
        else:
            tagsets1, topics1 = decode_tags(paren)
    else:
        # A trailing parenthesized part is either a codepoint ("U+..."),
        # a romanization of a non-Latin form, or more tags.
        m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related)
        if m:
            paren = m.group(1)
            if paren.startswith("U+"):
                related = related[: m.start()]
            else:
                cls = classify_desc(paren)
                if (
                    cls in ("romanization", "english")
                    and classify_desc(related[: m.start()]) == "other"
                ):
                    roman = paren
                    related = related[: m.start()]
                else:
                    related = related[: m.start()]
                    tagsets1, topics1 = decode_tags(paren)
    if related and related.startswith("{{"):
        wxr.wtp.debug(
            "{{ in word head form - possible Wiktionary error: {!r}".format(
                related
            ),
            sortid="form_descriptions/1177",
        )
        return None  # Likely Wiktionary coding error
    related = unquote_kept_parens(related)
    # Split related by "/" (e.g., grande/Spanish) superlative in head
    # Do not split if / in word title, see π//Japanese
    if len(related) > 5 and "/" not in wxr.wtp.title:  # type:ignore[operator]
        alts = split_at_comma_semi(related, separators=["/"])
    else:
        alts = [related]
    if ruby_data:
        # prepare some regex stuff in advance
        ks, rs = [], []
        for k, r in ruby_data:
            ks.append(re.escape(k))
            rs.append(re.escape(r))
        splitter = r"((?:{})__lrub__(?:{})__rrub__)".format(
            "|".join(ks), "|".join(rs)
        )
    for related in alts:
        # Per-alternative ruby extracted from quoted __lrub__/__rrub__ spans
        ruby: list[tuple[str, str]] = []
        if ruby_data:
            new_related = []
            rub_split = re.split(splitter, related)
            for s in rub_split:
                m = re.match(r"(.+)__lrub__(.+)__rrub__", s)
                if m:
                    # add ruby with (\1, \2)
                    ruby.append((m.group(1), m.group(2)))
                    new_related.append(m.group(1))
                else:
                    new_related.append(s)
            related = "".join(new_related)
        tagsets2, topics2 = decode_tags(" ".join(tags_lst))
        # Fan out over the cross product of the parenthesized tagsets and
        # the tagsets decoded from tags_lst.
        for tags1 in tagsets1:
            assert isinstance(tags1, (list, tuple))
            for tags2 in tagsets2:
                assert isinstance(tags1, (list, tuple))
                dt: LinkageData = {"word": related}
                if roman:
                    dt["roman"] = roman
                if ruby:
                    dt["ruby"] = ruby
                # alt-of / form-of / compound-of tags route the entry into
                # the corresponding list on data instead of "forms".
                if "alt-of" in tags2:
                    check_related(related)
                    data_extend(data, "tags", tags1)
                    data_extend(data, "tags", tags2)
                    data_extend(data, "topics", topics1)
                    data_extend(data, "topics", topics2)
                    data_append(data, "alt_of", dt)
                elif "form-of" in tags2:
                    check_related(related)
                    data_extend(data, "tags", tags1)
                    data_extend(data, "tags", tags2)
                    data_extend(data, "topics", topics1)
                    data_extend(data, "topics", topics2)
                    data_append(data, "form_of", dt)
                elif "compound-of" in tags2:
                    check_related(related)
                    data_extend(data, "tags", tags1)
                    data_extend(data, "tags", tags2)
                    data_extend(data, "topics", topics1)
                    data_extend(data, "topics", topics2)
                    data_append(data, "compound", related)
                else:
                    lang = wxr.wtp.section or "LANG_MISSING"
                    related, final_tags = parse_head_final_tags(
                        wxr, lang, related
                    )
                    # print("add_related: related={!r} tags1={!r} tags2={!r} "
                    #       "final_tags={!r}"
                    #       .format(related, tags1, tags2, final_tags))
                    tags = list(tags1) + list(tags2) + list(final_tags)
                    check_related(related)
                    form: FormData = {"form": related}
                    if head_group:
                        form["head_nr"] = head_group
                    if roman:
                        form["roman"] = roman
                    if ruby:
                        form["ruby"] = ruby
                    data_extend(form, "topics", topics1)
                    data_extend(form, "topics", topics2)
                    if topics1 or topics2:
                        wxr.wtp.debug(
                            "word head form has topics: {}".format(form),
                            sortid="form_descriptions/1233",
                        )
                    # Add tags from canonical form into the main entry
                    if "canonical" in tags:
                        if related in ("m", "f") and len(titleword) > 1:
                            wxr.wtp.debug(
                                "probably incorrect canonical form "
                                "{!r} ignored (probably tag combination "
                                "missing from xlat_head_map)".format(related),
                                sortid="form_descriptions/1241",
                            )
                            continue
                        if (
                            related != titleword
                            or add_all_canonicals
                            or topics1
                            or topics2
                            or ruby
                        ):
                            data_extend(form, "tags", sorted(set(tags)))
                        else:
                            # We won't add canonical form here; its tags
                            # (minus "canonical") go onto the main entry.
                            filtered_tags = list(
                                x for x in tags if x != "canonical"
                            )
                            data_extend(data, "tags", filtered_tags)
                            continue
                    else:
                        data_extend(form, "tags", sorted(set(tags)))
                    # Only insert if the form is not already there
                    for old in data.get("forms", ()):
                        if form == old:
                            break
                    else:
                        data_append(data, "forms", form)

    # If this form had pre-tags that started with "both" or "all", add those
    # tags also to following related forms that don't have their own tags
    # specified.
    return following_tagsets
# Issue #967, in English word forms sometimes forms are skipped because
# they are taggable words and their distw() is too big, like clipping from clip
# (presumably keyed by page title; values are the form words to keep —
# TODO confirm against the code that consults this table)
WORDS_WITH_FALSE_POSITIVE_TAGS: dict[str, list[str]] = {
    # XXX remember to change me back to clipping after tests.
    "clip": ["clipping"],
    "English": ["English", "Englishes"],
    "common": ["common", "commoner"],
}

# Words that look like forms but should not be taken as forms for these
# titles (here "countable"/"uncountable" are tags, not forms).
WORDS_WITH_FALSE_POSITIVE_FORMS: dict[str, list[str]] = {
    "unaccountability": ["countable", "uncountable"],
    "uncountability": ["countable", "uncountable"],
}

# Currently empty; counterpart of the table above — TODO confirm intended use.
FALSE_POSITIVE_MISSING_FORMS: dict[str, list[str]] = {}

# Tag words commonly associated with inflected forms
# (e.g. "past participle", "third-person singular").
FORM_ASSOCIATED_TAG_WORDS: set[str] = {
    "participle",
    "past",
    "present",
    "singular",
    "plural",
    "first-person",
    "second-person",
    "third-person",
    "gerund",
}
1798def parse_word_head(
1799 wxr: WiktextractContext,
1800 pos: str,
1801 text: str,
1802 data: WordData,
1803 is_reconstruction: bool,
1804 head_group: Optional[int],
1805 ruby=None,
1806 links=None,
1807) -> None:
1808 """Parses the head line for a word for in a particular language and
1809 part-of-speech, extracting tags and related forms."""
1810 assert isinstance(wxr, WiktextractContext)
1811 assert isinstance(pos, str)
1812 assert isinstance(text, str)
1813 assert isinstance(data, dict)
1814 assert isinstance(ruby, (list, tuple)) or ruby is None
1815 if ruby is None:
1816 ruby = []
1817 assert is_reconstruction in (True, False)
1818 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text))
1819 # print(f"PARSE_WORD_HEAD: {data=}")
1820 if links is None:
1821 links = []
1823 if len(links) > 0:
1824 # if we have link data (that is, links with stuff like commas and
1825 # spaces, replace word_re with a modified local scope pattern
1826 # print(f"links {list((c, ord(c)) for link in links for c in link)=}")
1827 word_re = re.compile(
1828 r"\b" # In case we have forms that are longer and contain links
1829 +
1830 # or words as a substring...
1831 r"\b|\b".join(
1832 sorted((re.escape(s) for s in links), key=lambda x: -len(x))
1833 )
1834 + r"\b|"
1835 + word_pattern
1836 )
1837 else:
1838 word_re = word_re_global
1840 if "Lua execution error" in text or "Lua timeout error" in text: 1840 ↛ 1841line 1840 didn't jump to line 1841 because the condition on line 1840 was never true
1841 return
1843 # Fix words with "superlative:" or "comparative:" at end of head
1844 # e.g. grande/Spanish/Adj
1845 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text)
1847 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb
1848 m = re.search(r", non-past ([^)]+ \([^)]+\))", text)
1849 if m:
1850 add_related(
1851 wxr,
1852 data,
1853 ["non-past"],
1854 [m.group(1)],
1855 text,
1856 True,
1857 is_reconstruction,
1858 head_group,
1859 ruby,
1860 )
1861 text = text[: m.start()] + text[m.end() :]
1863 language = wxr.wtp.section
1864 titleword = re.sub(
1865 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE"
1866 )
1867 titleparts = list(
1868 m.group(0)
1869 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE")
1870 )
1871 if not titleparts: 1871 ↛ 1872line 1871 didn't jump to line 1872 because the condition on line 1871 was never true
1872 return
1874 # Remove " or" from the end to prevent weird canonical forms
1875 if text.endswith(" or"):
1876 for tp in titleparts:
1877 if text.endswith(tp): 1877 ↛ 1878line 1877 didn't jump to line 1878 because the condition on line 1877 was never true
1878 break
1879 else:
1880 text = text.removesuffix(" or").rstrip()
1882 # Handle the part of the head that is not in parentheses. However, certain
1883 # parenthesized parts are part of word, and those must be handled
1884 # specially here.
1885 if ruby:
1886 text = quote_kept_ruby(wxr, ruby, text)
1887 base = text
1888 base = quote_kept_parens(base)
1889 base = remove_text_in_parentheses(base)
1890 base = base.replace("?", "") # Removes uncertain articles etc
1891 base = re.sub(r"\s+", " ", base)
1892 base = re.sub(r" ([,;])", r"\1", base)
1893 base = re.sub(r" • ", r" ", base)
1894 # Many languages use • as a punctuation mark separating the base
1895 # from the rest of the head. στάδιος/Ancient Greek, issue #176
1896 base = base.strip()
1897 # print(f"{base=}")
1899 # Check for certain endings in head (mostly for compatibility with weird
1900 # heads, e.g. rata/Romanian "1st conj." at end)
1901 m = re.search(head_end_re, base)
1902 tags: Union[tuple[str, ...], list[str]] = []
1903 if m: 1903 ↛ 1904line 1903 didn't jump to line 1904 because the condition on line 1903 was never true
1904 tags = head_end_map[m.group(1).lower()].split()
1905 data_extend(data, "tags", tags)
1906 base = base[: m.start()]
1908 # Special case: handle Hán Nôm readings for Vietnamese characters
1909 m = re.match(
1910 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base
1911 )
1912 if m: 1912 ↛ 1913line 1912 didn't jump to line 1913 because the condition on line 1912 was never true
1913 tag, readings = m.groups()
1914 tag = re.sub(r"\s+", "-", tag)
1915 for reading in split_at_comma_semi(readings, skipped=links):
1916 add_related(
1917 wxr,
1918 data,
1919 [tag],
1920 [reading],
1921 text,
1922 True,
1923 is_reconstruction,
1924 head_group,
1925 ruby,
1926 )
1927 return
1929 # Special case: Hebrew " [pattern: nnn]" ending
1930 m = re.search(r"\s+\[pattern: ([^]]+)\]", base)
1931 if m: 1931 ↛ 1932line 1931 didn't jump to line 1932 because the condition on line 1931 was never true
1932 add_related(
1933 wxr,
1934 data,
1935 ["class"],
1936 [m.group(1)],
1937 text,
1938 True,
1939 is_reconstruction,
1940 head_group,
1941 ruby,
1942 )
1943 base = base[: m.start()] + base[m.end() :]
1945 # Clean away some messy "Upload an image" template text used in
1946 # American Sign Language:
1947 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp
1948 m = re.search(r"Upload .+ gif image.", base)
1949 if m: 1949 ↛ 1950line 1949 didn't jump to line 1950 because the condition on line 1949 was never true
1950 base = base[: m.start()] + base[m.end() :]
1952 semicolon_present = False
1953 # Split the head into alternatives. This is a complicated task, as
1954 # we do not want so split on "or" or "," when immediately followed by more
1955 # head-final tags, but otherwise do want to split by them.
1956 # 20230907 added "or" to this to handle 'true or false', titles with 'or'
1957 if wxr.wtp.title and (
1958 "," in wxr.wtp.title or ";" in wxr.wtp.title or " or " in wxr.wtp.title
1959 ):
1960 # If the title has ";", we don't want to split on that and can remove
1961 # the ; from the splitting regex pretty easily because it's uncommon.
1962 # However, commas are so common that not splitting on them is just
1963 # not feasible, and we have to just deal with that if there are
1964 # alternative forms or variations with stray commas that shouldn't
1965 # be split.
1966 if ";" in wxr.wtp.title:
1967 semicolon_present = True
1968 base = base.replace(";", "<SEMICOLON>")
1969 default_splitter = head_split_no_semicolon_re
1970 else:
1971 default_splitter = head_split_re
1972 # A kludge to handle article titles/phrases with commas.
1973 # Preprocess splits to first capture the title, then handle
1974 # all the others as usual.
1975 presplits = re.split(r"({})".format(wxr.wtp.title), base)
1976 splits = []
1977 for psplit in presplits:
1978 if psplit == wxr.wtp.title:
1979 splits.append(psplit)
1980 else:
1981 splits.extend(re.split(default_splitter, psplit))
1982 else:
1983 # Do the normal split; previous only-behavior.
1984 splits = re.split(head_split_re, base)
1985 # print("BASE: ", repr(base))
1986 # print("SPLITS:", splits)
1987 alts: list[str] = []
1988 # print("parse_word_head: splits:", splits,
1989 # "head_split_re_parens:", head_split_re_parens)
1990 for i in range(
1991 0, len(splits) - head_split_re_parens, head_split_re_parens + 1
1992 ):
1993 v = splits[i]
1994 ending = splits[i + 1] or "" # XXX is this correct???
1995 # print("parse_word_head alts v={!r} ending={!r} alts={}"
1996 # .format(v, ending, alts))
1997 if alts and (v == "" and ending):
1998 assert ending[0] == " "
1999 alts[-1] += " or" + ending # endings starts with space
2000 elif v or ending:
2001 alts.append((v or "") + (ending or ""))
2002 last = splits[-1].strip()
2003 conn = "" if len(splits) < 3 else splits[-2]
2004 # print("parse_word_head alts last={!r} conn={!r} alts={}"
2005 # .format(last, conn, alts))
2006 if ( 2006 ↛ 2017line 2006 didn't jump to line 2017 because the condition on line 2006 was never true
2007 alts
2008 and last
2009 and (
2010 last.split()[0] in xlat_head_map
2011 or (
2012 conn == " or "
2013 and (alts[-1] + " or " + last).strip() in xlat_head_map
2014 )
2015 )
2016 ):
2017 alts[-1] += " or " + last
2018 elif last: 2018 ↛ 2019line 2018 didn't jump to line 2019 because the condition on line 2018 was never true
2019 alts.append(last)
2021 # print("parse_word_head alts: {}".format(alts))
2022 # print(f"{base=}")
2024 # Process the head alternatives
2025 canonicals: list[tuple[list[str], list[str]]] = []
2026 mode: Optional[str] = None
2027 for alt_i, alt in enumerate(alts):
2028 alt = alt.strip()
2029 if alt.startswith("compound form:"): 2029 ↛ 2030line 2029 didn't jump to line 2030 because the condition on line 2029 was never true
2030 mode = "compound-form"
2031 alt = alt[14:].strip()
2032 if ((dash_i := alt.find(" -")) > 0) and (
2033 dash_i > (wxr.wtp.title or "").find(" -")
2034 ):
2035 # test_en_head / test_suffixes_at_end_of_form1
2036 # Some heads have suffixes that end up attached to the form
2037 # like in https://en.wiktionary.org/wiki/%E6%A5%BD%E3%81%97%E3%81%84
2038 alt = alt[:dash_i]
2039 if mode == "compound-form": 2039 ↛ 2040line 2039 didn't jump to line 2040 because the condition on line 2039 was never true
2040 add_related(
2041 wxr,
2042 data,
2043 ["in-compounds"],
2044 [alt],
2045 text,
2046 True,
2047 is_reconstruction,
2048 head_group,
2049 ruby,
2050 )
2051 continue
2052 # For non-first parts, see if it can be treated as tags-only
2053 if alt_i == 0:
2054 expanded_alts = [alt]
2055 else:
2056 expanded_alts = map_with(xlat_descs_map, [alt])
2057 # print("EXPANDED_ALTS:", expanded_alts)
2058 tagsets: Optional[list[tuple[str, ...]]]
2059 for alt in expanded_alts:
2060 baseparts = list(m.group(0) for m in word_re.finditer(alt))
2061 if alt_i > 0:
2062 tagsets, topics = decode_tags(" ".join(baseparts))
2063 if not any("error-unknown-tag" in x for x in tagsets):
2064 data_extend(data, "topics", topics)
2065 for tags1 in tagsets:
2066 data_extend(data, "tags", tags1)
2067 continue
2069 alt, tags = parse_head_final_tags(
2070 wxr, language or "MISSING_LANG", alt
2071 )
2072 tags = list(tags) # Make sure we don't modify anything cached
2073 tags.append("canonical")
2074 if alt_i == 0 and "," in wxr.wtp.title or ";" in wxr.wtp.title: # type:ignore[operator]
2075 # Kludge to handle article titles/phrases with commas.
2076 # basepart's regex strips commas, which leads to a
2077 # canonical form that is the title phrase without a comma.
2078 # basepart in add_related is almost immediately joined with
2079 # spaces anyhow. XXX not exactly sure why it's
2080 # canonicals.append((tags, baseparts)) and not (tags, [alt])
2081 baseparts = [alt]
2082 canonicals.append((tags, baseparts))
2084 # If more of this kind of replace-and-return-original kind of stuff is
2085 # needed, make semicolon_present into a flag enum, something like `modified`
2086 if semicolon_present:
2087 new_cans = []
2088 for tags, baseparts in canonicals:
2089 new_cans.append(
2090 (tags, [s.replace("<SEMICOLON>", ";") for s in baseparts])
2091 )
2092 canonicals = new_cans
2093 for tags, baseparts in canonicals:
2094 add_related(
2095 wxr,
2096 data,
2097 tags,
2098 baseparts,
2099 text,
2100 len(canonicals) > 1,
2101 is_reconstruction,
2102 head_group,
2103 ruby,
2104 )
2106 # Handle parenthesized descriptors for the word form and links to
2107 # related words
2108 text = quote_kept_parens(text)
2109 parens = list(
2110 m.group(2)
2111 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text)
2112 )
2113 parens.extend(
2114 m.group(1)
2115 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text)
2116 )
2117 have_romanization = False
2118 have_ruby = False
2119 hiragana = ""
2120 katakana = ""
2121 for paren in parens:
2122 paren = paren.strip()
2123 if not paren: 2123 ↛ 2124line 2123 didn't jump to line 2124 because the condition on line 2123 was never true
2124 continue
2125 if paren.startswith("see "):
2126 continue
2127 if paren.startswith("U+"): 2127 ↛ 2128line 2127 didn't jump to line 2128 because the condition on line 2127 was never true
2128 continue
2129 # In some rare cases, strip word that inflects form the form
2130 # description, e.g. "look through rose-tinted glasses"/English.
2131 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren)
2133 # If it starts with hiragana or katakana, treat as such form. Note
2134 # that each hiragana/katakana character is in separate parentheses,
2135 # so we must concatenate them.
2136 try:
2137 un = unicodedata.name(paren[0]).split()[0]
2138 except ValueError:
2139 un = "INVALID"
2140 if un == "KATAKANA": 2140 ↛ 2141line 2140 didn't jump to line 2141 because the condition on line 2140 was never true
2141 katakana += paren
2142 have_ruby = True
2143 continue
2144 if un == "HIRAGANA": 2144 ↛ 2145line 2144 didn't jump to line 2145 because the condition on line 2144 was never true
2145 hiragana += paren
2146 have_ruby = True
2147 continue
2149 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes,"
2150 # in the middle of the parenthesized expression, e.g. 薄
2151 def strokes_repl(m: re.Match) -> str:
2152 strokes1, tags1, strokes2, tags2 = m.groups()
2153 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]:
2154 tags = tags.split(", ")
2155 tags = list(
2156 "Mainland China" if t == "Mainland" else t for t in tags
2157 )
2158 tags.append("strokes")
2159 add_related(
2160 wxr,
2161 data,
2162 tags,
2163 [strokes],
2164 text,
2165 True,
2166 is_reconstruction,
2167 head_group,
2168 ruby,
2169 )
2170 return ", "
2172 paren = re.sub(
2173 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ",
2174 strokes_repl,
2175 paren,
2176 )
2178 descriptors = map_with(xlat_descs_map, [paren])
2179 new_desc = []
2180 for desc in descriptors:
2181 new_desc.extend(
2182 map_with(
2183 xlat_tags_map,
2184 split_at_comma_semi(desc, extra=[", or "], skipped=links),
2185 )
2186 )
2187 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None
2188 following_tags = None # Added to prev_tags from previous parenthesized
2189 # part, e.g. walrus/English
2190 # "(both nonstandard, proscribed, uncommon)"
2191 for desc_i, desc in enumerate(new_desc):
2192 # print("HEAD DESC: {!r}".format(desc))
2194 # Abort on certain descriptors (assume remaining values are
2195 # examples or uninteresting, cf. gaan/Navajo, horior/Latin)
2196 if re.match(r"^(per |e\.g\.$)", desc): 2196 ↛ 2197line 2196 didn't jump to line 2197 because the condition on line 2196 was never true
2197 break
2199 # If it all consists of CJK characters, add it with the
2200 # CJK tag. This is used at least for some Vietnamese
2201 # words (e.g., ba/Vietnamese)
2202 try:
2203 if all(unicodedata.name(x).startswith("CJK ") for x in desc): 2203 ↛ 2204line 2203 didn't jump to line 2204 because the condition on line 2203 was never true
2204 add_related(
2205 wxr,
2206 data,
2207 ["CJK"],
2208 [desc],
2209 text,
2210 True,
2211 is_reconstruction,
2212 head_group,
2213 ruby,
2214 )
2215 continue
2216 except ValueError:
2217 pass
2219 # Handle some special cases
2220 splitdesc = desc.split()
2221 if ( 2221 ↛ 2230line 2221 didn't jump to line 2230 because the condition on line 2221 was never true
2222 len(splitdesc) >= 3
2223 and splitdesc[1] == "superlative"
2224 and classify_desc(splitdesc[0]) != "tags"
2225 and prev_tags
2226 ):
2227 # Handle the special case of second comparative after comma,
2228 # followed by superlative without comma. E.g.
2229 # mal/Portuguese/Adv
2230 for ts in prev_tags:
2231 add_related(
2232 wxr,
2233 data,
2234 ts,
2235 [splitdesc[0]],
2236 text,
2237 True,
2238 is_reconstruction,
2239 head_group,
2240 ruby,
2241 )
2242 desc = " ".join(splitdesc[1:])
2243 elif ( 2243 ↛ 2251line 2243 didn't jump to line 2251 because the condition on line 2243 was never true
2244 len(splitdesc) == 2
2245 and splitdesc[0] in ("also", "and")
2246 and prev_tags
2247 and classify_desc(splitdesc[1]) != "tags"
2248 ):
2249 # Sometimes alternative forms are prefixed with "also" or
2250 # "and"
2251 for ts in prev_tags:
2252 add_related(
2253 wxr,
2254 data,
2255 ts,
2256 [splitdesc[1]],
2257 text,
2258 True,
2259 is_reconstruction,
2260 head_group,
2261 ruby,
2262 )
2263 continue
2264 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2264 ↛ 2265line 2264 didn't jump to line 2265 because the condition on line 2264 was never true
2265 continue
2267 # If only one word, assume it is comma-separated alternative
2268 # to the previous one
2269 if " " not in desc:
2270 cls = classify_desc(desc)
2271 if cls != "tags":
2272 if prev_tags: 2272 ↛ 2274line 2272 didn't jump to line 2274 because the condition on line 2272 was never true
2273 # Assume comma-separated alternative to previous one
2274 for ts in prev_tags:
2275 add_related(
2276 wxr,
2277 data,
2278 ts,
2279 [desc],
2280 text,
2281 True,
2282 is_reconstruction,
2283 head_group,
2284 ruby,
2285 )
2286 continue
2287 elif distw(titleparts, desc) <= 0.5: 2287 ↛ 2290line 2287 didn't jump to line 2290 because the condition on line 2287 was never true
2288 # Similar to head word, assume a dialectal variation to
2289 # the base form. Cf. go/Alemannic German/Verb
2290 add_related(
2291 wxr,
2292 data,
2293 ["alternative"],
2294 [desc],
2295 text,
2296 True,
2297 is_reconstruction,
2298 head_group,
2299 ruby,
2300 )
2301 continue
2302 elif (
2303 cls in ("romanization", "english")
2304 and not have_romanization
2305 and classify_desc(titleword) == "other"
2306 and not (
2307 "categories" in data and desc in data["categories"]
2308 )
2309 ):
2310 # Assume it to be a romanization
2311 add_romanization(
2312 wxr,
2313 data,
2314 desc,
2315 text,
2316 is_reconstruction,
2317 head_group,
2318 ruby,
2319 )
2320 have_romanization = True
2321 continue
2323 m = re.match(r"^(\d+) strokes?$", desc)
2324 if m:
2325 # Special case, used to give #strokes for Han characters
2326 add_related(
2327 wxr,
2328 data,
2329 ["strokes"],
2330 [m.group(1)],
2331 text,
2332 True,
2333 is_reconstruction,
2334 head_group,
2335 ruby,
2336 )
2337 continue
2339 # See if it is radical+strokes
2340 m = re.match(
2341 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF"
2342 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)"
2343 r"( in (Japanese|Chinese|traditional Chinese|"
2344 r"simplified Chinese))?$",
2345 desc,
2346 )
2347 if m: 2347 ↛ 2350line 2347 didn't jump to line 2350 because the condition on line 2347 was never true
2348 # Special case, used to give radical + strokes for Han
2349 # characters
2350 radical_strokes = m.group(1)
2351 lang = m.group(3)
2352 t = ["radical+strokes"]
2353 if lang:
2354 t.extend(lang.split())
2355 add_related(
2356 wxr,
2357 data,
2358 t,
2359 [radical_strokes],
2360 text,
2361 True,
2362 is_reconstruction,
2363 head_group,
2364 ruby,
2365 )
2366 prev_tags = None
2367 following_tags = None
2368 continue
2370 # See if it indicates historical Katakana ortography (←) or
2371 # just otherwise katakana/hiragana form
2372 m = re.match(r"←\s*|kana\s+", desc)
2373 if m: 2373 ↛ 2374line 2373 didn't jump to line 2374 because the condition on line 2373 was never true
2374 if desc.startswith("←"):
2375 t1 = "historical "
2376 else:
2377 t1 = ""
2378 x = desc[m.end() :]
2379 if x.endswith("?"):
2380 x = x[:-1]
2381 # XXX should we add a tag indicating uncertainty?
2382 if x:
2383 name = unicodedata.name(x[0])
2384 if name.startswith("HIRAGANA "):
2385 desc = t1 + "hiragana " + x
2386 elif name.startswith("KATAKANA "):
2387 desc = t1 + "katakana " + x
2389 # See if it is "n strokes in Chinese" or similar
2390 m = re.match(
2391 r"(\d+) strokes in (Chinese|Japanese|"
2392 r"traditional Chinese|simplified Chinese)$",
2393 desc,
2394 )
2395 if m: 2395 ↛ 2397line 2395 didn't jump to line 2397 because the condition on line 2395 was never true
2396 # Special case, used to give just strokes for some Han chars
2397 strokes = m.group(1)
2398 lang = m.group(2)
2399 t = ["strokes"]
2400 t.extend(lang.split())
2401 add_related(
2402 wxr,
2403 data,
2404 t,
2405 [strokes],
2406 text,
2407 True,
2408 is_reconstruction,
2409 head_group,
2410 ruby,
2411 )
2412 prev_tags = None
2413 following_tags = None
2414 continue
2416 # American Sign Language has images (or requests for image)
2417 # as heads, + this ASL gloss after.
2418 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text)
2419 if m2: 2419 ↛ 2420line 2419 didn't jump to line 2420 because the condition on line 2419 was never true
2420 add_related(
2421 wxr,
2422 data,
2423 ["ASL-gloss"],
2424 [m2.group(1)],
2425 text,
2426 True,
2427 is_reconstruction,
2428 head_group,
2429 ruby,
2430 )
2431 continue
2433 parts = list(m.group(0) for m in re.finditer(word_re, desc))
2434 if not parts: 2434 ↛ 2435line 2434 didn't jump to line 2435 because the condition on line 2434 was never true
2435 prev_tags = None
2436 following_tags = None
2437 continue
2439 # Check for certain language-specific header part starts that
2440 # modify
2441 if len(parts) == 2 and language in lang_specific_head_map: 2441 ↛ 2442line 2441 didn't jump to line 2442 because the condition on line 2441 was never true
2442 ht = lang_specific_head_map[language]
2443 if parts[0] in ht:
2444 rem_tags, add_tags = ht[parts[0]]
2445 new_prev_tags1: list[list[str]] = []
2446 tags2: Union[tuple[str, ...], list[str]]
2447 for tags2 in prev_tags or [()]:
2448 if rem_tags is True: # Remove all old tags
2449 tsets = set()
2450 else:
2451 tsets = set(tags2) - set(rem_tags.split())
2452 tsets = tsets | set(add_tags.split())
2453 tags = list(sorted(tsets))
2454 add_related(
2455 wxr,
2456 data,
2457 tags,
2458 [parts[1]],
2459 text,
2460 True,
2461 is_reconstruction,
2462 head_group,
2463 ruby,
2464 )
2465 new_prev_tags1.append(tags)
2466 prev_tags = new_prev_tags1
2467 following_tags = None
2468 continue
2470 # Handle the special case of descriptors that are parenthesized,
2471 # e.g., (archaic or Scotland)
2472 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc)
2473 if m is not None and classify_desc(m.group(1)) == "tags": 2473 ↛ 2474line 2473 didn't jump to line 2474 because the condition on line 2473 was never true
2474 tagpart = m.group(1)
2475 related = [m.group(2)]
2476 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True)
2477 if topics:
2478 wxr.wtp.debug(
2479 "parenthized head part {!r} contains topics: {}".format(
2480 tagpart, topics
2481 ),
2482 sortid="form_descriptions/1647",
2483 )
2484 elif m is not None and re.match(r"in the sense ", m.group(1)): 2484 ↛ 2487line 2484 didn't jump to line 2487 because the condition on line 2484 was never true
2485 # Handle certain ignored cases
2486 # e.g. bord/Danish: in the sense "plank"
2487 related = [m.group(2)]
2488 tagsets = [()]
2489 else:
2490 # Normal parsing of the descriptor
2491 alt_related = None
2492 alt_tagsets = None
2493 tagsets = None
2494 for i in range(len(parts), 0, -1):
2495 related = parts[i:]
2496 tagparts = parts[:i]
2497 # print(" i={} related={} tagparts={}"
2498 # .format(i, related, tagparts))
2499 tagsets, topics = decode_tags(
2500 " ".join(tagparts), no_unknown_starts=True
2501 )
2502 # print("tagparts={!r} tagsets={} topics={} related={} "
2503 # "alt_related={} distw={:.2f}"
2504 # .format(tagparts, tagsets, topics, related,
2505 # alt_related,
2506 # distw(titleparts, parts[i - 1])))
2507 if (
2508 topics
2509 or not tagsets
2510 or any("error-unknown-tag" in x for x in tagsets)
2511 ):
2512 if alt_related is not None: 2512 ↛ 2514line 2512 didn't jump to line 2514 because the condition on line 2512 was never true
2513 # We already had a good division, so let's stop.
2514 break
2515 # Bad division, try deeper
2516 continue
2517 # print(f"{parts[i-1]=}, {parts=}")
2518 if (
2519 i > 1
2520 and len(parts[i - 1]) >= 4
2521 and (
2522 distw(titleparts, parts[i - 1]) <= 0.4
2523 or (
2524 wxr.wtp.section == "English"
2525 and wxr.wtp.title
2526 in WORDS_WITH_FALSE_POSITIVE_TAGS
2527 and parts[i - 1]
2528 in WORDS_WITH_FALSE_POSITIVE_TAGS[wxr.wtp.title]
2529 )
2530 )
2531 # Fixes 'unaccountability' wiktext #1196
2532 and not (
2533 wxr.wtp.section == "English"
2534 and wxr.wtp.title in WORDS_WITH_FALSE_POSITIVE_FORMS
2535 and parts[i - 1]
2536 in WORDS_WITH_FALSE_POSITIVE_FORMS[wxr.wtp.title]
2537 )
2538 # Fixes wiktextract #983, where "participle"
2539 # was too close to "Martinize" and so this accepted
2540 # ["participle", "Martinize"] as matching; this
2541 # kludge prevents this from happening if titleparts
2542 # is shorter than what would be 'related'.
2543 # This breaks if we want to detect stuff that
2544 # actually gets an extra space-separated word when
2545 # 'inflected'.
2546 and (
2547 len(titleparts) >= len(parts[i - 1 :])
2548 or "or" in parts[i - 1 :]
2549 )
2550 ):
2551 # print(f"Reached; {parts=}, {parts[i-1]=}")
2552 alt_related = related
2553 alt_tagsets = tagsets
2554 continue
2555 alt_related = None
2556 alt_tagsets = None
2557 break
2558 else:
2559 if alt_related is None: 2559 ↛ 2591line 2559 didn't jump to line 2591 because the condition on line 2559 was always true
2560 # Check if the parenthesized part is likely a
2561 # romanization
2562 if ( 2562 ↛ 2570line 2562 didn't jump to line 2570 because the condition on line 2562 was never true
2563 (have_ruby or classify_desc(base) == "other")
2564 and classify_desc(paren) == "romanization"
2565 and not (
2566 "categories" in data
2567 and desc in data["categories"]
2568 )
2569 ):
2570 for r in split_at_comma_semi(
2571 paren, extra=[" or "], skipped=links
2572 ):
2573 add_romanization(
2574 wxr,
2575 data,
2576 r,
2577 text,
2578 is_reconstruction,
2579 head_group,
2580 ruby,
2581 )
2582 have_romanization = True
2583 continue
2584 tagsets = [("error-unrecognized-head-form",)]
2585 wxr.wtp.debug(
2586 "unrecognized head form: {}".format(desc),
2587 sortid="form_descriptions/1698",
2588 )
2589 continue
2591 if alt_related is not None: 2591 ↛ 2592line 2591 didn't jump to line 2592 because the condition on line 2591 was never true
2592 related = alt_related
2593 tagsets = alt_tagsets
2595 # print("FORM END: tagsets={} related={}".format(tagsets, related))
2596 # print("==================")
2598 if ( 2598 ↛ 2619line 2598 didn't jump to line 2619 because the condition on line 2598 was never true
2599 len(related) <= 0
2600 and wxr.wtp.section == "English"
2601 and tagsets is not None
2602 and len(tagsets) > 0
2603 and not any(
2604 s.startswith("error-") for tagset in tagsets for s in tagset
2605 )
2606 and any(
2607 s in FORM_ASSOCIATED_TAG_WORDS
2608 for tagset in tagsets
2609 for s in tagset
2610 )
2611 and (
2612 wxr.wtp.title not in FALSE_POSITIVE_MISSING_FORMS
2613 and not any(
2614 rel in FALSE_POSITIVE_MISSING_FORMS[wxr.wtp.title or ""]
2615 for rel in related
2616 )
2617 )
2618 ):
2619 wxr.wtp.debug(
2620 f"Form tags without form: {desc=}, {tagsets=}",
2621 sortid="form_description/20250107",
2622 )
2623 if not tagsets: 2623 ↛ 2624line 2623 didn't jump to line 2624 because the condition on line 2623 was never true
2624 continue
2626 # print(f"{alts=}, {related=}")
2628 assert isinstance(related, (list, tuple))
2629 related_str = " ".join(related)
2630 if "or" in titleparts:
2631 alts = [related_str]
2632 else:
2633 alts = split_at_comma_semi(
2634 related_str, separators=[r"\bor\b"], skipped=links
2635 )
2636 # print(f"{related_str=}, {alts=}")
2637 if not alts:
2638 alts = [""]
2639 for related_str in alts:
2640 if related_str:
2641 if prev_tags and (
2642 all(
2643 all(
2644 t in ["nonstandard", "dialectal"]
2645 or valid_tags[t] == "dialect"
2646 for t in tags
2647 )
2648 for ts in tagsets
2649 )
2650 or (
2651 any("participle" in ts for ts in prev_tags)
2652 and all(
2653 "attributive" in ts
2654 or any(valid_tags[t] == "gender" for t in ts)
2655 for ts in tagsets
2656 )
2657 )
2658 ):
2659 # Merged with previous tags. Don't update previous
2660 # tags here; cf. burn/English/Verb
2661 for tags_l in tagsets:
2662 for ts in prev_tags:
2663 tags_l1 = sorted(set(tags_l) | set(ts))
2664 add_related(
2665 wxr,
2666 data,
2667 tags_l1,
2668 [related_str],
2669 text,
2670 True,
2671 is_reconstruction,
2672 head_group,
2673 ruby,
2674 )
2675 else:
2676 # Not merged with previous tags
2677 for tags_l in tagsets:
2678 if following_tags is not None: 2678 ↛ 2679line 2678 didn't jump to line 2679 because the condition on line 2678 was never true
2679 for ts in following_tags:
2680 tags_l1 = list(
2681 sorted(set(tags_l) | set(ts))
2682 )
2683 add_related(
2684 wxr,
2685 data,
2686 tags_l1,
2687 [related_str],
2688 text,
2689 True,
2690 is_reconstruction,
2691 head_group,
2692 ruby,
2693 )
2694 else:
2695 ret = add_related(
2696 wxr,
2697 data,
2698 tags_l,
2699 [related_str],
2700 text,
2701 True,
2702 is_reconstruction,
2703 head_group,
2704 ruby,
2705 )
2706 if ret is not None: 2706 ↛ 2707line 2706 didn't jump to line 2707 because the condition on line 2706 was never true
2707 following_tags = ret
2708 prev_tags = tagsets
2709 else:
2710 if desc_i < len(new_desc) - 1 and all( 2710 ↛ 2717line 2710 didn't jump to line 2717 because the condition on line 2710 was never true
2711 "participle" in ts or "infinitive" in ts
2712 for ts in tagsets
2713 ):
2714 # Interpret it as a standalone form description
2715 # in the middle, probably followed by forms or
2716 # language-specific descriptors. cf. drikke/Danish
2717 new_prev_tags2 = []
2718 for ts1 in prev_tags or [()]:
2719 for ts2 in tagsets:
2720 ts = tuple(sorted(set(ts1) | set(ts2)))
2721 new_prev_tags2.append(ts)
2722 prev_tags = new_prev_tags2
2723 continue
2724 for tags in tagsets:
2725 data_extend(data, "tags", tags)
2726 prev_tags = tagsets
2727 following_tags = None
2729 # Finally, if we collected hirakana/katakana, add them now
2730 if hiragana: 2730 ↛ 2731line 2730 didn't jump to line 2731 because the condition on line 2730 was never true
2731 add_related(
2732 wxr,
2733 data,
2734 ["hiragana"],
2735 [hiragana],
2736 text,
2737 True,
2738 is_reconstruction,
2739 head_group,
2740 ruby,
2741 )
2742 if katakana: 2742 ↛ 2743line 2742 didn't jump to line 2743 because the condition on line 2742 was never true
2743 add_related(
2744 wxr,
2745 data,
2746 ["katakana"],
2747 [katakana],
2748 text,
2749 True,
2750 is_reconstruction,
2751 head_group,
2752 ruby,
2753 )
2755 # XXX check if this is actually relevant, tags in word root data
2756 # is extremely rare (not sure where they slip through).
2757 tags = data.get("tags", []) # type:ignore
2758 if len(tags) > 0:
2759 # wxr.wtp.debug(
2760 # f"Tags appear in word root data: {data['tags']=}", # type:ignore
2761 # sortid="form_descriptions/2620/20240606",
2762 # ) # Messes up tests.
2763 data["tags"] = sorted(set(tags)) # type:ignore
def parse_sense_qualifier(
    wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData]
) -> None:
    """Parse sense/linkage qualifier text into tags, topics, taxonomic
    names, or an English qualifier string, storing the results in ``data``."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(text, str)
    assert isinstance(data, dict)
    # Strip a single enclosing pair of parentheses or double quotes.
    if re.match(r"\([^()]+\)$", text):
        text = text[1:-1]
    if re.match(r'"[^"]+"$', text):
        text = text[1:-1]
    expanded = map_with(xlat_descs_map, [text])
    collected_tags: list[str] = []
    for desc_text in expanded:
        for part in split_at_comma_semi(desc_text):
            if not part:
                continue
            # Keep the untruncated form; "qualifier" values use it verbatim.
            original_part = part
            # Classification only looks at the text before any colon.
            part, _, _ = part.partition(":")
            cls = classify_desc(part, allow_unknown_tags=True)
            if cls == "tags":
                tagsets, topics = decode_tags(part)
                data_extend(data, "topics", topics)
                # XXX should think how to handle distinct options better,
                # e.g., "singular and plural genitive"; that can't really be
                # done with changing the calling convention of this function.
                # Should split sense if more than one category of tags differs.
                for ts in tagsets:
                    collected_tags.extend(ts)
            elif cls == "taxonomic":
                # A leading "×" before a capital marks an extinct taxon.
                if re.match(r"×[A-Z]", part):
                    collected_tags.append("extinct")
                    part = part[1:]
                data["taxonomic"] = part
            elif cls == "english":
                # Accumulate distinct English qualifiers, "; "-separated.
                if "qualifier" in data and data["qualifier"] != original_part:
                    data["qualifier"] += "; " + original_part
                else:
                    data["qualifier"] = original_part
            else:
                wxr.wtp.debug(
                    "unrecognized sense qualifier: {}".format(desc_text),
                    sortid="form_descriptions/1831",
                )
    data_extend(data, "tags", sorted(set(collected_tags)))
def parse_pronunciation_tags(
    wxr: WiktextractContext, text: str, data: SoundData
) -> None:
    """Parse a pronunciation qualifier string into tags/topics on ``data``;
    anything that is not a clean single-word tag ends up in ``data["note"]``."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(text, str)
    assert isinstance(data, dict)
    text = text.strip()
    if not text:
        return
    note_parts: list[str] = []
    if classify_desc(text) == "tags":
        tagsets, topics = decode_tags(text)
        data_extend(data, "topics", topics)
        for tagset in tagsets:
            for tag in tagset:
                # Multi-word "tags" are really free-form notes.
                if " " not in tag:
                    data_append(data, "tags", tag)
                else:
                    note_parts.append(tag)
    else:
        note_parts.append(text)
    if note_parts:
        data["note"] = "; ".join(note_parts)
2846def parse_translation_desc(
2847 wxr: WiktextractContext, lang: str, text: str, tr: TranslationData
2848) -> None:
2849 assert isinstance(wxr, WiktextractContext)
2850 assert isinstance(lang, str) # The language of ``text``
2851 assert isinstance(text, str)
2852 assert isinstance(tr, dict)
2853 # print("parse_translation_desc:", text)
2855 # Process all parenthesized parts from the translation item
2856 note = None
2857 restore_beginning = ""
2858 restore_end = ""
2859 while True:
2860 beginning = False
2861 # See if we can find a parenthesized expression at the end
2862 m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text)
2863 if m:
2864 par = m.group(1)
2865 text = text[: m.start()]
2866 if par.startswith(("literally ", "lit.")):
2867 continue # Not useful for disambiguation in many idioms
2868 else:
2869 # See if we can find a parenthesized expression at the start
2870 m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text)
2871 if m:
2872 par = m.group(1)
2873 text = text[m.end() :]
2874 beginning = True
2875 if re.match(r"^(\d|\s|,| or | and )+$", par): 2875 ↛ 2880line 2875 didn't jump to line 2880 because the condition on line 2875 was never true
2876 # Looks like this beginning parenthesized expression only
2877 # contains digits or their combinations. We assume such
2878 # to be sense descriptions if no sense has been selected,
2879 # or otherwise just ignore them.
2880 if not tr.get("sense"):
2881 tr["sense"] = par
2882 continue
2883 else:
2884 # See if we can find a parenthesized expression in the middle.
2885 # Romanizations are sometimes between word and gender marker,
2886 # e.g. wife/English/Tr/Yiddish.
2887 m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text)
2888 if m:
2889 par = m.group(1)
2890 text = text[: m.start()] + text[m.end() :]
2891 else:
2892 # No more parenthesized expressions - break out of the loop
2893 break
2895 # Some cleanup of artifacts that may result from skipping some templates
2896 # in earlier stages
2897 if par.startswith(": "): 2897 ↛ 2898line 2897 didn't jump to line 2898 because the condition on line 2897 was never true
2898 par = par[2:]
2899 if par.endswith(","): 2899 ↛ 2900line 2899 didn't jump to line 2900 because the condition on line 2899 was never true
2900 par = par[:-1]
2901 if re.match(r'^[“"]([^“”"]*)[“”"]$', par): 2901 ↛ 2902line 2901 didn't jump to line 2902 because the condition on line 2901 was never true
2902 par = par[1:-1]
2903 par = par.strip()
2905 # Check for special script pronunciation followed by romanization,
2906 # used in many Asian languages.
2907 lst = par.split(", ")
2908 if len(lst) == 2:
2909 a, r = lst
2910 if classify_desc(a) == "other":
2911 cls = classify_desc(r)
2912 # print("parse_translation_desc: r={} cls={}".format(r, cls))
2913 if cls == "romanization" or (
2914 cls == "english" and len(r.split()) == 1 and r[0].islower()
2915 ):
2916 if tr.get("alt") and tr.get("alt") != a: 2916 ↛ 2917line 2916 didn't jump to line 2917 because the condition on line 2916 was never true
2917 wxr.wtp.debug(
2918 'more than one value in "alt": {} vs. {}'.format(
2919 tr["alt"], a
2920 ),
2921 sortid="form_descriptions/1930",
2922 )
2923 tr["alt"] = a
2924 if tr.get("roman") and tr.get("roman") != r: 2924 ↛ 2925line 2924 didn't jump to line 2925 because the condition on line 2924 was never true
2925 wxr.wtp.debug(
2926 'more than one value in "roman": {} vs. {}'.format(
2927 tr["roman"], r
2928 ),
2929 sortid="form_descriptions/1936",
2930 )
2931 tr["roman"] = r
2932 continue
2934 # Check for certain comma-separated tags combined with English text
2935 # at the beginning or end of a comma-separated parenthesized list
2936 while len(lst) > 1:
2937 cls = classify_desc(lst[0])
2938 if cls == "tags": 2938 ↛ 2939line 2938 didn't jump to line 2939 because the condition on line 2938 was never true
2939 tagsets, topics = decode_tags(lst[0])
2940 for t in tagsets:
2941 data_extend(tr, "tags", t)
2942 data_extend(tr, "topics", topics)
2943 lst = lst[1:]
2944 continue
2945 cls = classify_desc(lst[-1])
2946 if cls == "tags":
2947 tagsets, topics = decode_tags(lst[-1])
2948 for t in tagsets:
2949 data_extend(tr, "tags", t)
2950 data_extend(tr, "topics", topics)
2951 lst = lst[:-1]
2952 continue
2953 break
2954 par = ", ".join(lst)
2956 if not par: 2956 ↛ 2957line 2956 didn't jump to line 2957 because the condition on line 2956 was never true
2957 continue
2958 if re.search(tr_ignored_parens_re, par): 2958 ↛ 2959line 2958 didn't jump to line 2959 because the condition on line 2958 was never true
2959 continue
2960 if par.startswith("numeral:"):
2961 par = par[8:].strip()
2963 # Classify the part in parenthesis and process accordingly
2964 cls = classify_desc(par)
2965 # print("parse_translation_desc classify: {!r} -> {}"
2966 # .format(par, cls))
2967 if par == text:
2968 pass
2969 if par == "f": 2969 ↛ 2970line 2969 didn't jump to line 2970 because the condition on line 2969 was never true
2970 data_append(tr, "tags", "feminine")
2971 elif par == "m": 2971 ↛ 2972line 2971 didn't jump to line 2972 because the condition on line 2971 was never true
2972 data_append(tr, "tags", "masculine")
2973 elif cls == "tags":
2974 tagsets, topics = decode_tags(par)
2975 for tags in tagsets:
2976 data_extend(tr, "tags", tags)
2977 data_extend(tr, "topics", topics)
2978 elif cls == "english":
2979 # If the text contains any of certain grammatical words, treat it
2980 # as a "note" instead of "english"
2981 if re.search(tr_note_re, par):
2982 if par.endswith(":"): 2982 ↛ 2983line 2982 didn't jump to line 2983 because the condition on line 2982 was never true
2983 par = par[:-1]
2984 if par not in ("see entry for forms",): 2984 ↛ 2859line 2984 didn't jump to line 2859 because the condition on line 2984 was always true
2985 if note: 2985 ↛ 2986line 2985 didn't jump to line 2986 because the condition on line 2985 was never true
2986 note = note + ";" + par
2987 else:
2988 note = par
2989 else:
2990 # There can be more than one parenthesized english item, see
2991 # e.g. Aunt/English/Translations/Tamil
2992 if "translation" in tr and "english" in tr:
2993 tr["english"] += "; " + par # DEPRECATED for "translation"
2994 tr["translation"] += "; " + par
2995 else:
2996 tr["english"] = par # DEPRECATED for "translation"
2997 tr["translation"] = par
2998 elif cls == "romanization":
2999 # print("roman text={!r} text cls={}"
3000 # .format(text, classify_desc(text)))
3001 if classify_desc(text) in (
3002 "english",
3003 "romanization",
3004 ) and lang not in ("Egyptian",):
3005 if beginning:
3006 restore_beginning += "({}) ".format(par)
3007 else:
3008 restore_end = " ({})".format(par) + restore_end
3009 else:
3010 if tr.get("roman"): 3010 ↛ 3011line 3010 didn't jump to line 3011 because the condition on line 3010 was never true
3011 wxr.wtp.debug(
3012 'more than one value in "roman": {} vs. {}'.format(
3013 tr["roman"], par
3014 ),
3015 sortid="form_descriptions/2013",
3016 )
3017 tr["roman"] = par
3018 elif cls == "taxonomic": 3018 ↛ 3019line 3018 didn't jump to line 3019 because the condition on line 3018 was never true
3019 if tr.get("taxonomic"):
3020 wxr.wtp.debug(
3021 'more than one value in "taxonomic": {} vs. {}'.format(
3022 tr["taxonomic"], par
3023 ),
3024 sortid="form_descriptions/2019",
3025 )
3026 if re.match(r"×[A-Z]", par):
3027 data_append(tr, "tags", "extinct")
3028 par = par[1:]
3029 tr["taxonomic"] = par
3030 elif cls == "other": 3030 ↛ 3040line 3030 didn't jump to line 3040 because the condition on line 3030 was always true
3031 if tr.get("alt"): 3031 ↛ 3032line 3031 didn't jump to line 3032 because the condition on line 3031 was never true
3032 wxr.wtp.debug(
3033 'more than one value in "alt": {} vs. {}'.format(
3034 tr["alt"], par
3035 ),
3036 sortid="form_descriptions/2028",
3037 )
3038 tr["alt"] = par
3039 else:
3040 wxr.wtp.debug(
3041 "parse_translation_desc unimplemented cls {}: {}".format(
3042 cls, par
3043 ),
3044 sortid="form_descriptions/2033",
3045 )
3047 # Check for gender indications in suffix
3048 text, final_tags = parse_head_final_tags(wxr, lang, text)
3049 data_extend(tr, "tags", final_tags)
3051 # Restore those parts that we did not want to remove (they are often
3052 # optional words or words that are always used with the given translation)
3053 text = restore_beginning + text + restore_end
3055 if note:
3056 tr["note"] = note.strip()
3057 if text and text not in ignored_translations:
3058 tr["word"] = text.strip()
3060 # Sometimes gender seems to be at the end of "roman" field, see e.g.
3061 # fire/English/Noun/Translations/Egyptian (for "oxidation reaction")
3062 roman = tr.get("roman")
3063 if roman:
3064 if roman.endswith(" f"): 3064 ↛ 3065line 3064 didn't jump to line 3065 because the condition on line 3064 was never true
3065 data_append(tr, "tags", "feminine")
3066 tr["roman"] = roman[:-2].strip()
3067 elif roman.endswith(" m"): 3067 ↛ 3068line 3067 didn't jump to line 3068 because the condition on line 3067 was never true
3068 data_append(tr, "tags", "masculine")
3069 tr["roman"] = roman[:-2].strip()
3071 # If the word now has "translation" field but no "roman" field, and
3072 # the word would be classified "other" (generally non-latin
3073 # characters), and the value in "translation" is only one lowercase
3074 # word, move it to "roman". This happens semi-frequently when the
3075 # translation is transliterated the same as some English word.
3076 roman = tr.get("roman")
3077 english = tr.get("translation")
3078 if english and not roman and "word" in tr:
3079 cls = classify_desc(tr["word"])
3080 if cls == "other" and " " not in english and english[0].islower():
3081 del tr["translation"]
3082 if "english" in tr: # DEPRECATED for "translation" 3082 ↛ 3084line 3082 didn't jump to line 3084 because the condition on line 3082 was always true
3083 del tr["english"]
3084 tr["roman"] = english
3086 # If the entry now has both tr["roman"] and tr["word"] and they have
3087 # the same value, delete tr["roman"] (e.g., man/English/Translations
3088 # Evenki)
3089 if tr.get("word") and tr.get("roman") == tr.get("word"): 3089 ↛ 3090line 3089 didn't jump to line 3090 because the condition on line 3089 was never true
3090 del tr["roman"]
def parse_alt_or_inflection_of(
    wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
    """Tries to parse an inflection-of or alt-of description. If successful,
    this returns (tags, alt-of/inflection-of-dict). If the description cannot
    be parsed, this returns None. This may also return (tags, None) when the
    gloss describes a form (or some other tags were extracted from it), but
    there was no alt-of/form-of/synonym-of word."""
    # Never interpret a gloss that is (almost) equal to the page title as a
    # tag (e.g., instrumental/Romanian, instrumental/Spanish).  Occasionally
    # inflection_of/alt_of have "A(n) " etc. at the beginning.
    title_lc = wxr.wtp.title.lower()  # type:ignore[union-attr]
    gloss_lc = gloss.lower()
    if gloss_lc == title_lc:
        return None
    if len(gloss) >= 5 and distw([gloss_lc], title_lc) < 0.2:
        return None

    # First attempt: parse the gloss exactly as given.
    result = parse_alt_or_inflection_of1(wxr, gloss, gloss_template_args)
    if result is not None:
        return result

    # Second attempt: if the gloss starts with an uppercase letter, retry
    # with that letter lowercased.
    if gloss and gloss[0].isupper():
        result = parse_alt_or_inflection_of1(
            wxr, gloss[0].lower() + gloss[1:], gloss_template_args
        )
    return result
# These tags are not allowed in alt-or-inflection-of parsing.
# Use a set literal instead of set([...]) — same contents, idiomatic form
# (avoids building a throwaway list; ruff C405).
alt_infl_disallowed: set[str] = {
    "error-unknown-tag",
    "place",  # Not in inflected forms and causes problems e.g. house/English
}
def parse_alt_or_inflection_of1(
    wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
    """Helper function for parse_alt_or_inflection_of.  This handles a single
    capitalization of the gloss.

    Returns (tags, list-of-AltOf-dicts) on success, (tags, None) when tags
    were extracted but no base word remained, or None when the gloss does
    not look like a form-of/alt-of description at all.
    """
    if not gloss or not gloss.strip():
        return None

    # Prevent some common errors where we would parse something we shouldn't
    if re.search(r"(?i)form of address ", gloss):
        return None

    # Strip leading usage restrictions such as "only used in X, " so the
    # remaining text can be parsed as a form description.
    gloss = re.sub(r"only used in [^,]+, ", "", gloss)

    # First try all formats ending with "of" (or other known last words that
    # can end a form description).  Matches are tried right-to-left so the
    # longest tag prefix is attempted first.
    matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss))
    m: Optional[re.Match]
    for m in reversed(matches):
        # ``desc`` is the candidate tag text (including the trailing "of"
        # etc.); ``base`` is what follows — the candidate linked word.
        desc = gloss[: m.end()].strip()
        base = gloss[m.end() :].strip()
        tagsets, topics = decode_tags(desc, no_unknown_starts=True)
        if not topics and any(
            not (alt_infl_disallowed & set(ts)) for ts in tagsets
        ):
            # Successfully parsed, including "of" etc.
            tags: list[str] = []
            # If you have ("Western-Armenian", ..., "form-of") as your
            # tag set, it's most probable that it's something like
            # "Western Armenian form of խոսել (xosel)", which should
            # get "alt-of" instead of "form-of" (inflection).
            # խօսիլ/Armenian
            for ts_t in tagsets:
                if "form-of" in ts_t and any(
                    valid_tags.get(tk) == "dialect" for tk in ts_t
                ):
                    # Dialect tag + "form-of" -> treat as "alt-of" instead.
                    ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"}
                else:
                    ts_s = set(ts_t)
                if not (alt_infl_disallowed & ts_s):
                    tags.extend(ts_s)
            if (
                "alt-of" in tags
                or "form-of" in tags
                or "synonym-of" in tags
                or "compound-of" in tags
            ):
                # A definitive relation tag was found; stop searching.
                break
        if m.group(1) == "of":
            # Try parsing without the final "of". This is commonly used in
            # various form-of expressions.
            desc = gloss[: m.start()]
            base = gloss[m.end() :]
            tagsets, topics = decode_tags(desc, no_unknown_starts=True)
            # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}"
            #       .format(desc, base, tagsets, topics))
            if not topics and any(
                not (alt_infl_disallowed & set(t)) for t in tagsets
            ):
                tags = []
                for t in tagsets:
                    if not (alt_infl_disallowed & set(t)):
                        tags.extend(t)
                # It must have at least one tag from form_of_tags
                if set(tags) & form_of_tags:
                    # Accept this as form-of
                    tags.append("form-of")
                    break
                if set(tags) & alt_of_tags:
                    # Accept this as alt-of
                    tags.append("alt-of")
                    break
    else:
        # for-else: no match produced a usable tag set.  Did not find a form
        # description based on last word; see if the whole description is tags
        tagsets, topics = decode_tags(gloss, no_unknown_starts=True)
        if not topics and any(
            not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts)
            for ts in tagsets
        ):
            tags = []
            for ts in tagsets:
                if not (alt_infl_disallowed & set(ts)) and form_of_tags & set(
                    ts
                ):
                    tags.extend(ts)
            # The whole gloss was consumed as tags; there is no base word.
            base = ""
        else:
            return None

    # kludge for Spanish (again): 'x of [word] combined with [clitic]'
    m = re.search(r"combined with \w+$", base)
    if m:
        tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True)
        if not topics:
            for ts in tagsets:
                tags.extend(ts)
            base = base[: m.start()]

    # It is fairly common for form_of glosses to end with something like
    # "ablative case" or "in instructive case". Parse that ending.
    base = base.strip()
    lst = base.split()
    # print("parse_alt_or_inflection_of: lst={}".format(lst))
    if len(lst) >= 3 and lst[-1] in ("case", "case."):
        node = valid_sequences.children.get(lst[-2])
        if node and node.end:
            for s in node.tags:
                tags.extend(s.split(" "))
            lst = lst[:-2]
            if lst[-1] == "in" and len(lst) > 1:
                lst = lst[:-1]

    # Eliminate empty and duplicate tags
    tags = sorted(set(t for t in tags if t))

    # Clean up some extra stuff from the linked word, separating the text
    # into ``base`` (the linked word) and ``extra`` (additional information,
    # such as English translation or clarifying word sense information).
    orig_base = base
    base = re.sub(alt_of_form_of_clean_re, "", orig_base)
    base = re.sub(r" [(⟨][^()]*[)⟩]", "", base)  # Remove all (...) groups
    extra = orig_base[len(base) :]
    extra = re.sub(r"^[- :;.,,—]+", "", extra)
    if extra.endswith(".") and extra.count(".") == 1:
        extra = extra[:-1].strip()
    # Unwrap ``extra`` from surrounding (…), ⟨…⟩, or quotation marks.
    m = re.match(r"^\(([^()]*)\)$", extra)
    if m:
        extra = m.group(1)
    else:
        # These weird backets used in "slash mark"
        m = re.match(r"^⟨([^()]*)⟩$", extra)
        if m:
            extra = m.group(1)
    m = re.match(r'^[“"]([^"“”]*)["”]$', extra)
    if m:
        extra = m.group(1)
    # Note: base might still contain comma-separated values and values
    # separated by "and"
    base = base.strip()
    if base.endswith(",") and len(base) > 2:
        base = base[:-1].strip()
    # Strip trailing dots unless the dotted form is a real page or came
    # verbatim from a form-of template argument.
    while (
        base.endswith(".")
        and not wxr.wtp.page_exists(base)
        and base not in gloss_template_args
    ):
        base = base[:-1].strip()
    if base.endswith('(\u201cconjecture")'):
        base = base[:-14].strip()
        tags.append("conjecture")
    while (
        base.endswith(".")
        and not wxr.wtp.page_exists(base)
        and base not in gloss_template_args
    ):
        base = base[:-1].strip()
    if (
        base.endswith(".")
        and base not in gloss_template_args
        and base[:-1] in gloss_template_args
    ):
        base = base[:-1]
    base = base.strip()
    if not base:
        return tags, None

    # Kludge: Spanish verb forms seem to have a dot added at the end.
    # Remove it; we know of no Spanish verbs ending with a dot.
    language = wxr.wtp.section
    pos = wxr.wtp.subsection
    # print("language={} pos={} base={}".format(language, pos, base))
    if (
        base.endswith(".")
        and len(base) > 1
        and base[-2].isalpha()
        and (language == "Spanish" and pos == "Verb")
    ):
        base = base[:-1]

    # Split base to alternatives when multiple alternatives provided
    parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "])
    titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "")
    if (
        len(parts) <= 1
        or base.startswith("/")
        or base.endswith("/")
        or "/" in titleword
    ):
        # Slashes are part of the word itself here — don't split on them.
        parts = [base]
    # Split base to alternatives when of form "a or b" and "a" and "b" are
    # similar (generally spelling variants of the same word or similar words)
    if len(parts) == 1:
        pp = base.split()
        if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4:
            parts = [pp[0], pp[2]]

    # Create form-of/alt-of entries based on the extracted data
    dt_lst: list[AltOf] = []
    for p in parts:
        # Check for some suspicious base forms
        m = re.search(r"[.,] |[{}()]", p)
        if m and not wxr.wtp.page_exists(p):
            wxr.wtp.debug(
                "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p),
                sortid="form_descriptions/2278",
            )
        # Drop a leading reconstruction asterisk (e.g., "*wordon").
        if p.startswith("*") and len(p) >= 3 and p[1].isalpha():
            p = p[1:]
        dt: AltOf = {"word": p}
        if extra:
            dt["extra"] = extra
        dt_lst.append(dt)
    # print("alt_or_infl_of returning tags={} lst={} base={!r}"
    #       .format(tags, lst, base))
    return tags, dt_lst
@functools.lru_cache(maxsize=65536)
def classify_desc(
    desc: str,
    allow_unknown_tags=False,
    no_unknown_starts=False,
    accepted: Union[tuple[str, ...], frozenset[str]] = tuple(),
) -> str:
    """Determines whether the given description is most likely tags, english,
    a romanization, or something else. Returns one of: "tags", "english",
    "romanization", "taxonomic", or "other". If ``allow_unknown_tags`` is
    True, then allow "tags" classification even when the only tags are those
    starting with a word in allowed_unknown_starts.  ``accepted`` is an
    extra set of tokens to treat as English words.  Results are memoized
    (lru_cache), so arguments must be hashable.
    """
    assert isinstance(desc, str)
    # Empty and whitespace-only strings are treated as "other"
    desc = desc.strip()
    if not desc:
        return "other"

    # NFKD-normalize so combining diacritics and compatibility characters
    # can be inspected character-by-character below.
    normalized_desc = unicodedata.normalize("NFKD", desc)

    # If it can be fully decoded as tags without errors, treat as tags
    tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts)
    for tagset in tagsets:
        assert isinstance(tagset, (list, tuple, set))
        if "error-unknown-tag" not in tagset and (
            topics or allow_unknown_tags or any(" " not in x for x in tagset)
        ):
            return "tags"

    # Check if it looks like the taxonomic name of a species
    if desc in known_species:
        return "taxonomic"
    # Strip the hybrid sign "×" before a capitalized genus, and anything
    # after a later "×", before checking word-by-word.
    desc1 = re.sub(r"^×([A-Z])", r"\1", desc)
    desc1 = re.sub(r"\s*×.*", "", desc1)
    lst = desc1.split()
    if len(lst) > 1 and len(lst) <= 5 and lst[0] in known_firsts:
        have_non_english = 1 if lst[0].lower() not in english_words else 0
        for x in lst[1:]:
            # Single letters / Roman numerals are allowed in taxonomic names.
            if x in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"):
                continue
            if x[0].isupper():
                break
            if x not in english_words:
                have_non_english += 1
        else:
            # Starts with known taxonomic term, does not contain uppercase
            # words (except allowed letters) and at least one word is not
            # English
            if have_non_english >= len(lst) - 1 and have_non_english > 0:
                return "taxonomic"

    # If all words are in our English dictionary, interpret as English.
    # [ -~] is regex black magic, "ALL CHARACTERS from space to tilde"
    # in ASCII. Took me a while to figure out.
    if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1:
        if desc in english_words and desc[0].isalpha():
            return "english"  # Handles ones containing whitespace
        desc1 = re.sub(
            tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc
        )
        tokens = tokenizer.tokenize(desc1)
        if not tokens:
            return "other"
        # For each token, decide whether it counts as an English word,
        # allowing common inflections (plural, -ing, -ed, possessives,
        # British -ise spellings) and hyphen/slash compounds.
        lst_bool = list(
            x not in not_english_words
            and
            # not x.isdigit() and
            (
                x in english_words
                or x.lower() in english_words
                or x in known_firsts
                or x[0].isdigit()
                or x in accepted
                or
                # (x[0].isupper() and x.find("-") < 0 and x.isascii()) or
                (
                    x.endswith("s") and len(x) >= 4 and x[:-1] in english_words
                )  # Plural
                or (
                    x.endswith("ies")
                    and len(x) >= 5
                    and x[:-3] + "y" in english_words
                )  # E.g. lily - lilies
                or (
                    x.endswith("ing")
                    and len(x) >= 5
                    and x[:-3] in english_words
                )  # E.g. bring - bringing
                or (
                    x.endswith("ing")
                    and len(x) >= 5
                    and x[:-3] + "e" in english_words
                )  # E.g., tone - toning
                or (
                    x.endswith("ed") and len(x) >= 5 and x[:-2] in english_words
                )  # E.g. hang - hanged
                or (
                    x.endswith("ed")
                    and len(x) >= 5
                    and x[:-2] + "e" in english_words
                )  # E.g. atone - atoned
                or (x.endswith("'s") and x[:-2] in english_words)
                or (x.endswith("s'") and x[:-2] in english_words)
                or (
                    x.endswith("ise")
                    and len(x) >= 5
                    and x[:-3] + "ize" in english_words
                )
                or (
                    x.endswith("ised")
                    and len(x) >= 6
                    and x[:-4] + "ized" in english_words
                )
                or (
                    x.endswith("ising")
                    and len(x) >= 7
                    and x[:-5] + "izing" in english_words
                )
                or (
                    re.search(r"[-/]", x)
                    and all(
                        ((y in english_words and len(y) > 2) or not y)
                        for y in re.split(r"[-/]", x)
                    )
                )
            )
            for x in tokens
        )
        cnt = lst_bool.count(True)
        rejected_words = tuple(
            x for i, x in enumerate(tokens) if not lst_bool[i]
        )
        # Accept as English if there is at least one real alphabetic English
        # token and either all tokens pass, at most one fails (with a token
        # longer than 3 chars passing), >= 80% pass, or >= 50% pass with all
        # rejected tokens in the "potentially English" list.
        if (
            any(
                lst_bool[i] and x[0].isalpha() and len(x) > 1
                for i, x in enumerate(tokens)
            )
            and not desc.startswith("-")
            and not desc.endswith("-")
            and re.search(r"\w+", desc)
            and (
                cnt == len(lst_bool)
                or (
                    any(
                        lst_bool[i] and len(x) > 3 for i, x in enumerate(tokens)
                    )
                    and cnt >= len(lst_bool) - 1
                )
                or cnt / len(lst_bool) >= 0.8
                or (
                    all(x in potentially_english_words for x in rejected_words)
                    and cnt / len(lst_bool) >= 0.50
                )
            )
        ):
            return "english"
    # Some translations have apparent pronunciation descriptions in /.../
    # which we'll put in the romanization field (even though they probably are
    # not exactly romanizations).
    if desc.startswith("/") and desc.endswith("/"):
        return "romanization"
    # If all characters are in classes that could occur in romanizations,
    # treat as romanization
    classes = list(
        unicodedata.category(x) if x not in ("-", ",", ":", "/", '"') else "OK"
        for x in normalized_desc
    )
    classes1 = []
    num_latin = 0
    num_greek = 0
    # part = ""
    # for ch, cl in zip(normalized_desc, classes):
    #     part += f"{ch}({cl})"
    # print(part)
    for ch, cl in zip(normalized_desc, classes):
        # Punctuation that is acceptable inside a romanization.
        if ch in (
            "'",  # ' in Arabic, / in IPA-like parenthesized forms
            ".",  # e.g., "..." in translations
            ";",
            ":",
            "!",
            "‘",
            "’",
            '"',
            "“",
            "”",
            "/",
            "?",
            "…",  # alternative to "..."
            "⁉",  # 見る/Japanese automatic transcriptions...
            "?",
            "!",
            "⁻",  # superscript -, used in some Cantonese roman, e.g. "we"
            "ʔ",
            "ʼ",
            "ʾ",
            "ʹ",
        ):  # ʹ e.g. in understand/English/verb Russian transl
            classes1.append("OK")
            continue
        if cl not in ("Ll", "Lu"):
            classes1.append(cl)
            continue
        # For letters, determine the script from the Unicode character name;
        # non-Latin scripts (other than limited Greek) disqualify the text
        # from being a romanization.
        try:
            name = unicodedata.name(ch)
            first = name.split()[0]
            if first == "LATIN":
                num_latin += 1
            elif first == "GREEK":
                num_greek += 1
            elif first == "COMBINING":  # Combining diacritic
                cl = "OK"
            elif re.match(non_latin_scripts_re, name):
                cl = "NO"  # Not acceptable in romanizations
        except ValueError:
            cl = "NO"  # Not acceptable in romanizations
        classes1.append(cl)
    # print("classify_desc: {!r} classes1: {}".format(desc, classes1))
    # print(set(classes1))
    if all(
        x in ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK")
        for x in classes1
    ):
        # Mostly-Latin text (with at most limited Greek) that is not purely
        # punctuation or digits counts as a romanization.
        if (
            (num_latin >= num_greek + 2 or num_greek == 0)
            and classes1.count("OK") < len(classes1)
            and classes1.count("Nd") < len(classes1)
        ):
            return "romanization"
    # Otherwise it is something else, such as hanji version of the word
    return "other"
def remove_text_in_parentheses(text: str) -> str:
    """Return ``text`` with every parenthesized span removed, including the
    parentheses themselves.  Nested parentheses are handled by tracking the
    nesting depth; only characters at depth zero are kept."""
    depth = 0
    kept: list[str] = []
    for ch in text:
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        elif depth == 0:
            kept.append(ch)
    return "".join(kept)