Coverage for src/wiktextract/extractor/en/form_descriptions.py: 79%
1415 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 06:55 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 06:55 +0000
1# Code for parsing linguistic form descriptions and tags for word senses
2# (both the word entry head - initial part and parenthesized parts -
3# and tags at the beginning of word senses)
4#
5# Copyright (c) 2020-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
7import functools
8import re
9import unicodedata
10from typing import (
11 Any,
12 Literal,
13 Optional,
14 Sequence,
15 Union,
16)
18import Levenshtein
19from nltk import TweetTokenizer # type:ignore[import-untyped]
20from wikitextprocessor.parser import WikiNode
22from ...datautils import data_append, data_extend, split_at_comma_semi
23from ...page import extract_links_from_node
24from ...tags import (
25 alt_of_tags,
26 form_of_tags,
27 head_final_bantu_langs,
28 head_final_bantu_map,
29 head_final_numeric_langs,
30 head_final_other_langs,
31 head_final_other_map,
32 head_final_semitic_langs,
33 head_final_semitic_map,
34 uppercase_tags,
35 valid_tags,
36 xlat_descs_map,
37 xlat_head_map,
38 xlat_tags_map,
39)
40from ...topics import topic_generalize_map, valid_topics
41from ...wxr_context import WiktextractContext
42from .english_words import (
43 english_words,
44 not_english_words,
45 potentially_english_words,
46)
47from .form_descriptions_known_firsts import known_firsts
48from .taxondata import known_species
49from .type_utils import (
50 AltOf,
51 FormData,
52 LinkageData,
53 SenseData,
54 SoundData,
55 TemplateData,
56 TranslationData,
57 WordData,
58)
# Shared module-level TweetTokenizer instance. Per the original note it is
# used by classify_desc() (not visible in this part of the file).
tokenizer = TweetTokenizer()
# Values that are ignored when they appear as the value of a related form in
# a form head: dash/hyphen look-alikes and "nothing here" placeholders.
IGNORED_RELATED: set[str] = {
    "-",
    "־",
    "᠆",
    "‐",
    "‑",
    "‒",
    "–",
    "—",
    "―",
    "−",
    "⸺",
    "⸻",
    "﹘",
    "﹣",
    # NOTE(review): as seen here this entry is identical to the first one;
    # presumably it was a different hyphen-like code point — verify.
    "-",
    "?",
    "(none)",
}
# First words of unicodedata.name() that indicate scripts that cannot be
# accepted in romanizations or english (i.e., should be considered "other"
# in classify_desc()).  Some entries span several words (e.g. "OLD PERSIAN").
non_latin_scripts: list[str] = [
    "ADLAM",
    "ARABIC",
    "ARABIC-INDIC",
    "ARMENIAN",
    "BALINESE",
    "BENGALI",
    "BRAHMI",
    "BRAILLE",
    "CANADIAN",
    "CHAKMA",
    "CHAM",
    "CHEROKEE",
    "CJK",
    "COPTIC",
    "COUNTING ROD",
    "CUNEIFORM",
    "CYRILLIC",
    "DOUBLE-STRUCK",
    "EGYPTIAN",
    "ETHIOPIC",
    "EXTENDED ARABIC-INDIC",
    "GEORGIAN",
    "GLAGOLITIC",
    "GOTHIC",
    "GREEK",
    "GUJARATI",
    "GURMUKHI",
    "HANGUL",
    "HANIFI ROHINGYA",
    "HEBREW",
    "HIRAGANA",
    "JAVANESE",
    "KANNADA",
    "KATAKANA",
    "KAYAH LI",
    "KHMER",
    "KHUDAWADI",
    "LAO",
    "LEPCHA",
    "LIMBU",
    "MALAYALAM",
    "MEETEI",
    "MYANMAR",
    "NEW TAI LUE",
    "NKO",
    "OL CHIKI",
    "OLD PERSIAN",
    "OLD SOUTH ARABIAN",
    "ORIYA",
    "OSMANYA",
    "PHOENICIAN",
    "SAURASHTRA",
    "SHARADA",
    "SINHALA",
    "SUNDANESE",
    "SYLOTI",
    "TAI THAM",
    "TAKRI",
    "TAMIL",
    "TELUGU",
    "THAANA",
    "THAI",
    "TIBETAN",
    "TIFINAGH",
    "TIRHUTA",
    "UGARITIC",
    "WARANG CITI",
    "YI",
]
# Matches any one of the script names above followed by a word boundary.
# The pattern itself is unanchored; anchoring is up to the caller
# (re.match vs re.search).
non_latin_scripts_re = re.compile(
    r"(" + r"|".join(re.escape(x) for x in non_latin_scripts) + r")\b"
)
# Import-time sanity check: every tag named in an xlat_head_map value must be
# present in valid_tags.  A single leading "?" on a value is stripped before
# the value is split into tags.  Problems are only reported, never raised.
for k, v in xlat_head_map.items():
    v = v.removeprefix("?")
    for tag in v.split():
        if tag in valid_tags:
            continue
        print(
            f"WARNING: xlat_head_map[{k}] contains"
            f" unrecognized tag {tag}"
        )
# Regexp for finding nested translations inside translation items (used in,
# e.g., year/English/Translations/Arabic).  According to the original note
# this is actually consumed by page.py.
#
# Shape: whitespace, then "(<head tags>: <text>)" where <head tags> is one of
# the non-empty xlat_head_map values (leading "?" stripped, values starting
# with "class-" excluded, longest first) and <text> may contain one level of
# nested parentheses.
nested_translations_re = re.compile(
    r"\s+\((("
    + "|".join(
        re.escape(x.removeprefix("?"))
        for x in sorted(xlat_head_map.values(), key=len, reverse=True)
        if x and not x.startswith("class-")
    )
    + r"): ([^()]|\([^()]+\))+)\)"
)
# Regexp text matching head tag specifiers: an optional " -" followed by one
# or more space-separated xlat_head_map keys.  Keys are tried longest first so
# that the longer of two keys sharing a prefix is preferred.  Used to match
# tags at the end of translations and linkages.
head_final_re_text = (
    r"( -)?( ("
    + "|".join(map(re.escape, sorted(xlat_head_map, key=len, reverse=True)))
    + r"))+"
)
# Same pattern, anchored at the end of the string.
head_final_re = re.compile(head_final_re_text + r"$")
# Head tag specifiers at the end of a form for certain Bantu languages
# (particularly Swahili and similar languages): a space followed by one of
# the head_final_bantu_map keys.
head_final_bantu_re_text = (
    " (" + "|".join(map(re.escape, head_final_bantu_map)) + ")"
)
head_final_bantu_re = re.compile(head_final_bantu_re_text + "$")

# The same for certain Semitic languages (particularly Arabic and similar
# languages), built from head_final_semitic_map.
head_final_semitic_re_text = (
    " (" + "|".join(map(re.escape, head_final_semitic_map)) + ")"
)
head_final_semitic_re = re.compile(head_final_semitic_re_text + "$")

# The same for certain other languages (e.g., Lithuanian, Finnish, French),
# built from head_final_other_map.
head_final_other_re_text = (
    " (" + "|".join(map(re.escape, head_final_other_map)) + ")"
)
head_final_other_re = re.compile(head_final_other_re_text + "$")
# Regexps for splitting heads.  See parse_word_head().
# Part 1 is an (unclosed) group offering every head-final tag pattern as an
# alternative; the two variants below close it, make it optional and append
# the actual separators.
head_split_re_text_part_1 = "(" + "|".join(
    (
        head_final_re_text,
        head_final_bantu_re_text,
        head_final_semitic_re_text,
        head_final_other_re_text,
    )
)

# Separators: " or ", runs of commas/semicolons, or trailing spaces at the end.
head_split_re_text = head_split_re_text_part_1 + ")?( or |[,;]+| *$)"
# Same, but semicolons are not treated as separators.
head_split_re_text_no_semicolon = head_split_re_text_part_1 + ")?( or |,+| *$)"

head_split_re = re.compile(head_split_re_text)
head_split_no_semicolon_re = re.compile(head_split_re_text_no_semicolon)

# Count of "(" characters in head_split_re_text found in runs that are not
# directly preceded by a backslash (presumably the number of capturing
# groups, needed to interpret re.split() output — confirm at the use site).
head_split_re_parens = sum(
    m.group(0).count("(")
    for m in re.finditer(r"(^|[^\\])[(]+", head_split_re_text)
)
# Parenthesized parts that are ignored in translations
tr_ignored_parens: set[str] = set(
    [
        "please verify",
        "(please verify)",
        "transliteration needed",
        "(transliteration needed)",
        "in words with back vowel harmony",
        "(in words with back vowel harmony)",
        "in words with front vowel harmony",
        "(in words with front vowel harmony)",
        "see below",
        "see usage notes below",
    ]
)
# Matches a parenthesized part that should be ignored: either exactly one of
# the strings in tr_ignored_parens, or anything starting with one of the
# listed prefixes.
#
# The prefix alternatives used to be written as two adjacent string literals
# without a "|" between them, which silently fused "for other meanings see "
# and "lit\. " into one alternative that could only match the literal text
# "for other meanings see lit. ".  They are separate alternatives now.
# The set is sorted only to make the compiled pattern deterministic.
tr_ignored_parens_re = re.compile(
    r"^("
    + "|".join(re.escape(x) for x in sorted(tr_ignored_parens))
    + ")$"
    + r"|^(Can we clean up|Can we verify|for other meanings see |lit\. )"
)
# Placeholder strings that are not real translations and are ignored.
ignored_translations: set[str] = {
    "[script needed]",
    "please add this translation if you can",
}
# Put english text into the "note" field in a translation if it contains one
# of these words.
#
# The pattern has two halves:
#   1. a word or phrase from the first list, starting at a word boundary and
#      followed by end of string, ")" or a space;
#   2. one of the words in the second list at the very beginning of the
#      string, followed by a space.
tr_note_re = re.compile(
    r"(\b(article|definite|indefinite|superlative|comparative|pattern|"
    r"adjective|adjectives|clause|clauses|pronoun|pronouns|preposition|prep|"
    r"postposition|postp|action|actions|articles|"
    r"adverb|adverbs|noun|nouns|verb|verbs|before|"
    r"after|placed|prefix|suffix|used with|translated|"
    r"nominative|genitive|dative|infinitive|participle|past|perfect|imperfect|"
    r"perfective|imperfective|auxiliary|negative|future|present|tense|aspect|"
    r"conjugation|declension|class|category|plural|singular|positive|"
    r"seldom used|formal|informal|familiar|unspoken|spoken|written|"
    r"indicative|progressive|conditional|potential|"
    r"accusative|adessive|inessive|superessive|elative|allative|"
    r"dialect|dialects|object|subject|predicate|movies|recommended|language|"
    r"locative|continuous|simple|continuousness|gerund|subjunctive|"
    r"periphrastically|no equivalent|not used|not always used|"
    r"used only with|not applicable|use the|signifying|wordplay|pronounced|"
    r"preconsonantal|spelled|spelling|respelling|respellings|phonetic|"
    r"may be replaced|stricter sense|for nonhumans|"
    r"sense:|used:|in full:|informally used|followed by|"
    r"not restricted to|pertaining to|or optionally with|are optional|"
    r"in conjunction with|in compounds|depending on the relationship|"
    r"person addressed|one person|multiple persons|may be replaced with|"
    r"optionally completed with|in the phrase|in response to|"
    r"before a|before an|preceded by|verbs ending|very common|after a verb|"
    r"with verb|with uncountable|with the objects|with stative|"
    r"can be replaced by|often after|used before|used after|"
    r"used in|clipping of|spoken|somewhat|capitalized|"
    r"short form|shortening of|shortened form|initialism of|"
    r"said to|rare:|rarer also|is rarer|negatively connoted|"
    r"previously mentioned|uncountable noun|countable noun|"
    r"countable nouns|uncountable nouns|"
    r"with predicative|with -|with imperfect|with a negated|"
    r"colloquial|misspelling|holophrastic|frequently|esp\.|especially|"
    r'"|'
    r"general term|after a vowel|before a vowel|"
    r"form|regular|irregular|alternative)"
    r")($|[) ])|^("
    # Following are only matched at the beginning of the string
    r"pl|pl\.|see:|pl:|sg:|plurals:|e\.g\.|e\.g\.:"
    r"|e\.g\.,|cf\.|compare|such as|"
    r"see|only|often|usually|used|usage:|of|not|in|compare|usu\.|"
    r"as|about|abbrv\.|abbreviation|abbr\.|that:|optionally|"
    r"mainly|from|for|also|also:|acronym|"
    r"\+|with) "
)
# The first half ends with an explicit ($|[) ]) instead of \b.  Original note:
# "\b does not work at the end???" — presumably because several alternatives
# end in a non-word character (e.g. "sense:", "with -", "esp.").
# Related forms matching this regexp will be considered suspicious if the
# page title does not also match one of these.
# Three alternatives:
#   - one of the listed abbreviations (f, m, n, pl, sg, ...) or "or" standing
#     alone as a space-delimited word;
#   - any of the characters ] [ : = < > & # * |
#   - a space followed by digits at the very end.
suspicious_related_re = re.compile(
    r"(^| )(f|m|n|c|or|pl|sg|inan|anim|pers|anml|impf|pf|vir|nvir)( |$)"
    r"|[][:=<>&#*|]"
    r"| \d+$"
)
# Word forms (head forms, translations, etc) that are known to be fine and
# are silently accepted even though they would otherwise trigger a
# suspicious-form warning.
ok_suspicious_forms: set[str] = {
    "but en or",  # "golden goal"/English/Tr/French
    "cœur en or",  # "heart of gold"/Eng/Tr/French
    "en or",  # golden/Eng/Tr/French
    "men du",  # jet/Etym2/Noun/Tr/Cornish
    "parachute en or",  # "golden parachute"/Eng/Tr/French
    "vieil or",  # "old gold"/Eng/Tr/French
    # "all that glitters is not gold"/Eng/Tr/French
    "tout ce qui brille n’est pas or",
    "μη αποκλειστικό or",  # inclusive or/Eng/Tr/Greek
    "period or full stop",
}
# Replacements to be done in classify_desc before tokenizing.  This is a
# workaround for shortcomings in TweetTokenizer.
tokenizer_fixup_map = {
    "a.m.": "AM",
    "p.m.": "PM",
}
# Matches any key of tokenizer_fixup_map that starts at a word boundary;
# longer keys are tried first.
tokenizer_fixup_re = re.compile(
    r"\b("
    + "|".join(
        map(re.escape, sorted(tokenizer_fixup_map, key=len, reverse=True))
    )
    + r")"
)
# Unknown tags starting with these words will be silently ignored.
ignored_unknown_starts: set[str] = {
    "originally",
    "e.g.",
    "c.f.",
    "supplanted by",
    "supplied by",
}

# Matches one of the starts above at the beginning of a string, followed by
# a space.  Longer starts are tried first.
ignored_unknown_starts_re = re.compile(
    r"^("
    + "|".join(
        map(re.escape, sorted(ignored_unknown_starts, key=len, reverse=True))
    )
    + ") "
)
# If an unknown sequence starts with one of these, it will continue as an
# unknown sequence until the end, unless it turns out to have a replacement.
allowed_unknown_starts: set[str] = {
    "Relating",
    "accompanied",
    "added",
    "after",
    "answering",
    "as",
    "based",
    "before",
    "conjugated",
    "conjunction",
    "construed",
    "especially",
    "expression:",
    "figurative:",
    "followed",
    "for",
    "forms",
    "from",
    "governs",
    "in",
    "indicating",
    "modifying",
    "normally",
    "not",
    "of",
    "preceding",
    "prefixed",
    "referring",
    "relating",
    "revived",
    "said",
    "since",
    "takes",
    "used",
    "with",
    "With",
    "without",
}
# The ignored unknown starts are allowed too, so they do not cause complaints.
allowed_unknown_starts |= ignored_unknown_starts
# Full unknown tags that will be ignored in decode_tags()
# XXX this is unused, ask Tatu where the contents is now
# (Currently empty, so the membership test against it never succeeds.)
ignored_unknown_tags: set[str] = set()
# Head endings that are mapped to tags: " 1st conj." -> "conjugation-1"
# through " 7th conj." -> "conjugation-7" (note the leading space in keys).
head_end_map = {
    f" {ordinal} conj.": f"conjugation-{number}"
    for number, ordinal in enumerate(
        ("1st", "2nd", "3rd", "4th", "5th", "6th", "7th"), start=1
    )
}
# Matches one of the endings above at the end of a string.
head_end_re = re.compile(
    "(" + "|".join(map(re.escape, head_end_map)) + ")$"
)
# Dictionary of language-specific parenthesized head part starts that
# either introduce new tags or modify previous tags.  The value for each
# language is a dictionary that maps the first word of the head part to
# (rem_tags, add_tags), where ``rem_tags`` can be True to remove all previous
# tags or a space-separated string of tags to remove, and ``add_tags`` should
# be a string of tags to add.
# Only Danish has entries at present; none of them uses the ``True`` form.
lang_specific_head_map: dict[
    str, dict[str, Union[tuple[str, str], tuple[Literal[True], str]]]
] = {
    "Danish": {
        # prefix: (rem_tags space separate string/True, add_tags s-sep str)
        "c": ("neuter", "common-gender"),
        "n": ("common-gender", "neuter"),
        "pl": ("singular neuter common-gender", "plural"),
        "sg": ("plural neuter common-gender", "singular"),
    },
}
# Regular expression used to strip additional stuff from the end of alt_of and
# form_of.  It matches the first occurrence of any marker below together with
# everything after it (DOTALL, so newlines are included); substituting the
# match with "" leaves only the text before the marker.
#
# Fixes relative to the earlier version of this table:
#   * ". — " had an unescaped dot, i.e. "any character, then ' — '".  That
#     matched one character earlier than the plain " — " marker and therefore
#     also removed the last character of the text being kept.  The dot is now
#     escaped so the marker means a literal period before the dash.
#   * "\.^" could never match ("^" is an anchor, not a caret).  It is now
#     "\.\^", a literal period followed by a caret (presumably the intent —
#     verify against real data).
#   * a second, redundant copy of "\. Also " was dropped.
alt_of_form_of_clean_re = re.compile(
    r"(?s)("
    + "|".join(
        [
            r":",
            r'[“"]',
            r";",
            r" \(",
            r" - ",
            r" ־ ",
            r" ᠆ ",
            r" ‐ ",
            r" ‑ ",
            r" ‒ ",
            r" – ",
            r" — ",
            r" ― ",
            r" − ",
            r" ⸺ ",
            r" ⸻ ",
            r" ﹘ ",
            r" ﹣ ",
            # NOTE(review): as seen here this repeats the first dash entry;
            # presumably it was a different hyphen-like code point — verify.
            r" - ",
            r" \+ ",
            r" \(with ",
            r" with -ra/-re",
            r"\. Used ",
            r"\. Also ",
            r"\. Since ",
            r"\. A ",
            r"\.\. A ",
            r"\. An ",
            r"\.\. An ",
            r"\. an ",
            r"\. The ",
            r"\. Spanish ",
            r"\. Language ",
            r"\. former name of ",
            r"\. AIM",
            r"\. OT",
            r"\. Not ",
            r"\. Now ",
            r"\. Nowadays ",
            r"\. Early ",
            r"\. ASEAN",
            r"\. UN",
            r"\. IMF",
            r"\. WHO",
            r"\. WIPO",
            r"\. AC",
            r"\. DC",
            r"\. DNA",
            r"\. RNA",
            r"\. SOB",
            r"\. IMO",
            r"\. Behavior",
            r"\. Income ",
            r"\. More ",
            r"\. Most ",
            r"\. Only ",
            r"\. From ",
            r"\. Of ",
            r"\.\. Of ",
            r"\. To ",
            r"\. For ",
            r"\. If ",
            r"\. Praenominal ",
            r"\. This ",
            r"\. Replaced ",
            r"\. CHCS is the ",
            r"\. Equivalent ",
            r"\. Initialism ",
            r"\. Note ",
            r"\. Alternative ",
            r"\. Compare ",
            r"\. Cf\. ",
            r"\. Comparable ",
            r"\. Involves ",
            r"\. Sometimes ",
            r"\. Commonly ",
            r"\. Often ",
            r"\. Typically ",
            r"\. Possibly ",
            r"\. Although ",
            r"\. Rare ",
            r"\. Instead ",
            r"\. Integrated ",
            r"\. Distinguished ",
            r"\. Given ",
            r"\. Found ",
            r"\. Was ",
            r"\. In ",
            r"\. It ",
            r"\.\. It ",
            r"\. One ",
            r"\. Any ",
            r"\. They ",
            r"\. Members ",
            r"\. Each ",
            r"\. Original ",
            r"\. Especially ",
            r"\. Usually ",
            r"\. Known ",
            r"\.\. Known ",
            r"\. See ",
            r"\. see ",
            r"\. target was not ",
            r"\. Popular ",
            r"\. Pedantic ",
            r"\. Positive ",
            r"\. Society ",
            r"\. Plan ",
            r"\. Environmentally ",
            r"\. Affording ",
            r"\. Encompasses ",
            r"\. Expresses ",
            r"\. Indicates ",
            r"\. Text ",
            r"\. Large ",
            r"\. Sub-sorting ",
            r"\. Sax",
            r"\. First-person ",
            r"\. Second-person ",
            r"\. Third-person ",
            r"\. 1st ",
            r"\. 2nd ",
            r"\. 3rd ",
            r"\. Term ",
            r"\. Northeastern ",
            r"\. Northwestern ",
            r"\. Southeast ",
            r"\. Egyptian ",
            r"\. English ",
            r"\. Cape Province was split into ",
            r"\. Pañcat",
            r"\. of the ",
            r"\. is ",
            r"\. after ",
            r"\. or ",
            r"\. chromed",
            r"\. percussion",
            r"\. with his ",
            r"\. a\.k\.a\. ",
            r"\. comparative form ",
            r"\. singular ",
            r"\. plural ",
            r"\. present ",
            r"\. his ",
            r"\. her ",
            r"\. equivalent ",
            r"\. measuring ",
            r"\. used in ",
            r"\. cutely ",
            r"\. Protects",
            r'\. "',
            r"\.\^",  # was r"\.^", which can never match
            r"\. \+ ",
            r"\., ",
            r"\. — ",  # was r". — " (unescaped dot ate one extra character)
            r", a ",
            r", an ",
            r", the ",
            r", obsolete ",
            r", possessed",  # 'd/English
            r", imitating",  # 1/English
            r", derived from",
            r", called ",
            r", especially ",
            r", slang for ",
            r", used to",  # c/o /English
            r", commonly",  # b/w /English
            r" corresponding to ",
            r" equivalent to ",
            r" popularized by ",
            r" denoting ",
            r" in its various senses\.",
            r" used by ",
            r" but not for ",
            r" since ",
            r" i\.e\. ",
            r" i\. e\. ",
            r" e\.g\. ",
            r" eg\. ",
            r" etc\. ",
            r"\[http",
            r" — used as ",
            r" by K\. Forsyth ",
            r" by J\. R\. Allen ",
            r" by S\. Ferguson ",
            r" by G\. Donaldson ",
            r" May refer to ",
            r" An area or region ",
        ]
    )
    + r").*$"
)
class ValidNode:
    """One node of the ``valid_sequences`` word tree.

    The tree stores multi-word sequences taken from key->tags maps such as
    ``xlat_tags_map``.  A node does not store its own word: the word is the
    key under which the node is registered in its parent's ``children`` dict.

    Attributes:
        end: True when a sequence terminates at this node.  A terminal node
            may still have children when a longer sequence shares the prefix
            (e.g. "nominative" and "nominative plural").
        tags: list of tag strings collected for sequences ending here.
        topics: list of topic strings collected for sequences ending here.
        children: mapping from the next word to the child node.
    """

    __slots__ = ("end", "tags", "topics", "children")

    def __init__(
        self,
        end=False,
        tags: Optional[list[str]] = None,
        topics: Optional[list[str]] = None,
        children: Optional[dict[str, "ValidNode"]] = None,
    ) -> None:
        self.end = end
        # A falsy argument (None or an empty container) is replaced by a
        # fresh container, so nodes never share an empty default.
        self.tags: list[str] = tags if tags else []
        self.topics: list[str] = topics if topics else []
        self.children: dict[str, "ValidNode"] = children if children else {}
def add_to_valid_tree(tree: ValidNode, desc: str, v: Optional[str]) -> None:
    """Register the sequence ``desc`` in ``tree`` and attach its tags/topics.

    ``desc`` is split on single spaces; one node per word is looked up or
    created below ``tree`` and the last node is marked as a sequence end.
    When ``v`` is non-empty it is split on whitespace: words found in
    ``valid_tags`` are joined into one tag string, words found in
    ``valid_topics`` into one topic string, and each non-empty string is
    appended to the end node.  Anything else only produces a printed warning.
    Used during module initialization.
    """
    assert isinstance(tree, ValidNode)
    assert isinstance(desc, str)
    assert v is None or isinstance(v, str)

    # Walk down the tree, creating missing nodes on the way.
    node = tree
    for word in desc.split(" "):
        child = node.children.get(word)
        if child is None:
            child = ValidNode()
            node.children[word] = child
        node = child
    node.end = True

    if not v:
        return None  # Nothing to attach to the end node

    tag_words: list[str] = []
    topic_words: list[str] = []
    for item in v.split():
        if item in valid_tags:
            tag_words.append(item)
        elif item in valid_topics:
            topic_words.append(item)
        else:
            print(f"WARNING: tag/topic {desc!r} maps to unknown {item!r}")

    # Each call contributes at most one space-joined string per list.
    if topic_words:
        node.topics.append(" ".join(topic_words))
    if tag_words:
        node.tags.append(" ".join(tag_words))
def add_to_valid_tree1(
    tree: ValidNode,
    k: str,
    v: Union[list[str], tuple[str, ...], str],
    valid_values: Union[set[str], dict[str, Any]],
) -> list[str]:
    """Add key ``k`` to ``tree`` once for every value string in ``v``.

    ``v`` may be a single string or a list/tuple of strings; each string is
    passed to add_to_valid_tree() as the tags/topics for ``k``.  A falsy ``v``
    registers ``k`` without any tags.

    The sequences are now added to the ``tree`` argument.  Previously the
    argument was ignored and the module-level ``valid_sequences`` was always
    used, which only worked because every caller passed that same object.

    ``valid_values`` is type-checked but otherwise unused here.

    Returns the individual whitespace-separated words of all values, in
    order (empty list when ``v`` is falsy).
    """
    assert isinstance(tree, ValidNode)
    assert isinstance(k, str)
    assert v is None or isinstance(v, (list, tuple, str))
    assert isinstance(valid_values, (set, dict))
    if not v:
        add_to_valid_tree(tree, k, None)
        return []
    if isinstance(v, str):
        v = [v]
    q: list[str] = []
    for vv in v:
        assert isinstance(vv, str)
        add_to_valid_tree(tree, k, vv)
        # return each individual tag
        q.extend(vv.split())
    return q
def add_to_valid_tree_mapping(
    tree: ValidNode,
    mapping: Union[dict[str, Union[list[str], str]], dict[str, str]],
    valid_values: Union[set[str], dict[str, Any]],
    recurse: bool,
) -> None:
    """Add every key of ``mapping`` to ``tree`` with its mapped value(s).

    With ``recurse`` true, each word of a key's values that is itself a key of
    ``mapping`` is expanded as well, and the expansion's values are also
    registered under the original key.  Every word is expanded at most once
    per key, so cyclic mappings terminate.
    """
    assert isinstance(tree, ValidNode)
    assert isinstance(mapping, dict)
    assert isinstance(valid_values, (set, dict))
    assert recurse in (True, False)
    for k, v in mapping.items():
        assert isinstance(k, str)
        assert isinstance(v, (list, str))
        values = [v] if isinstance(v, str) else v
        pending = add_to_valid_tree1(tree, k, values, valid_values)
        if not recurse:
            continue
        seen = set()
        while pending:
            word = pending.pop()
            if word in seen:
                continue
            seen.add(word)
            if word in mapping:
                pending.extend(
                    add_to_valid_tree1(tree, k, mapping[word], valid_values)
                )
# Tree of sequences considered to be tags (includes sequences that are
# mapped to something that becomes one or more valid tags).
valid_sequences = ValidNode()
# Every registered sequence that contains a "/" is also remembered here.
sequences_with_slashes: set[str] = set()

# The basic tags used in our tag system; some are a bit weird, but it is
# easier to accept 'false' positives than to filter out stuff no one else
# uses.  Each tag maps to itself.
for tag in valid_tags:
    add_to_valid_tree(valid_sequences, tag, tag)
    if "/" in tag:
        sequences_with_slashes.add(tag)

# Uppercase tags map to themselves with whitespace runs replaced by hyphens.
for tag in uppercase_tags:
    hyphenated = re.sub(r"\s+", "-", tag)
    add_to_valid_tree(valid_sequences, tag, hyphenated)
    if "/" in tag:
        sequences_with_slashes.add(tag)

# xlat_tags_map!
add_to_valid_tree_mapping(valid_sequences, xlat_tags_map, valid_tags, False)
for k in xlat_tags_map:
    if "/" in k:
        sequences_with_slashes.add(k)

# Add topics to the same table, with all generalized topics also added
for topic in valid_topics:
    assert " " not in topic
    add_to_valid_tree(valid_sequences, topic, topic)
    if "/" in topic:
        sequences_with_slashes.add(topic)

# Let each original topic value stand alone.  These are not generally on
# valid_topics.  We add the original topics with spaces replaced by hyphens.
for topic in topic_generalize_map:
    hyphenated = re.sub(r"\s+", "-", topic)
    add_to_valid_tree(valid_sequences, topic, hyphenated)
    if "/" in topic:
        sequences_with_slashes.add(topic)

# Add canonicalized/generalized topic values
add_to_valid_tree_mapping(
    valid_sequences, topic_generalize_map, valid_topics, True
)
# Regex used to divide a decode candidate into parts that shouldn't
# have their slashes turned into spaces.  It has exactly one capturing group,
# so re.split() puts the protected sequences at the odd indices.
#
# The alternatives come from a set, so they are sorted explicitly: longest
# first (so a longer sequence wins over a shorter one that is its prefix,
# like the sort used for head_final_re_text) and then alphabetically, which
# makes the compiled pattern independent of set iteration order.
slashes_re = re.compile(
    r"("
    + "|".join(
        re.escape(s)
        for s in sorted(sequences_with_slashes, key=lambda s: (-len(s), s))
    )
    + r")"
)
# Regexp used to find "words" from word heads and linguistic descriptions.
# Alternatives, tried in this order:
#   1. a run of characters other than space , ; ( ) and U+200E;
#   2. a parenthesized run of such characters directly followed by another
#      such run, e.g. "(a)b";
#   3. a single character in U+2800..U+28FF (Braille);
#   4. a parenthesized group that may contain one level of nested parentheses.
word_pattern = (
    r"[^ ,;()\u200e]+|"
    r"\([^ ,;()\u200e]+\)[^ ,;()\u200e]+|"
    r"[\u2800-\u28ff]|"  # Braille characters
    r"\(([^()]|\([^()]*\))*\)"
)

# Compiled form of word_pattern.
word_re_global = re.compile(word_pattern)
def distw(titleparts: Sequence[str], word: str) -> float:
    """Compute how distinct ``word`` is from the most similar title part.

    For every entry of ``titleparts`` the Levenshtein distance to ``word`` is
    divided by the length of the longer of the two strings; the smallest such
    ratio is returned.  The result is 1 if the words are completely distinct,
    0 if ``word`` is identical to some title part, and something in between
    otherwise.

    Edge cases that used to raise are now defined:
      * empty ``titleparts`` -> 1.0 (there is nothing to be similar to;
        previously ``min()`` raised ValueError);
      * ``word`` and a title part both empty -> 0.0 (identical; previously
        ZeroDivisionError).
    """
    assert isinstance(titleparts, (list, tuple))
    assert isinstance(word, str)
    # No ratio can exceed 1.0, so starting there does not change the minimum.
    best = 1.0
    for tw in titleparts:
        longest = max(len(tw), len(word))
        if longest == 0:
            return 0.0  # two empty strings are identical
        best = min(best, Levenshtein.distance(word, tw) / longest)
    return best
874def map_with(
875 ht: dict[str, str | list[str]] | dict[str, str],
876 lst: Sequence[str],
877) -> list[str]:
878 """Takes alternatives from ``lst``, maps them using ``ht`` to zero or
879 more alternatives each, and returns a combined list of alternatives."""
880 assert isinstance(ht, dict)
881 assert isinstance(lst, (list, tuple))
882 ret = []
883 for x in lst:
884 assert isinstance(x, str)
885 x = x.strip()
886 x = ht.get(x, x)
887 if isinstance(x, str): 887 ↛ 890line 887 didn't jump to line 890 because the condition on line 887 was always true
888 if x: 888 ↛ 883line 888 didn't jump to line 883 because the condition on line 888 was always true
889 ret.append(x)
890 elif isinstance(x, (list, tuple)):
891 ret.extend(x)
892 else:
893 raise RuntimeError("map_with unexpected value: {!r}".format(x))
894 return ret
# A list of tag (or topic) strings.
TagList = list[str]
# One step of a candidate decoding path: (word index, tags, topics).  In the
# steps built by check_unknown() and add_new1() the first list holds tags and
# the second one topics.
PosPathStep = tuple[int, TagList, TagList]
def check_unknown(
    from_i: int,
    to_i: int,
    i: int,
    wordlst: Sequence[str],
    allow_any: bool,
    no_unknown_starts: bool,
) -> list[PosPathStep]:
    """Classify the already-presumed-unknown words ``wordlst[from_i:to_i]``.

    Returns a list with zero or one path step:
      * ``[]`` when the range is empty, when the joined words are listed in
        ``ignored_unknown_tags``, or when they are just "and" / "or";
      * an ``UNKNOWN`` step with no topics when the text starts with one of
        the ignored unknown starts;
      * an ``UNKNOWN`` step carrying the text itself when it is tolerated:
        ``allow_any`` is set, the first word starts with "~", or (unless
        ``no_unknown_starts``) the first word is an allowed unknown start
        and more words follow;
      * an ``UNKNOWN`` step carrying ``"error-unknown-tag"`` otherwise.

    ``i`` is only type-checked.
    """
    assert isinstance(to_i, int)
    assert isinstance(from_i, int)
    assert isinstance(i, int)
    if to_i <= from_i:
        return []
    span = wordlst[from_i:to_i]
    tag = " ".join(span)
    assert tag
    if ignored_unknown_starts_re.match(tag):
        # Tags with this start are to be ignored
        return [(from_i, ["UNKNOWN"], [])]
    if tag in ignored_unknown_tags or tag in ("and", "or"):
        return []
    first = span[0]
    tolerated = (
        allow_any
        or first.startswith("~")
        or (
            not no_unknown_starts
            and first in allowed_unknown_starts
            and len(span) > 1
        )
    )
    # Put ``tag`` into the error list as well to include the offending text.
    extra = [tag] if tolerated else ["error-unknown-tag"]
    return [(from_i, ["UNKNOWN"], extra)]
def add_new1(
    node: ValidNode,
    i: int,
    start_i: int,
    last_i: int,
    new_paths: list[list[PosPathStep]],
    new_nodes: list[tuple[ValidNode, int, int]],
    pos_paths: list[list[list[PosPathStep]]],
    wordlst: list[str],
    allow_any: bool,
    no_unknown_starts: bool,
    max_last_i: int,
) -> int:
    """Record ``node`` as a live search state and extend paths if it is an end.

    ``(node, start_i, last_i)`` is appended to ``new_nodes`` unless already
    present.  When ``node.end`` is set, one new path is appended to
    ``new_paths`` for every path in ``pos_paths[last_i]``: the step for this
    node, then whatever check_unknown() reports for the words between
    ``last_i`` and ``start_i``, then the earlier path.

    Returns ``i + 1`` when ``node`` is an end, otherwise the larger of
    ``max_last_i`` and ``last_i``.
    """
    assert isinstance(new_paths, list)
    state = (node, start_i, last_i)
    if state not in new_nodes:
        new_nodes.append(state)
    if not node.end:
        # Not a terminal point; only report how far last_i has grown.
        return max(max_last_i, last_i)

    # A sequence ends here.  Anything between last_i and start_i was not
    # covered by a known sequence and is classified separately.
    unknown_steps = check_unknown(
        last_i, start_i, i, wordlst, allow_any, no_unknown_starts
    )
    head: list[PosPathStep] = [(last_i, node.tags, node.topics)]
    for earlier in pos_paths[last_i]:
        new_paths.append(head + unknown_steps + earlier)
    return i + 1
@functools.lru_cache(maxsize=65536)
def decode_tags(
    src: str,
    allow_any=False,
    no_unknown_starts=False,
) -> tuple[list[tuple[str, ...]], list[str]]:
    """Decode ``src`` into tag sets and topics, retrying once on errors.

    The text is first decoded with decode_tags1().  If any resulting tag or
    topic starts with "error-", one rewritten variant of ``src`` is decoded:

      * if ``src`` contains "/", slashes are replaced by spaces everywhere
        except inside sequences matched by ``slashes_re``;
      * otherwise, if it contains " or " / " and ", those words are dropped.

    The retry result is returned when it is non-empty and has no more
    "error"-prefixed entries than the first attempt; otherwise the first
    result is returned.  Results are memoized per argument combination.

    NOTE(review): when ``src`` contains "/" but ``slashes_re`` matches
    nothing, the retry uses ``src`` unchanged (no slash is replaced) —
    confirm that this is intended.
    """

    def count_errors(sets, tops) -> int:
        in_tags = sum(1 for ts in sets for t in ts if t.startswith("error"))
        in_topics = sum(1 for t in tops if t.startswith("error"))
        return in_tags + in_topics

    tagsets, topics = decode_tags1(src, allow_any, no_unknown_starts)

    failed = any(
        t.startswith("error-") for ts in tagsets for t in ts
    ) or any(t.startswith("error-") for t in topics)
    if not failed:
        return tagsets, topics

    if "/" in src:
        # re.split() with the single group of slashes_re yields the protected
        # slash sequences at odd indices; only even pieces are rewritten.
        pieces = re.split(slashes_re, src)
        if len(pieces) > 1:
            new_src = "".join(
                piece if idx % 2 else piece.replace("/", " ")
                for idx, piece in enumerate(pieces)
            )
        else:
            new_src = src
    elif " or " in src or " and " in src:
        # Annoying kludge.
        new_src = src.replace(" and ", " ").replace(" or ", " ")
    else:
        # No retry strategy applies.
        return tagsets, topics

    new_tagsets, new_topics = decode_tags1(
        new_src, allow_any, no_unknown_starts
    )
    if (new_tagsets or new_topics) and (
        count_errors(new_tagsets, new_topics) <= count_errors(tagsets, topics)
    ):
        return new_tagsets, new_topics
    return tagsets, topics
def decode_tags1(
    src: str,
    allow_any: bool = False,
    no_unknown_starts: bool = False,
) -> tuple[list[tuple[str, ...]], list[str]]:
    """Decode a tag description, doing some canonicalizations.

    ``src`` is split into parts with ``split_at_comma_semi`` (a multi-word
    tag sequence cannot continue across a separator) and each part is split
    into whitespace-separated words.  The words are matched against the
    ``valid_sequences`` search tree; every way of covering the words is kept
    as a candidate path, and the path with the lowest weight (fewest steps,
    with a penalty of 100 for containing an ``["UNKNOWN"]`` step) is turned
    into the result.

    ``allow_any`` and ``no_unknown_starts`` are forwarded to ``add_new1`` and
    ``check_unknown``; a true ``no_unknown_starts`` additionally disables the
    ``allowed_unknown_starts`` checks made here.

    Returns ``(tagsets, topics)``: a sorted list of de-duplicated, sorted tag
    tuples, and a list of topics in first-seen order.  Returns ``([], [])``
    when no candidate path covers the input, and a single error tagset when
    more than 16 tagsets have accumulated while expanding the best path.
    """
    assert isinstance(src, str)

    # pos_paths[k] holds the candidate paths that cover the first k words.
    pos_paths: list[list[list[PosPathStep]]] = [[[]]]
    wordlst: list[str] = []
    max_last_i = 0
    # Bound before the loop so that the code after the loop can read it even
    # when no part contains any word (previously an UnboundLocalError).
    cur_nodes: list[tuple[ValidNode, int, int]] = []

    # NOTE(review): pos_paths and wordlst are shared by reference, but
    # max_last_i is an int and is bound by value, so add_new always receives
    # the 0 captured here; only its return value carries the current position
    # back to this function.
    add_new = functools.partial(
        add_new1,  # pre-set parameters and references for function
        pos_paths=pos_paths,
        wordlst=wordlst,
        allow_any=allow_any,
        no_unknown_starts=no_unknown_starts,
        max_last_i=max_last_i,
    )
    # First split the tags at commas and semicolons.  Their significance is
    # that a multi-word sequence cannot continue across them.
    parts = split_at_comma_semi(src, extra=[";", ":"])

    for part in parts:
        max_last_i = len(wordlst)  # "how far have we gone?"
        lst1 = part.split()
        if not lst1:
            continue
        wordlst.extend(lst1)
        cur_nodes = []  # Currently seen; sequences never span parts
        for w in lst1:
            i = len(pos_paths) - 1
            new_nodes: list[tuple[ValidNode, int, int]] = []
            # replacement nodes for next loop
            new_paths: list[list[PosPathStep]] = []
            node: ValidNode
            start_i: int
            last_i: int
            for node, start_i, last_i in cur_nodes:
                # ValidNodes are part of a search tree that checks if a
                # phrase is found in xlat_tags_map and other text->tags dicts.
                if w in node.children:
                    # the phrase continues down the tree
                    max_last_i = add_new(
                        node.children[w],
                        i,
                        start_i,
                        last_i,
                        new_paths,
                        new_nodes,
                    )
                if node.end:
                    # we've hit an end point, the tags and topics have already
                    # been gathered at some point, don't do anything with the
                    # old stuff
                    if w in valid_sequences.children:
                        # This starts a *new* possible section
                        max_last_i = add_new(
                            valid_sequences.children[w],  # root->
                            i,
                            i,
                            i,
                            new_paths,
                            new_nodes,
                        )
                if w not in node.children and not node.end:
                    # If i == last_i == 0, for example (beginning)
                    if (
                        i == last_i
                        or no_unknown_starts
                        or wordlst[last_i] not in allowed_unknown_starts
                    ):
                        if w in valid_sequences.children:
                            # Start new sequences here
                            max_last_i = add_new(
                                valid_sequences.children[w],
                                i,
                                i,
                                last_i,
                                new_paths,
                                new_nodes,
                            )
            if not new_nodes:
                # This is run at the start when i == max_last_i == 0,
                # which is what populates the first node in new_nodes.
                # Some initial words cause the rest to be interpreted as
                # unknown.
                if (
                    i == max_last_i
                    or no_unknown_starts
                    or wordlst[max_last_i] not in allowed_unknown_starts
                ):
                    if w in valid_sequences.children:
                        max_last_i = add_new(
                            # new sequence from root
                            valid_sequences.children[w],
                            i,
                            i,
                            max_last_i,
                            new_paths,
                            new_nodes,
                        )
            cur_nodes = new_nodes  # Completely replace nodes!
            # 2023-08-18, fix to improve performance
            # Decode tags does a big search of the best-shortest matching
            # sequences of tags, but the original algorithm didn't have
            # any culling happen during operation, so in a case with
            # a lot of tags (for example, big blocks of text inserted
            # somewhere by mistake that is processed by decode_tags),
            # it would lead to exponential growth of new_paths contents.
            # This culling, using the same weighting algorithm code as
            # in the original is just applied to new_paths before it is
            # added to pos_paths.  Basically it's "take the 10 best paths".
            # This *can* cause bugs if it gets stuck in a local minimum
            # or something, but this whole process is one-dimensional
            # and not that complex, so hopefully it works out...
            pw = []
            path: list[PosPathStep]
            for path in new_paths:
                weight = len(path)
                if any(x[1] == ["UNKNOWN"] for x in path):
                    weight += 100  # Penalize unknown paths
                pw.append((weight, path))
            new_paths = [weightpath[1] for weightpath in sorted(pw)[:10]]
            pos_paths.append(new_paths)

    if cur_nodes:
        # Sequences are still open after the last word: close each of them,
        # either as a complete match or as trailing unknown words.
        for node, start_i, last_i in cur_nodes:
            if node.end:
                for path in pos_paths[start_i]:
                    pos_paths[-1].append(
                        [(last_i, node.tags, node.topics)] + path
                    )
            else:
                u = check_unknown(
                    last_i,
                    len(wordlst),
                    len(wordlst),
                    wordlst,
                    allow_any,
                    no_unknown_starts,
                )
                if pos_paths[start_i]:
                    for path in pos_paths[start_i]:
                        pos_paths[-1].append(u + path)
                else:
                    pos_paths[-1].append(u)
    else:
        # Check for a final unknown tag
        paths = pos_paths[max_last_i] or [[]]
        u = check_unknown(
            max_last_i,
            len(wordlst),
            len(wordlst),
            wordlst,
            allow_any,
            no_unknown_starts,
        )
        if u:
            for path in list(paths):  # Copy in case it is the last pos
                pos_paths[-1].append(u + path)

    if not pos_paths[-1]:
        return [], []

    # Find the best path
    pw = []
    for path in pos_paths[-1]:
        weight = len(path)
        if any(x[1] == ["UNKNOWN"] for x in path):
            weight += 100  # Penalize unknown paths
        pw.append((weight, path))
    path = min(pw)[1]

    # Convert the best path to tagsets and topics
    tagsets: list[list[str]] = [[]]
    topics: list[str] = []
    for i, tagspec, topicspec in path:
        if len(tagsets) > 16:
            # Too many tagsets; the alternatives are multiplying out of hand.
            return [("error-unknown-tag", "error-exponential-tagsets")], []
        if tagspec == ["UNKNOWN"]:
            # For an unknown step the third element is appended to every
            # tagset as tags, not collected as topics.
            new_tagsets = []
            for x in tagsets:
                new_tagsets.append(x + topicspec)
            tagsets = new_tagsets
            continue
        if tagspec:
            # Each entry of tagspec is an alternative, so the number of
            # tagsets is multiplied by the number of alternatives.
            new_tagsets = []
            for x in tagsets:
                for t in tagspec:
                    if t:
                        new_tags = list(x)
                        for tag in t.split():
                            if tag not in new_tags:
                                new_tags.append(tag)
                        new_tagsets.append(new_tags)
                    else:
                        new_tagsets.append(x)
            tagsets = new_tagsets
        if topicspec:
            for t in topicspec:
                for topic in t.split():
                    if topic not in topics:
                        topics.append(topic)

    ret_tagsets = sorted(set(tuple(sorted(set(tags))) for tags in tagsets))
    # topics = list(sorted(set(topics)))   XXX tests expect not sorted
    # Yes, ret_tagsets is a list of tags in tuples, while topics is a LIST
    # of tags.  Turning topics into a tuple breaks tests, turning the tuples
    # inside tagsets into lists breaks tests, I'm leaving them mismatched
    # for now. XXX
    return ret_tagsets, topics
def parse_head_final_tags(
    wxr: WiktextractContext, lang: str, form: str
) -> tuple[str, list[str]]:
    """Strip tags that may appear at the end of a form head.

    Also usable for the trailing gender etc. tags of translations and
    linkages.  ``lang`` should be the language that ``form`` belongs to.
    Returns the form with the recognized suffixes removed, together with the
    tags those suffixes map to.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(lang, str)  # Should be language that "form" is for
    assert isinstance(form, str)

    # The suffix handling below does not cope with repeated whitespace, so
    # normalize it first.
    form = re.sub(r"\s+", " ", form.strip())
    if not form:
        return form, []

    origform = form
    found_tags: list[str] = []

    def unmark(value: str, suffix: str, sortid: str) -> str:
        # A leading "?" in a map value marks a suspicious suffix: report it
        # and return the value without the marker.
        if not value.startswith("?"):
            return value
        wxr.wtp.debug(
            "suspicious suffix {!r} in language {}: {}".format(
                suffix, lang, origform
            ),
            sortid=sortid,
        )
        return value[1:]

    # If parsing for certain Bantu languages (e.g., Swahili), handle
    # some extra head-final tags first
    if lang in head_final_bantu_langs:
        hit = re.search(head_final_bantu_re, form)
        if hit is not None:
            suffix = hit.group(1)
            if not wxr.wtp.title.endswith(suffix):  # type:ignore[union-attr]
                form = form[: hit.start()]
                mapped = unmark(
                    head_final_bantu_map[suffix],
                    suffix,
                    "form_descriptions/1028",
                )
                found_tags.extend(mapped.split())

    # If parsing for certain Semitic languages (e.g., Arabic), handle
    # some extra head-final tags first
    if lang in head_final_semitic_langs:
        hit = re.search(head_final_semitic_re, form)
        if hit is not None:
            suffix = hit.group(1)
            if not wxr.wtp.title.endswith(suffix):  # type:ignore[union-attr]
                form = form[: hit.start()]
                mapped = unmark(
                    head_final_semitic_map[suffix],
                    suffix,
                    "form_descriptions/1043",
                )
                found_tags.extend(mapped.split())

    # If parsing for certain other languages (e.g., Lithuanian,
    # French, Finnish), handle some extra head-final tags first
    if lang in head_final_other_langs:
        hit = re.search(head_final_other_re, form)
        if hit is not None:
            suffix = hit.group(1)
            if not wxr.wtp.title.endswith(suffix):  # type:ignore[union-attr]
                form = form[: hit.start()]
                found_tags.extend(head_final_other_map[suffix].split(" "))

    # Handle normal head-final tags, repeating until a pass strips nothing.
    while True:
        hit = re.search(head_final_re, form)
        if hit is None:
            break
        suffix = hit.group(3)
        # Only replace tags containing digits in languages that have
        # head-final numeric tags (e.g., Bantu classes); also, don't replace
        # tags if the main title ends with them or consists of them (then
        # presume they are part of the word), e.g. defunct/English:
        # "more defunct" -> "more" ["archaic"]
        has_digit = re.search(r"\d", suffix)
        if has_digit and lang not in head_final_numeric_langs:
            break
        if wxr.wtp.title.endswith(" " + suffix):  # type:ignore[union-attr]
            break
        if wxr.wtp.title == suffix:
            break
        # hit.start(3) is where the text of group 3 begins
        shortened = form[: hit.start(3)].strip()
        mapped = unmark(
            xlat_head_map[suffix], suffix, "form_descriptions/1077"
        )
        found_tags.extend(mapped.split())
        unchanged = shortened == form
        form = shortened
        if unchanged:
            break

    # Generate warnings about words ending in " or" after processing
    suspicious = (
        (form.endswith(" or") and not origform.endswith(" or"))
        or re.search(
            r" (1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|"
            r"1a|2a|9a|10a|m1|f1|f2|m2|f3|m3|f4|m4|f5|m5|or|\?)"
            r"($|/| (f|m|sg|pl|anim|inan))",
            form,
        )
        or form.endswith(" du")
    )
    if suspicious and form not in ok_suspicious_forms:
        wxr.wtp.debug(
            "suspicious unhandled suffix in {}:"
            " {!r}, originally {!r}".format(lang, form, origform),
            sortid="form_descriptions/1089",
        )

    return form, found_tags
def quote_kept_parens(s: str) -> str:
    """Protect certain parenthesized expressions from parenthesis handling.

    Parenthesized parts that belong to the word itself, such as
    "rear admiral (upper half)", get their parentheses replaced by the
    ``__lpar__`` / ``__rpar__`` markers so that they are not interpreted as
    parentheses.
    """
    kept_pattern = (
        r"\((lower half|upper half|k|s|n|II|III|A|C|G|U|Y|"
        r"vinyl|p-phenylene vinylene|\(\(\s*\)\))\)"
    )

    def mark(hit: re.Match) -> str:
        # Keep the inner text, swap the surrounding parentheses for markers.
        return "__lpar__" + hit.group(1) + "__rpar__"

    return re.compile(kept_pattern).sub(mark, s)
def quote_kept_ruby(
    wxr: WiktextractContext,
    ruby_tuples: list[
        tuple[
            str,
            str,
        ]
    ],
    s: str,
) -> str:
    """Mark parenthesized ruby readings in ``s`` so they are kept.

    For each ``(base, reading)`` pair in ``ruby_tuples``, an occurrence of
    ``base(reading)`` in ``s`` is rewritten to
    ``base__lrub__reading__rrub__``.  Whitespace around the parentheses is
    not tolerated, because the text must first match the stricter pair
    pattern.  If ``ruby_tuples`` is empty, a debug message is logged through
    ``wxr`` and ``s`` is returned unchanged.
    """
    if not ruby_tuples:
        wxr.wtp.debug(
            "quote_kept_ruby called with no ruby",
            sortid="form_description/1114/20230517",
        )
        return s
    # Escape every base and reading once.  Both lists have one entry per
    # tuple, so they cannot be empty here.
    ks = [re.escape(k) for k, _ in ruby_tuples]
    rs = [re.escape(r) for _, r in ruby_tuples]
    # Rewrites "base ( reading )" inside an already matched span.
    newm = re.compile(
        r"({})\s*\(\s*({})\s*\)".format("|".join(ks), "|".join(rs))
    )
    # Finds spans where a base is followed by its own reading; this keeps
    # the rewrite from pairing a base with another tuple's reading.
    rub_re = re.compile(
        r"({})".format(
            r"|".join(r"{}\(*{}\)*".format(k, r) for k, r in zip(ks, rs))
        )
    )

    def paren_replace(m: re.Match) -> str:
        return newm.sub(r"\1__lrub__\2__rrub__", m.group(0))

    return rub_re.sub(paren_replace, s)
def unquote_kept_parens(s: str) -> str:
    """Turn ``__lpar__...__rpar__`` marker pairs back into parentheses."""
    marker_re = re.compile(r"__lpar__(.*?)__rpar__")

    def restore(hit: re.Match) -> str:
        return "(" + hit.group(1) + ")"

    return marker_re.sub(restore, s)
def add_romanization(
    wxr: WiktextractContext,
    data: WordData,
    roman: str,
    text: str,
    is_reconstruction: bool,
    head_group: Optional[int],
    ruby: Sequence[tuple[str, str]],
) -> None:
    """Add ``roman`` to ``data`` as a related form tagged "romanization".

    Intercepts broken romanizations such as "Yale: hēnpyeng", where a
    romanization style precedes a colon.  Most romanization styles are
    already present as tags, so the prefix is run through ``decode_tags``;
    when that yields tagsets, their tags are added and only the text after
    the colon is kept as the romanization.
    """
    tags_lst = ["romanization"]
    prefixed = re.match(r"([^:]+):(.+)", roman)
    if prefixed is not None:
        style_tagsets, _topics = decode_tags(prefixed.group(1))
        if style_tagsets:
            for style_tags in style_tagsets:
                tags_lst.extend(style_tags)
            roman = prefixed.group(2)
    add_related(
        wxr,
        data,
        tags_lst,
        [roman],
        text,
        add_all_canonicals=True,
        is_reconstruction=is_reconstruction,
        head_group=head_group,
        ruby_data=ruby,
    )
def add_related(
    wxr: WiktextractContext,
    data: WordData,
    tags_lst: Union[list[str], tuple[str, ...]],
    related_list: list[str],
    origtext: str,
    add_all_canonicals: bool,
    is_reconstruction: bool,
    head_group: Optional[int],
    ruby_data: Optional[Sequence[tuple[str, str]]] = None,
    links: list[tuple[str, str]] | None = None,
    link_dict: dict[str, list[str]] | None = None,
) -> Optional[list[tuple[str, ...]]]:
    """Internal helper for post-processing related forms (e.g., in word head).

    The words of ``related_list`` are joined into one form description.  A
    parenthesized part at its start or end is decoded as tags, or taken as a
    romanization.  The remainder may be split at "/" into alternatives.  Each
    alternative is then stored in ``data`` as an "alt_of", "form_of" or
    "compound" entry, or as a form under "forms", depending on the tags
    decoded from ``tags_lst``.

    Returns a list of tagsets to be added to following related forms when
    the leading parenthesized part started with "all" or "both" (cf.
    walrus/English word head), otherwise None.  None is also returned early
    for ignored or malformed values.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(tags_lst, (list, tuple))
    for x in tags_lst:
        assert isinstance(x, str)
    assert isinstance(related_list, (list, tuple))
    assert isinstance(origtext, str)
    assert add_all_canonicals in (True, False)
    assert isinstance(ruby_data, (list, tuple)) or ruby_data is None
    if ruby_data is None:
        ruby_data = []
    related = " ".join(related_list)
    if related == "[please provide]":
        return None
    if related in IGNORED_RELATED:
        return None
    if is_reconstruction and related.startswith("*") and len(related) > 1:
        related = related[1:]

    # Get title word, with any reconstruction prefix removed
    titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title)  # type:ignore[arg-type]

    def check_related(related: str) -> None:
        # Warn about some suspicious related forms
        m = re.search(suspicious_related_re, related)
        if (m and m.group(0) not in titleword) or (
            related in ("f", "m", "n", "c") and len(titleword) >= 3
        ):
            if "eumhun" in tags_lst:
                return
            if "cangjie-input" in tags_lst:
                return
            if "class" in tags_lst:
                return
            if wxr.wtp.section == "Korean" and re.search(
                r"^\s*\w*>\w*\s*$", related
            ):
                # ignore Korean "i>ni" / "라>나" values
                return
            if (
                wxr.wtp.section == "Burmese"
                and "romanization" in tags_lst
                and re.search(r":", related)
            ):
                # ignore Burmese with ":", that is used in Burmese
                # transliteration of "း", the high-tone visarga.
                return
            wxr.wtp.debug(
                "suspicious related form tags {}: {!r} in {!r}".format(
                    tags_lst, related, origtext
                ),
                sortid="form_descriptions/1147",
            )

    following_tagsets = None  # Tagsets to add to following related forms
    roman = None
    tagsets1: list[tuple[str, ...]] = [tuple()]
    topics1: list[str] = []

    # A leading "(...)" (one level of nested parentheses allowed) holds tags
    # for this form; "all ..." / "both ..." also applies to following forms.
    m = re.match(r"\((([^()]|\([^()]*\))*)\)\s+", related)
    if m:
        paren = m.group(1)
        related = related[m.end() :]
        m = re.match(r"^(all|both) (.*)", paren)
        if m:
            tagsets1, topics1 = decode_tags(m.group(2))
            following_tagsets = tagsets1
        else:
            tagsets1, topics1 = decode_tags(paren)
    else:
        # Otherwise look for a trailing "(...)": dropped when it starts with
        # "U+", taken as a romanization when it classifies as such and the
        # text before it classifies as "other", else decoded as tags.
        m = re.search(r"\s+\((([^()]|\([^()]*\))*)\)$", related)
        if m:
            paren = m.group(1)
            if paren.startswith("U+"):
                related = related[: m.start()]
            else:
                cls = classify_desc(paren)
                if (
                    cls in ("romanization", "english")
                    and classify_desc(related[: m.start()]) == "other"
                ):
                    roman = paren
                    related = related[: m.start()]
                else:
                    related = related[: m.start()]
                    tagsets1, topics1 = decode_tags(paren)
    if related and related.startswith("{{"):
        wxr.wtp.debug(
            "`{{` in word head form - possible Wiktionary error: {!r}".format(
                related
            ),
            sortid="form_descriptions/1177",
        )
        return None  # Likely Wiktionary coding error
    related = unquote_kept_parens(related)
    # Split related by "/" (e.g., grande/Spanish) superlative in head
    # Do not split if / in word title, see π//Japanese
    if len(related) > 5 and "/" not in wxr.wtp.title:  # type:ignore[operator]
        alts = split_at_comma_semi(related, separators=["/"])
    else:
        alts = [related]
    if ruby_data:
        # prepare some regex stuff in advance
        ks, rs = [], []
        for k, r in ruby_data:
            ks.append(re.escape(k))
            rs.append(re.escape(r))
        splitter = r"((?:{})__lrub__(?:{})__rrub__)".format(
            "|".join(ks), "|".join(rs)
        )
    for related in alts:
        ruby: list[tuple[str, str]] = []
        if ruby_data:
            # Pull the quoted ruby readings out of the text, keeping only
            # the base characters in the form itself.
            new_related = []
            rub_split = re.split(splitter, related)
            for s in rub_split:
                m = re.match(r"(.+)__lrub__(.+)__rrub__", s)
                if m:
                    # add ruby with (\1, \2)
                    ruby.append((m.group(1), m.group(2)))
                    new_related.append(m.group(1))
                else:
                    new_related.append(s)
            related = "".join(new_related)
        tagsets2, topics2 = decode_tags(" ".join(tags_lst))
        for tags1 in tagsets1:
            assert isinstance(tags1, (list, tuple))
            for tags2 in tagsets2:
                # This used to re-check tags1, leaving tags2 unvalidated.
                assert isinstance(tags2, (list, tuple))
                dt: LinkageData = {"word": related}
                if roman:
                    dt["roman"] = roman
                if ruby:
                    dt["ruby"] = ruby
                if "alt-of" in tags2:
                    check_related(related)
                    data_extend(data, "tags", tags1)
                    data_extend(data, "tags", tags2)
                    data_extend(data, "topics", topics1)
                    data_extend(data, "topics", topics2)
                    data_append(data, "alt_of", dt)
                elif "form-of" in tags2:
                    check_related(related)
                    data_extend(data, "tags", tags1)
                    data_extend(data, "tags", tags2)
                    data_extend(data, "topics", topics1)
                    data_extend(data, "topics", topics2)
                    data_append(data, "form_of", dt)
                elif "compound-of" in tags2:
                    check_related(related)
                    data_extend(data, "tags", tags1)
                    data_extend(data, "tags", tags2)
                    data_extend(data, "topics", topics1)
                    data_extend(data, "topics", topics2)
                    data_append(data, "compound", related)
                else:
                    lang = wxr.wtp.section or "LANG_MISSING"
                    # NOTE(review): `related` is rebound to the stripped form
                    # here, so a later iteration of these tagset loops parses
                    # the already-stripped text - confirm this is intended.
                    related, final_tags = parse_head_final_tags(
                        wxr, lang, related
                    )
                    tags = list(tags1) + list(tags2) + list(final_tags)
                    check_related(related)
                    form: FormData = {"form": related}
                    if (
                        links
                        and link_dict
                        and (
                            form_links := match_links_to_form(
                                wxr, related, links, link_dict
                            )
                        )
                    ):
                        form["links"] = form_links
                    if head_group:
                        form["head_nr"] = head_group
                    if roman:
                        form["roman"] = roman
                    if ruby:
                        form["ruby"] = ruby
                    data_extend(form, "topics", topics1)
                    data_extend(form, "topics", topics2)
                    if topics1 or topics2:
                        wxr.wtp.debug(
                            "word head form has topics: {}".format(form),
                            sortid="form_descriptions/1233",
                        )
                    # Add tags from canonical form into the main entry
                    if "canonical" in tags:
                        if related in ("m", "f") and len(titleword) > 1:
                            wxr.wtp.debug(
                                "probably incorrect canonical form "
                                "{!r} ignored (probably tag combination "
                                "missing from xlat_head_map)".format(related),
                                sortid="form_descriptions/1241",
                            )
                            continue
                        if (
                            related != titleword
                            or add_all_canonicals
                            or topics1
                            or topics2
                            or ruby
                        ):
                            data_extend(form, "tags", sorted(set(tags)))
                        else:
                            # We won't add canonical form here
                            filtered_tags = list(
                                x for x in tags if x != "canonical"
                            )
                            data_extend(data, "tags", filtered_tags)
                            continue
                    else:
                        data_extend(form, "tags", sorted(set(tags)))
                    # Only insert if the form is not already there
                    for old in data.get("forms", ()):
                        if form == old:
                            break
                    else:
                        data_append(data, "forms", form)

    # If this form had pre-tags that started with "both" or "all", add those
    # tags also to following related forms that don't have their own tags
    # specified.
    return following_tagsets
def match_links_to_form(
    wxr: WiktextractContext,
    form: str,
    links: list[tuple[str, str]],
    link_dict: dict[str, list[str]] | None,
) -> list[tuple[str, str]] | None:
    """Return the ``(text, target)`` links matching ``form``, if noteworthy.

    ``links`` are the links in document order and ``link_dict`` maps a link
    text to all of its targets; it is built from ``links`` when None.  A
    form found in ``link_dict`` gets all targets recorded for it.  Otherwise
    a multi-word form is matched word by word against a run of consecutive
    links, ignoring surrounding ``,;() `` characters.

    Only "weird" links are reported: the result is None unless at least one
    matched link has a target that differs from its text both as a whole
    and with any ``#anchor`` removed.  None is also returned when ``links``
    is empty or nothing matches.
    """
    if not links:
        return None
    if link_dict is None:
        link_dict = {}
        for ltxt, ltrg in links:
            link_dict.setdefault(ltxt, []).append(ltrg)
    ret: list[tuple[str, str]] = []
    if form in link_dict:
        if len(link_dict[form]) > 1 and any(
            x != link_dict[form][0] for x in link_dict[form]
        ):
            wxr.wtp.warning(
                f"{form=} has many different "
                f"link candidates `{link_dict[form]}`, "
                f"which can't be disambiguated.",
                sortid="form_descriptions/match_links_to_form",
            )
        ret = [(form, ltarg) for ltarg in link_dict[form]]
    elif " " in form:
        # split and search for a sequence of links...
        split_forms = form.split()
        for i, (ltext, _ltarg) in enumerate(links):
            if ltext != split_forms[0]:
                continue
            window = links[i : i + len(split_forms)]
            # The run must not fall off the end of links and every word
            # must agree with the text of the link at the same offset.
            if len(window) == len(split_forms) and all(
                f.strip(",;() ") == wtext.strip(",;() ")
                for f, (wtext, _wtarg) in zip(split_forms, window)
            ):
                ret = window
                break
    # We only care about weird links
    for txt, tar in ret:
        # partition() leaves targets without "#" intact; slicing with
        # str.find() would cut off their last character, since find()
        # returns -1 when there is no "#".
        if txt != tar and txt != tar.partition("#")[0]:
            return ret
    return None
# Issue #967, in English word forms sometimes forms are skipped because
# they are taggable words and their distw() is too big, like clipping from clip
WORDS_WITH_FALSE_POSITIVE_TAGS: dict[str, list[str]] = {
    # XXX remember to change me back to clipping after tests.
    "clip": ["clipping"],
    "English": ["English", "Englishes"],
    "common": ["common", "commoner"],
}

# NOTE(review): presumably keyed by title word like the table above, listing
# strings in that word's head that look like forms but are not - confirm
# against the code that reads it.
WORDS_WITH_FALSE_POSITIVE_FORMS: dict[str, list[str]] = {
    "unaccountability": ["countable", "uncountable"],
    "uncountability": ["countable", "uncountable"],
}

# Currently empty.
FALSE_POSITIVE_MISSING_FORMS: dict[str, list[str]] = {}

# NOTE(review): going by the name, tag words that are associated with a
# following form - confirm against the code that reads it.
FORM_ASSOCIATED_TAG_WORDS: set[str] = {
    "participle",
    "past",
    "present",
    "singular",
    "plural",
    "first-person",
    "second-person",
    "third-person",
    "gerund",
}

# Placeholder string standing in for a semicolon.
SEMICOLON_REPLACEMENT = "__SEMICOLON__"
1873def parse_word_head(
1874 wxr: WiktextractContext,
1875 word: str,
1876 pos: str,
1877 text: str,
1878 data: WordData,
1879 is_reconstruction: bool,
1880 head_group: Optional[int],
1881 original_header_nodes: list[WikiNode | str] | None = None,
1882 ruby=None,
1883 links: list[
1884 tuple[
1885 str,
1886 str,
1887 ]
1888 ]
1889 | None = None,
1890 label_templates: list[TemplateData] | None = None,
1891) -> None:
1892 """Parses the head line for a word for in a particular language and
1893 part-of-speech, extracting tags and related forms."""
1894 assert isinstance(wxr, WiktextractContext)
1895 assert isinstance(pos, str)
1896 assert isinstance(text, str)
1897 assert isinstance(data, dict)
1898 assert isinstance(ruby, (list, tuple)) or ruby is None
1899 if ruby is None:
1900 ruby = []
1901 assert is_reconstruction in (True, False)
1902 # print("PARSE_WORD_HEAD: {}: {!r}".format(wxr.wtp.section, text))
1903 # print(f"PARSE_WORD_HEAD: {data=}")
1904 # print(f"PARSE_WORD_HEAD: {links=}")
1906 # Save original text for if we want to look for mismatched form-links
1908 link_dict: dict[str, list[str]] | None
1909 if links is not None:
1910 link_dict = {}
1911 for ltxt, ltrg in links:
1912 if ltxt not in link_dict:
1913 link_dict[ltxt] = [
1914 ltrg,
1915 ]
1916 else:
1917 link_dict[ltxt].append(ltrg)
1918 else:
1919 link_dict = None
1921 # print(f"MAIN: {links=}")
1922 link_words_not_alnum = []
1923 if not word.isalnum() and not word.replace("-", "").isalnum():
1924 # `-` is kosher, add more of these if needed.
1925 # if the word contains non-letter or -number characters, it
1926 # might have something that messes with split-at-semi-comma; we
1927 # collect links so that we can skip splitting them.
1928 if links is None and original_header_nodes is not None:
1929 links, _ = extract_links_from_node(
1930 wxr,
1931 original_header_nodes,
1932 remove_anchor_tags=True,
1933 expand_nodes=True,
1934 )
1935 if links is not None: 1935 ↛ 1939line 1935 didn't jump to line 1939 because the condition on line 1935 was always true
1936 for ltext, ltar in links:
1937 if not ltext.isalnum():
1938 link_words_not_alnum.append(ltext)
1939 if word not in link_words_not_alnum: 1939 ↛ 1942line 1939 didn't jump to line 1942 because the condition on line 1939 was always true
1940 link_words_not_alnum.append(word)
1942 if link_words_not_alnum is None: 1942 ↛ 1943line 1942 didn't jump to line 1943 because the condition on line 1942 was never true
1943 link_words_not_alnum = []
1945 if len(link_words_not_alnum) > 0:
1946 # if we have link data (that is, links with stuff like commas and
1947 # spaces, replace word_re with a modified local scope pattern
1948 # print(f"links {list((c, ord(c)) for link in links for c in link)=}")
1949 word_re = re.compile(
1950 r"\b" # In case we have forms that are longer and contain links
1951 +
1952 # or words as a substring...
1953 r"\b|\b".join(
1954 sorted(
1955 (re.escape(s) for s in link_words_not_alnum),
1956 key=lambda x: -len(x),
1957 )
1958 )
1959 + r"\b|"
1960 + word_pattern
1961 )
1962 else:
1963 word_re = word_re_global
1965 if "Lua execution error" in text or "Lua timeout error" in text: 1965 ↛ 1966line 1965 didn't jump to line 1966 because the condition on line 1965 was never true
1966 return
1968 # Fix words with "superlative:" or "comparative:" at end of head
1969 # e.g. grande/Spanish/Adj
1970 text = re.sub(r" (superlative|comparative): (.*)", r" (\1 \2)", text)
1972 # Parse Arabic non-past forms, e.g. أبلع/Arabic/Verb
1973 m = re.search(r", non-past ([^)]+ \([^)]+\))", text)
1974 if m:
1975 add_related(
1976 wxr,
1977 data,
1978 ["non-past"],
1979 [m.group(1)],
1980 text,
1981 True,
1982 is_reconstruction,
1983 head_group,
1984 ruby,
1985 links,
1986 link_dict,
1987 )
1988 text = text[: m.start()] + text[m.end() :]
1990 language = wxr.wtp.section
1991 titleword = re.sub(
1992 r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "MISSING_TITLE"
1993 )
1994 titleparts = list(
1995 m.group(0)
1996 for m in re.finditer(word_re, wxr.wtp.title or "MISSING_TITLE")
1997 )
1998 if not titleparts: 1998 ↛ 1999line 1998 didn't jump to line 1999 because the condition on line 1998 was never true
1999 return
2001 # Remove " or" from the end to prevent weird canonical forms
2002 if text.endswith(" or"):
2003 for tp in titleparts:
2004 if text.endswith(tp): 2004 ↛ 2005line 2004 didn't jump to line 2005 because the condition on line 2004 was never true
2005 break
2006 else:
2007 text = text.removesuffix(" or").rstrip()
2009 # Handle the part of the head that is not in parentheses. However, certain
2010 # parenthesized parts are part of word, and those must be handled
2011 # specially here.
2012 if ruby:
2013 text = quote_kept_ruby(wxr, ruby, text)
2014 base = text
2015 base = quote_kept_parens(base)
2016 base = remove_text_in_parentheses(base)
2017 base = base.replace("?", "") # Removes uncertain articles etc
2018 base = re.sub(r"\s+", " ", base)
2019 base = re.sub(r" ([,;])", r"\1", base)
2020 base = re.sub(r" • ", r" ", base)
2021 # Many languages use • as a punctuation mark separating the base
2022 # from the rest of the head. στάδιος/Ancient Greek, issue #176
2023 base = base.strip()
2024 # print(f"{base=}, {text=}")
2026 # Check for certain endings in head (mostly for compatibility with weird
2027 # heads, e.g. rata/Romanian "1st conj." at end)
2028 m = re.search(head_end_re, base)
2029 tags: Union[tuple[str, ...], list[str]] = []
2030 if m: 2030 ↛ 2031line 2030 didn't jump to line 2031 because the condition on line 2030 was never true
2031 tags = head_end_map[m.group(1).lower()].split()
2032 data_extend(data, "tags", tags)
2033 base = base[: m.start()]
2035 # Special case: handle Hán Nôm readings for Vietnamese characters
2036 m = re.match(
2037 r"{}: (Hán Nôm) readings: (.*)".format(re.escape(titleword)), base
2038 )
2039 if m: 2039 ↛ 2040line 2039 didn't jump to line 2040 because the condition on line 2039 was never true
2040 tag, readings = m.groups()
2041 tag = re.sub(r"\s+", "-", tag)
2042 for reading in split_at_comma_semi(
2043 readings, skipped=link_words_not_alnum
2044 ):
2045 add_related(
2046 wxr,
2047 data,
2048 [tag],
2049 [reading],
2050 text,
2051 True,
2052 is_reconstruction,
2053 head_group,
2054 ruby,
2055 links,
2056 link_dict,
2057 )
2058 return
2060 # Special case: Hebrew " [pattern: nnn]" ending
2061 m = re.search(r"\s+\[pattern: ([^]]+)\]", base)
2062 if m: 2062 ↛ 2063line 2062 didn't jump to line 2063 because the condition on line 2062 was never true
2063 add_related(
2064 wxr,
2065 data,
2066 ["class"],
2067 [m.group(1)],
2068 text,
2069 True,
2070 is_reconstruction,
2071 head_group,
2072 ruby,
2073 links,
2074 link_dict,
2075 )
2076 base = base[: m.start()] + base[m.end() :]
2078 # Clean away some messy "Upload an image" template text used in
2079 # American Sign Language:
2080 # S@NearBaseForearm-PalmUp Frontandback S@BaseForearm-PalmUp
2081 m = re.search(r"Upload .+ gif image.", base)
2082 if m: 2082 ↛ 2083line 2082 didn't jump to line 2083 because the condition on line 2082 was never true
2083 base = base[: m.start()] + base[m.end() :]
2085 semicolon_present = False
2086 # Split the head into alternatives. This is a complicated task, as
2087 # we do not want to split on "or" or "," when immediately followed by more
2088 # head-final tags, but otherwise do want to split by them.
2089 # 20230907 added "or" to this to handle 'true or false', titles with 'or'
2090 if wxr.wtp.title and (
2091 "," in wxr.wtp.title or ";" in wxr.wtp.title or " or " in wxr.wtp.title
2092 ):
2093 # If the title has ";", we don't want to split on that and can remove
2094 # the ; from the splitting regex pretty easily because it's uncommon.
2095 # However, commas are so common that not splitting on them is just
2096 # not feasible, and we have to just deal with that if there are
2097 # alternative forms or variations with stray commas that shouldn't
2098 # be split.
2099 if ";" in wxr.wtp.title:
2100 semicolon_present = True
2101 base = base.replace(";", SEMICOLON_REPLACEMENT)
2102 default_splitter = head_split_no_semicolon_re
2103 else:
2104 default_splitter = head_split_re
2105 # A kludge to handle article titles/phrases with commas.
2106 # Preprocess splits to first capture the title, then handle
2107 # all the others as usual.
2108 presplits = re.split(r"({})".format(wxr.wtp.title), base)
2109 splits = []
2110 for psplit in presplits:
2111 if psplit == wxr.wtp.title:
2112 splits.append(psplit)
2113 else:
2114 splits.extend(re.split(default_splitter, psplit))
2115 else:
2116 # Do the normal split; previous behavior.
2117 splits = re.split(head_split_re, base)
2118 # print("BASE: ", repr(base))
2119 # print("SPLITS:", splits)
2120 alts: list[str] = []
2121 # print("parse_word_head: splits:", splits,
2122 # "head_split_re_parens:", head_split_re_parens)
2123 for i in range(
2124 0, len(splits) - head_split_re_parens, head_split_re_parens + 1
2125 ):
2126 v = splits[i]
2127 ending = splits[i + 1] or "" # XXX is this correct???
2128 # print("parse_word_head alts v={!r} ending={!r} alts={}"
2129 # .format(v, ending, alts))
2130 if alts and (v == "" and ending):
2131 assert ending[0] == " "
2132 alts[-1] += " or" + ending # endings starts with space
2133 elif v or ending:
2134 alts.append((v or "") + (ending or ""))
2135 last = splits[-1].strip()
2136 conn = "" if len(splits) < 3 else splits[-2]
2137 # print("parse_word_head alts last={!r} conn={!r} alts={}"
2138 # .format(last, conn, alts))
2139 if ( 2139 ↛ 2150line 2139 didn't jump to line 2150 because the condition on line 2139 was never true
2140 alts
2141 and last
2142 and (
2143 last.split()[0] in xlat_head_map
2144 or (
2145 conn == " or "
2146 and (alts[-1] + " or " + last).strip() in xlat_head_map
2147 )
2148 )
2149 ):
2150 alts[-1] += " or " + last
2151 elif last: 2151 ↛ 2152line 2151 didn't jump to line 2152 because the condition on line 2151 was never true
2152 alts.append(last)
2154 # print("parse_word_head alts: {}".format(alts))
2155 # print(f"{base=}")
2157 # Process the head alternatives
2158 canonicals: list[tuple[list[str], list[str]]] = []
2159 mode: Optional[str] = None
2160 for alt_i, alt in enumerate(alts):
2161 alt = alt.strip()
2162 if alt.startswith("compound form:"): 2162 ↛ 2163line 2162 didn't jump to line 2163 because the condition on line 2162 was never true
2163 mode = "compound-form"
2164 alt = alt[14:].strip()
2165 if ((dash_i := alt.find(" -")) > 0) and (
2166 dash_i > (wxr.wtp.title or "").find(" -")
2167 ):
2168 # test_en_head / test_suffixes_at_end_of_form1
2169 # Some heads have suffixes that end up attached to the form
2170 # like in https://en.wiktionary.org/wiki/%E6%A5%BD%E3%81%97%E3%81%84
2171 alt = alt[:dash_i]
2172 if mode == "compound-form": 2172 ↛ 2173line 2172 didn't jump to line 2173 because the condition on line 2172 was never true
2173 add_related(
2174 wxr,
2175 data,
2176 ["in-compounds"],
2177 [alt],
2178 text,
2179 True,
2180 is_reconstruction,
2181 head_group,
2182 ruby,
2183 links,
2184 link_dict,
2185 )
2186 continue
2187 # For non-first parts, see if it can be treated as tags-only
2188 if alt_i == 0:
2189 expanded_alts = [alt]
2190 else:
2191 expanded_alts = map_with(xlat_descs_map, [alt])
2192 # print("EXPANDED_ALTS:", expanded_alts)
2193 tagsets: Optional[list[tuple[str, ...]]]
2194 for alt in expanded_alts:
2195 baseparts = list(m.group(0) for m in word_re.finditer(alt))
2196 if alt_i > 0:
2197 tagsets, topics = decode_tags(" ".join(baseparts))
2198 if not any("error-unknown-tag" in x for x in tagsets):
2199 data_extend(data, "topics", topics)
2200 for tags1 in tagsets:
2201 data_extend(data, "tags", tags1)
2202 continue
2204 alt, tags = parse_head_final_tags(
2205 wxr, language or "MISSING_LANG", alt
2206 )
2207 tags = list(tags) # Make sure we don't modify anything cached
2208 tags.append("canonical")
2209 if alt_i == 0 and "," in wxr.wtp.title or ";" in wxr.wtp.title: # type:ignore[operator]
2210 # Kludge to handle article titles/phrases with commas.
2211 # basepart's regex strips commas, which leads to a
2212 # canonical form that is the title phrase without a comma.
2213 # basepart in add_related is almost immediately joined with
2214 # spaces anyhow. XXX not exactly sure why it's
2215 # canonicals.append((tags, baseparts)) and not (tags, [alt])
2216 baseparts = [alt]
2217 canonicals.append((tags, baseparts))
2219 # If more of this kind of replace-and-return-original kind of stuff is
2220 # needed, make semicolon_present into a flag enum, something like `modified`
2221 if semicolon_present:
2222 new_cans = []
2223 for tags, baseparts in canonicals:
2224 new_cans.append(
2225 (
2226 tags,
2227 [s.replace(SEMICOLON_REPLACEMENT, ";") for s in baseparts],
2228 )
2229 )
2230 canonicals = new_cans
2231 for tags, baseparts in canonicals:
2232 add_related(
2233 wxr,
2234 data,
2235 tags,
2236 baseparts,
2237 text,
2238 len(canonicals) > 1,
2239 is_reconstruction,
2240 head_group,
2241 ruby,
2242 links,
2243 link_dict,
2244 )
2246 # Handle parenthesized descriptors for the word form and links to
2247 # related words
2248 text = quote_kept_parens(text)
2249 parens = list(
2250 m.group(2)
2251 for m in re.finditer(r"(^|\s)\((([^()]|\([^()]*\))*)\)", text)
2252 )
2253 parens.extend(
2254 m.group(1)
2255 for m in re.finditer(r"[^\s]\((([^()]|\([^()]*\))*)\)($|\s)", text)
2256 )
2257 have_romanization = False
2258 have_ruby = False
2259 hiragana = ""
2260 katakana = ""
2261 for paren in parens:
2262 paren = paren.strip()
2263 if not paren: 2263 ↛ 2264line 2263 didn't jump to line 2264 because the condition on line 2263 was never true
2264 continue
2265 can_be_form = True
2266 if label_templates is not None and paren.startswith(
2267 "__LABEL_TEMPLATE_"
2268 ):
2269 # wxr.wtp.warning("Found label template in head")
2270 # continue
2271 can_be_form = False
2272 m = re.match(r"__LABEL_TEMPLATE_(\d+)__", paren)
2273 if m is None: 2273 ↛ 2274line 2273 didn't jump to line 2274 because the condition on line 2273 was never true
2274 wxr.wtp.warning(
2275 f"Label template list magic phrase is broken: `{paren}`",
2276 sortid="20260508/label list index broken",
2277 )
2278 continue
2279 ht = label_templates[int(m.group(1))]
2280 desc = ht.get("expansion", "").strip()
2281 if desc: 2281 ↛ 2284line 2281 didn't jump to line 2284 because the condition on line 2281 was always true
2282 paren = desc
2283 else:
2284 wxr.wtp.warning(
2285 f"Label template seems to have no text contents: {ht=}",
2286 sortid="20260508/label_templates",
2287 )
2288 continue
2289 if paren.startswith("see "):
2290 continue
2291 if paren.startswith("U+"): 2291 ↛ 2292line 2291 didn't jump to line 2292 because the condition on line 2291 was never true
2292 continue
2293 # In some rare cases, strip word that inflects from the form
2294 # description, e.g. "look through rose-tinted glasses"/English.
2295 # `([looks])`
2296 paren = re.sub(r"\s*\(\[[^])]*\]\)", "", paren)
2298 # If it starts with hiragana or katakana, treat as such form. Note
2299 # that each hiragana/katakana character is in separate parentheses,
2300 # so we must concatenate them.
2301 try:
2302 un = unicodedata.name(paren[0]).split()[0]
2303 except ValueError:
2304 un = "INVALID"
2305 if un == "KATAKANA": 2305 ↛ 2306line 2305 didn't jump to line 2306 because the condition on line 2305 was never true
2306 katakana += paren
2307 have_ruby = True
2308 continue
2309 if un == "HIRAGANA": 2309 ↛ 2310line 2309 didn't jump to line 2310 because the condition on line 2309 was never true
2310 hiragana += paren
2311 have_ruby = True
2312 continue
2314 # Parse format ", 16 (Japan, Mainland), 17 (Hong Kong, Taiwan) strokes,"
2315 # in the middle of the parenthesized expression, e.g. 薄
2316 def strokes_repl(m: re.Match) -> str:
2317 strokes1, tags1, strokes2, tags2 = m.groups()
2318 for strokes, tags in [[strokes1, tags1], [strokes2, tags2]]:
2319 tags = tags.split(", ")
2320 tags = list(
2321 "Mainland China" if t == "Mainland" else t for t in tags
2322 )
2323 tags.append("strokes")
2324 add_related(
2325 wxr,
2326 data,
2327 tags,
2328 [strokes],
2329 text,
2330 True,
2331 is_reconstruction,
2332 head_group,
2333 ruby,
2334 links,
2335 link_dict,
2336 )
2337 return ", "
2339 if can_be_form is True:
2340 paren = re.sub(
2341 r", (\d+) \(([^()]+)\), (\d+) \(([^()]+)\) strokes, ",
2342 strokes_repl,
2343 paren,
2344 )
2346 descriptors = map_with(xlat_descs_map, [paren])
2347 new_desc = []
2348 for desc in descriptors:
2349 new_desc.extend(
2350 map_with(
2351 xlat_tags_map,
2352 split_at_comma_semi(
2353 desc, extra=[", or "], skipped=link_words_not_alnum
2354 ),
2355 )
2356 )
2357 prev_tags: Union[list[list[str]], list[tuple[str, ...]], None] = None
2358 following_tags = None # Added to prev_tags from previous parenthesized
2359 # part, e.g. walrus/English
2360 # "(both nonstandard, proscribed, uncommon)"
2361 for desc_i, desc in enumerate(new_desc):
2362 # print("HEAD DESC: {!r}".format(desc))
2364 # Abort on certain descriptors (assume remaining values are
2365 # examples or uninteresting, cf. gaan/Navajo, horior/Latin)
2366 if re.match(r"^(per |e\.g\.$)", desc): 2366 ↛ 2367line 2366 didn't jump to line 2367 because the condition on line 2366 was never true
2367 break
2369 # If it all consists of CJK characters, add it with the
2370 # CJK tag. This is used at least for some Vietnamese
2371 # words (e.g., ba/Vietnamese)
2372 try:
2373 if ( 2373 ↛ 2377line 2373 didn't jump to line 2377 because the condition on line 2373 was never true
2374 all(unicodedata.name(x).startswith("CJK ") for x in desc)
2375 and can_be_form
2376 ):
2377 add_related(
2378 wxr,
2379 data,
2380 ["CJK"],
2381 [desc],
2382 text,
2383 True,
2384 is_reconstruction,
2385 head_group,
2386 ruby,
2387 links,
2388 link_dict,
2389 )
2390 continue
2391 except ValueError:
2392 pass
2394 # Handle some special cases
2395 splitdesc = desc.split()
2396 if ( 2396 ↛ 2406line 2396 didn't jump to line 2406 because the condition on line 2396 was never true
2397 len(splitdesc) >= 3
2398 and splitdesc[1] == "superlative"
2399 and classify_desc(splitdesc[0]) != "tags"
2400 and prev_tags
2401 and can_be_form
2402 ):
2403 # Handle the special case of second comparative after comma,
2404 # followed by superlative without comma. E.g.
2405 # mal/Portuguese/Adv
2406 for ts in prev_tags:
2407 add_related(
2408 wxr,
2409 data,
2410 ts,
2411 [splitdesc[0]],
2412 text,
2413 True,
2414 is_reconstruction,
2415 head_group,
2416 ruby,
2417 links,
2418 link_dict,
2419 )
2420 desc = " ".join(splitdesc[1:])
2421 elif ( 2421 ↛ 2430line 2421 didn't jump to line 2430 because the condition on line 2421 was never true
2422 len(splitdesc) == 2
2423 and splitdesc[0] in ("also", "and")
2424 and prev_tags
2425 and classify_desc(splitdesc[1]) != "tags"
2426 and can_be_form
2427 ):
2428 # Sometimes alternative forms are prefixed with "also" or
2429 # "and"
2430 for ts in prev_tags:
2431 add_related(
2432 wxr,
2433 data,
2434 ts,
2435 [splitdesc[1]],
2436 text,
2437 True,
2438 is_reconstruction,
2439 head_group,
2440 ruby,
2441 links,
2442 link_dict,
2443 )
2444 continue
2445 elif len(splitdesc) >= 2 and splitdesc[0] in ("including",): 2445 ↛ 2446line 2445 didn't jump to line 2446 because the condition on line 2445 was never true
2446 continue
2448 # If only one word, assume it is comma-separated alternative
2449 # to the previous one
2450 if len(splitdesc) == 1:
2451 cls = classify_desc(desc)
2452 if cls != "tags" and can_be_form:
2453 if prev_tags: 2453 ↛ 2455line 2453 didn't jump to line 2455 because the condition on line 2453 was never true
2454 # Assume comma-separated alternative to previous one
2455 for ts in prev_tags:
2456 add_related(
2457 wxr,
2458 data,
2459 ts,
2460 [desc],
2461 text,
2462 True,
2463 is_reconstruction,
2464 head_group,
2465 ruby,
2466 links,
2467 link_dict,
2468 )
2469 continue
2470 elif distw(titleparts, desc) <= 0.5: 2470 ↛ 2473line 2470 didn't jump to line 2473 because the condition on line 2470 was never true
2471 # Similar to head word, assume a dialectal variation to
2472 # the base form. Cf. go/Alemannic German/Verb
2473 add_related(
2474 wxr,
2475 data,
2476 ["alternative"],
2477 [desc],
2478 text,
2479 True,
2480 is_reconstruction,
2481 head_group,
2482 ruby,
2483 links,
2484 link_dict,
2485 )
2486 continue
2487 elif (
2488 cls in ("romanization", "english")
2489 and not have_romanization
2490 and classify_desc(titleword) == "other"
2491 and not (
2492 "categories" in data and desc in data["categories"]
2493 )
2494 ):
2495 # Assume it to be a romanization
2496 add_romanization(
2497 wxr,
2498 data,
2499 desc,
2500 text,
2501 is_reconstruction,
2502 head_group,
2503 ruby,
2504 )
2505 have_romanization = True
2506 continue
2508 m = re.match(r"^(\d+) strokes?$", desc)
2509 if m and can_be_form:
2510 # Special case, used to give #strokes for Han characters
2511 add_related(
2512 wxr,
2513 data,
2514 ["strokes"],
2515 [m.group(1)],
2516 text,
2517 True,
2518 is_reconstruction,
2519 head_group,
2520 ruby,
2521 links,
2522 link_dict,
2523 )
2524 continue
2526 # See if it is radical+strokes
2527 m = re.match(
2528 r"^([\u2F00-\u2FDF\u2E80-\u2EFF\U00018800-\U00018AFF"
2529 r"\uA490-\uA4CF\u4E00-\u9FFF]\+\d+)"
2530 r"( in (Japanese|Chinese|traditional Chinese|"
2531 r"simplified Chinese))?$",
2532 desc,
2533 )
2534 if m and can_be_form: 2534 ↛ 2537line 2534 didn't jump to line 2537 because the condition on line 2534 was never true
2535 # Special case, used to give radical + strokes for Han
2536 # characters
2537 radical_strokes = m.group(1)
2538 lang = m.group(3)
2539 t = ["radical+strokes"]
2540 if lang:
2541 t.extend(lang.split())
2542 add_related(
2543 wxr,
2544 data,
2545 t,
2546 [radical_strokes],
2547 text,
2548 True,
2549 is_reconstruction,
2550 head_group,
2551 ruby,
2552 links,
2553 link_dict,
2554 )
2555 prev_tags = None
2556 following_tags = None
2557 continue
2559 # See if it indicates historical Katakana orthography (←) or
2560 # just otherwise katakana/hiragana form
2561 m = re.match(r"←\s*|kana\s+", desc)
2562 if m: 2562 ↛ 2563line 2562 didn't jump to line 2563 because the condition on line 2562 was never true
2563 if desc.startswith("←"):
2564 t1 = "historical "
2565 else:
2566 t1 = ""
2567 x = desc[m.end() :]
2568 if x.endswith("?"):
2569 x = x[:-1]
2570 # XXX should we add a tag indicating uncertainty?
2571 if x:
2572 name = unicodedata.name(x[0])
2573 if name.startswith("HIRAGANA "):
2574 desc = t1 + "hiragana " + x
2575 elif name.startswith("KATAKANA "):
2576 desc = t1 + "katakana " + x
2578 # See if it is "n strokes in Chinese" or similar
2579 m = re.match(
2580 r"(\d+) strokes in (Chinese|Japanese|"
2581 r"traditional Chinese|simplified Chinese)$",
2582 desc,
2583 )
2584 if m and can_be_form: 2584 ↛ 2586line 2584 didn't jump to line 2586 because the condition on line 2584 was never true
2585 # Special case, used to give just strokes for some Han chars
2586 strokes = m.group(1)
2587 lang = m.group(2)
2588 t = ["strokes"]
2589 t.extend(lang.split())
2590 add_related(
2591 wxr,
2592 data,
2593 t,
2594 [strokes],
2595 text,
2596 True,
2597 is_reconstruction,
2598 head_group,
2599 ruby,
2600 links,
2601 link_dict,
2602 )
2603 prev_tags = None
2604 following_tags = None
2605 continue
2607 # American Sign Language has images (or requests for image)
2608 # as heads, + this ASL gloss after.
2609 m2 = re.search(r"\(ASL gloss:\s+(.*)\)", text)
2610 if m2 and can_be_form: 2610 ↛ 2611line 2610 didn't jump to line 2611 because the condition on line 2610 was never true
2611 add_related(
2612 wxr,
2613 data,
2614 ["ASL-gloss"],
2615 [m2.group(1)],
2616 text,
2617 True,
2618 is_reconstruction,
2619 head_group,
2620 ruby,
2621 links,
2622 link_dict,
2623 )
2624 continue
2626 parts = list(m.group(0) for m in re.finditer(word_re, desc))
2627 if not parts: 2627 ↛ 2628line 2627 didn't jump to line 2628 because the condition on line 2627 was never true
2628 prev_tags = None
2629 following_tags = None
2630 continue
2632 # Check for certain language-specific header part starts that
2633 # modify
2634 if ( 2634 ↛ 2639line 2634 didn't jump to line 2639 because the condition on line 2634 was never true
2635 len(parts) == 2
2636 and language in lang_specific_head_map
2637 and can_be_form
2638 ):
2639 ht2 = lang_specific_head_map[language]
2640 if parts[0] in ht2:
2641 rem_tags, add_tags = ht2[parts[0]]
2642 new_prev_tags1: list[list[str]] = []
2643 tags2: Union[tuple[str, ...], list[str]]
2644 for tags2 in prev_tags or [()]:
2645 if rem_tags is True: # Remove all old tags
2646 tsets = set()
2647 else:
2648 tsets = set(tags2) - set(rem_tags.split())
2649 tsets = tsets | set(add_tags.split())
2650 tags = list(sorted(tsets))
2651 add_related(
2652 wxr,
2653 data,
2654 tags,
2655 [parts[1]],
2656 text,
2657 True,
2658 is_reconstruction,
2659 head_group,
2660 ruby,
2661 links,
2662 link_dict,
2663 )
2664 new_prev_tags1.append(tags)
2665 prev_tags = new_prev_tags1
2666 following_tags = None
2667 continue
2669 # Handle the special case of descriptors that are parenthesized,
2670 # e.g., (archaic or Scotland)
2671 m = re.match(r"\(([^)]+)\)\s+(.*)$", desc)
2672 if m is not None and classify_desc(m.group(1)) == "tags": 2672 ↛ 2673line 2672 didn't jump to line 2673 because the condition on line 2672 was never true
2673 tagpart = m.group(1)
2674 related = [m.group(2)]
2675 tagsets, topics = decode_tags(tagpart, no_unknown_starts=True)
2676 if topics:
2677 wxr.wtp.debug(
2678 "parenthized head part {!r} contains topics: {}".format(
2679 tagpart, topics
2680 ),
2681 sortid="form_descriptions/1647",
2682 )
2683 elif m is not None and re.match(r"in the sense ", m.group(1)): 2683 ↛ 2686line 2683 didn't jump to line 2686 because the condition on line 2683 was never true
2684 # Handle certain ignored cases
2685 # e.g. bord/Danish: in the sense "plank"
2686 related = [m.group(2)]
2687 tagsets = [()]
2688 else:
2689 # Normal parsing of the descriptor
2690 alt_related = None
2691 alt_tagsets = None
2692 tagsets = None
2693 for i in range(len(parts), 0, -1):
2694 related = parts[i:]
2695 tagparts = parts[:i]
2696 # print(" i={} related={} tagparts={}"
2697 # .format(i, related, tagparts))
2698 tagsets, topics = decode_tags(
2699 " ".join(tagparts), no_unknown_starts=True
2700 )
2701 # print("tagparts={!r} tagsets={} topics={} related={} "
2702 # "alt_related={} distw={:.2f}"
2703 # .format(tagparts, tagsets, topics, related,
2704 # alt_related,
2705 # distw(titleparts, parts[i - 1])))
2706 if (
2707 topics
2708 or not tagsets
2709 or any("error-unknown-tag" in x for x in tagsets)
2710 ):
2711 if alt_related is not None: 2711 ↛ 2713line 2711 didn't jump to line 2713 because the condition on line 2711 was never true
2712 # We already had a good division, so let's stop.
2713 break
2714 # Bad division, try deeper
2715 continue
2716 # print(f"{parts[i-1]=}, {parts=}")
2717 if (
2718 i > 1
2719 and len(parts[i - 1]) >= 4
2720 and (
2721 distw(titleparts, parts[i - 1]) <= 0.4
2722 or (
2723 wxr.wtp.section == "English"
2724 and wxr.wtp.title
2725 in WORDS_WITH_FALSE_POSITIVE_TAGS
2726 and parts[i - 1]
2727 in WORDS_WITH_FALSE_POSITIVE_TAGS[wxr.wtp.title]
2728 )
2729 )
2730 # Fixes 'unaccountability' wiktext #1196
2731 and not (
2732 wxr.wtp.section == "English"
2733 and wxr.wtp.title in WORDS_WITH_FALSE_POSITIVE_FORMS
2734 and parts[i - 1]
2735 in WORDS_WITH_FALSE_POSITIVE_FORMS[wxr.wtp.title]
2736 )
2737 # Fixes wiktextract #983, where "participle"
2738 # was too close to "Martinize" and so this accepted
2739 # ["participle", "Martinize"] as matching; this
2740 # kludge prevents this from happening if titleparts
2741 # is shorter than what would be 'related'.
2742 # This breaks if we want to detect stuff that
2743 # actually gets an extra space-separated word when
2744 # 'inflected'.
2745 and (
2746 len(titleparts) >= len(parts[i - 1 :])
2747 or "or" in parts[i - 1 :]
2748 )
2749 ):
2750 # print(f"Reached; {parts=}, {parts[i-1]=}")
2751 alt_related = related
2752 alt_tagsets = tagsets
2753 continue
2754 alt_related = None
2755 alt_tagsets = None
2756 break
2757 # for-else
2758 else:
2759 if alt_related is None and can_be_form:
2760 # Check if the parenthesized part is likely a
2761 # romanization
2762 if ( 2762 ↛ 2770line 2762 didn't jump to line 2770 because the condition on line 2762 was never true
2763 (have_ruby or classify_desc(base) == "other")
2764 and classify_desc(paren) == "romanization"
2765 and not (
2766 "categories" in data
2767 and desc in data["categories"]
2768 )
2769 ):
2770 for r in split_at_comma_semi(
2771 paren,
2772 extra=[" or "],
2773 skipped=link_words_not_alnum,
2774 ):
2775 add_romanization(
2776 wxr,
2777 data,
2778 r,
2779 text,
2780 is_reconstruction,
2781 head_group,
2782 ruby,
2783 )
2784 have_romanization = True
2785 continue
2786 tagsets = [("error-unrecognized-head-form",)]
2787 wxr.wtp.debug(
2788 "unrecognized head form: {}".format(desc),
2789 sortid="form_descriptions/1698",
2790 )
2791 continue
2793 if alt_related is not None: 2793 ↛ 2794line 2793 didn't jump to line 2794 because the condition on line 2793 was never true
2794 related = alt_related
2795 tagsets = alt_tagsets
2797 # print("FORM END: tagsets={} related={}".format(tagsets, related))
2798 # print("==================")
2800 if ( 2800 ↛ 2821line 2800 didn't jump to line 2821 because the condition on line 2800 was never true
2801 len(related) <= 0
2802 and wxr.wtp.section == "English"
2803 and tagsets is not None
2804 and len(tagsets) > 0
2805 and not any(
2806 s.startswith("error-") for tagset in tagsets for s in tagset
2807 )
2808 and any(
2809 s in FORM_ASSOCIATED_TAG_WORDS
2810 for tagset in tagsets
2811 for s in tagset
2812 )
2813 and (
2814 wxr.wtp.title not in FALSE_POSITIVE_MISSING_FORMS
2815 and not any(
2816 rel in FALSE_POSITIVE_MISSING_FORMS[wxr.wtp.title or ""]
2817 for rel in related
2818 )
2819 )
2820 ):
2821 wxr.wtp.debug(
2822 f"Form tags without form: {desc=}, {tagsets=}",
2823 sortid="form_description/20250107",
2824 )
2825 if not tagsets: 2825 ↛ 2826line 2825 didn't jump to line 2826 because the condition on line 2825 was never true
2826 continue
2828 # print(f"{alts=}, {related=}")
2830 assert isinstance(related, (list, tuple))
2831 related_str = " ".join(related)
2832 if "or" in titleparts:
2833 alts = [related_str]
2834 else:
2835 alts = split_at_comma_semi(
2836 related_str,
2837 separators=[r"\bor\b"],
2838 skipped=link_words_not_alnum,
2839 )
2840 # print(f"{related_str=}, {alts=}")
2841 if not alts:
2842 alts = [""]
2843 for related_str in alts:
2844 if related_str and can_be_form:
2845 if prev_tags and (
2846 all(
2847 all(
2848 t in ["nonstandard", "dialectal"]
2849 or valid_tags[t] == "dialect"
2850 for t in tags
2851 )
2852 for ts in tagsets
2853 )
2854 or (
2855 any("participle" in ts for ts in prev_tags)
2856 and all(
2857 "attributive" in ts
2858 or any(valid_tags[t] == "gender" for t in ts)
2859 for ts in tagsets
2860 )
2861 )
2862 ):
2863 # Merged with previous tags. Don't update previous
2864 # tags here; cf. burn/English/Verb
2865 for tags_l in tagsets:
2866 for ts in prev_tags:
2867 tags_l1 = sorted(set(tags_l) | set(ts))
2868 add_related(
2869 wxr,
2870 data,
2871 tags_l1,
2872 [related_str],
2873 text,
2874 True,
2875 is_reconstruction,
2876 head_group,
2877 ruby,
2878 links,
2879 link_dict,
2880 )
2881 else:
2882 # Not merged with previous tags
2883 for tags_l in tagsets:
2884 if following_tags is not None: 2884 ↛ 2885line 2884 didn't jump to line 2885 because the condition on line 2884 was never true
2885 for ts in following_tags:
2886 tags_l1 = list(
2887 sorted(set(tags_l) | set(ts))
2888 )
2889 add_related(
2890 wxr,
2891 data,
2892 tags_l1,
2893 [related_str],
2894 text,
2895 True,
2896 is_reconstruction,
2897 head_group,
2898 ruby,
2899 links,
2900 link_dict,
2901 )
2902 else:
2903 ret = add_related(
2904 wxr,
2905 data,
2906 tags_l,
2907 [related_str],
2908 text,
2909 True,
2910 is_reconstruction,
2911 head_group,
2912 ruby,
2913 links,
2914 link_dict,
2915 )
2916 if ret is not None: 2916 ↛ 2917line 2916 didn't jump to line 2917 because the condition on line 2916 was never true
2917 following_tags = ret
2918 prev_tags = tagsets
2919 else:
2920 if desc_i < len(new_desc) - 1 and all( 2920 ↛ 2927line 2920 didn't jump to line 2927 because the condition on line 2920 was never true
2921 "participle" in ts or "infinitive" in ts
2922 for ts in tagsets
2923 ):
2924 # Interpret it as a standalone form description
2925 # in the middle, probably followed by forms or
2926 # language-specific descriptors. cf. drikke/Danish
2927 new_prev_tags2 = []
2928 for ts1 in prev_tags or [()]:
2929 for ts2 in tagsets:
2930 ts = tuple(sorted(set(ts1) | set(ts2)))
2931 new_prev_tags2.append(ts)
2932 prev_tags = new_prev_tags2
2933 continue
2934 for tags in tagsets:
2935 data_extend(data, "tags", tags)
2936 prev_tags = tagsets
2937 following_tags = None
2939 # Finally, if we collected hiragana/katakana, add them now
2940 if hiragana: 2940 ↛ 2941line 2940 didn't jump to line 2941 because the condition on line 2940 was never true
2941 add_related(
2942 wxr,
2943 data,
2944 ["hiragana"],
2945 [hiragana],
2946 text,
2947 True,
2948 is_reconstruction,
2949 head_group,
2950 ruby,
2951 )
2952 if katakana: 2952 ↛ 2953line 2952 didn't jump to line 2953 because the condition on line 2952 was never true
2953 add_related(
2954 wxr,
2955 data,
2956 ["katakana"],
2957 [katakana],
2958 text,
2959 True,
2960 is_reconstruction,
2961 head_group,
2962 ruby,
2963 )
2965 # XXX check if this is actually relevant, tags in word root data
2966 # is extremely rare (not sure where they slip through).
2967 tags = data.get("tags", []) # type:ignore
2968 if len(tags) > 0:
2969 # wxr.wtp.debug(
2970 # f"Tags appear in word root data: {data['tags']=}", # type:ignore
2971 # sortid="form_descriptions/2620/20240606",
2972 # ) # Messes up tests.
2973 data["tags"] = sorted(set(tags)) # type:ignore
def parse_sense_qualifier(
    wxr: WiktextractContext, text: str, data: Union[SenseData, LinkageData]
) -> None:
    """Decode a qualifier string and record the result in ``data``.

    If ``text`` as a whole is one parenthesized group (without nested
    parentheses), the parentheses are dropped; the same is then done for
    one pair of enclosing double quotes.  The result is expanded with
    ``map_with(xlat_descs_map, ...)`` and every expansion is split with
    ``split_at_comma_semi()``.  Each non-empty segment is classified with
    ``classify_desc()`` using only the part before its first colon:

    * ``"tags"``: decoded with ``decode_tags()``; topics are added under
      ``"topics"`` and the tags are collected for ``"tags"``.
    * ``"taxonomic"``: stored under ``"taxonomic"``; a leading ``×``
      followed by an uppercase ASCII letter is removed and the tag
      ``extinct`` is collected instead.
    * ``"english"``: the whole segment (including anything after a colon)
      is stored under ``"qualifier"``, appended with ``"; "`` when a
      different qualifier is already present.
    * anything else: reported through ``wxr.wtp.debug()``.

    The collected tags are de-duplicated and sorted before being added to
    ``data`` in a single ``data_extend()`` call.  Nothing is returned.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(text, str)
    assert isinstance(data, dict)
    # print("parse_sense_qualifier:", text)

    # Peel off the wrappers in order: parentheses first, then double quotes
    # (the quote check sees the text with the parentheses already removed).
    for wrapper_re in (r"\([^()]+\)$", r'"[^"]+"$'):
        if re.match(wrapper_re, text):
            text = text[1:-1]

    collected_tags: list[str] = []
    for expanded in map_with(xlat_descs_map, [text]):
        for segment in split_at_comma_semi(expanded):
            if not segment:
                continue
            # Classification only looks at what precedes the first colon;
            # the untouched segment is kept for the "qualifier" value.
            label = segment.partition(":")[0]
            kind = classify_desc(label, allow_unknown_tags=True)
            # print("parse_sense_qualifier: classify_desc: {} -> {}"
            #       .format(label, kind))

            if kind == "tags":
                tagsets, topics = decode_tags(label)
                data_extend(data, "topics", topics)
                # XXX distinct options (e.g. "singular and plural
                # genitive") are flattened into one tag list here; handling
                # them properly would need a different calling convention
                # (splitting the sense when more than one tag category
                # differs).
                for tagset in tagsets:
                    collected_tags.extend(tagset)
                continue

            if kind == "taxonomic":
                if re.match(r"×[A-Z]", label):
                    collected_tags.append("extinct")
                    label = label[1:]
                data["taxonomic"] = label
                continue

            if kind == "english":
                if "qualifier" not in data or data["qualifier"] == segment:
                    data["qualifier"] = segment
                else:
                    data["qualifier"] += "; " + segment
                continue

            # Note: the message shows the whole expanded description, not
            # just the offending segment.
            wxr.wtp.debug(
                "unrecognized sense qualifier: {}".format(expanded),
                sortid="form_descriptions/1831",
            )

    data_extend(data, "tags", sorted(set(collected_tags)))
def parse_pronunciation_tags(
    wxr: WiktextractContext, text: str, data: SoundData
) -> None:
    """Record the tags or note described by ``text`` in ``data``.

    ``text`` is stripped of surrounding whitespace; nothing happens when it
    is then empty.  When ``classify_desc()`` does not classify it as
    ``"tags"``, the stripped text itself becomes ``data["note"]``.
    Otherwise it is decoded with ``decode_tags()``: topics are added under
    ``"topics"``, tags without a space are appended under ``"tags"``, and
    tags containing a space are joined with ``"; "`` into ``data["note"]``.
    Nothing is returned.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(text, str)
    assert isinstance(data, dict)

    description = text.strip()
    if not description:
        return

    if classify_desc(description) != "tags":
        # Not decodable as tags: keep the description verbatim as the note.
        data["note"] = description
        return

    tagsets, topics = decode_tags(description)
    data_extend(data, "topics", topics)

    multiword: list[str] = []
    for tag in (t for tagset in tagsets for t in tagset):
        if " " in tag:
            # Tags containing a space are kept as note text instead.
            multiword.append(tag)
        else:
            data_append(data, "tags", tag)

    if multiword:
        data["note"] = "; ".join(multiword)
def parse_translation_desc(
    wxr: WiktextractContext, lang: str, text: str, tr: TranslationData
) -> None:
    """Parses the text of a single translation item and fills in ``tr``.

    Parenthesized parts are repeatedly peeled off ``text`` (from the end,
    then the beginning, then the middle) and classified with
    classify_desc().  Depending on the classification they end up in
    "sense", "alt", "roman", "tags", "topics", "english"/"translation",
    "taxonomic" or "note"; romanization-looking parts next to Latin-script
    text are put back around the word.  What remains of ``text`` is stored
    (stripped) in "word" unless it is empty or listed in
    ``ignored_translations``.

    ``lang`` is the language of ``text``.  The function modifies ``tr`` in
    place and returns nothing."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(lang, str)  # The language of ``text``
    assert isinstance(text, str)
    assert isinstance(tr, dict)
    # print("parse_translation_desc:", text)

    # Process all parenthesized parts from the translation item
    note = None
    restore_beginning = ""
    restore_end = ""
    while True:
        beginning = False
        # See if we can find a parenthesized expression at the end
        m = re.search(r"\s*\((([^()]|\([^()]+\))+)\)\.?$", text)
        if m:
            par = m.group(1)
            text = text[: m.start()]
            if par.startswith(("literally ", "lit.")):
                continue  # Not useful for disambiguation in many idioms
        else:
            # See if we can find a parenthesized expression at the start
            m = re.match(r"^\^?\((([^()]|\([^()]+\))+)\):?(\s+|$)", text)
            if m:
                par = m.group(1)
                text = text[m.end() :]
                beginning = True
                if re.match(r"^(\d|\s|,| or | and )+$", par):
                    # Looks like this beginning parenthesized expression only
                    # contains digits or their combinations.  We assume such
                    # to be sense descriptions if no sense has been selected,
                    # or otherwise just ignore them.
                    if not tr.get("sense"):
                        tr["sense"] = par
                    continue
            else:
                # See if we can find a parenthesized expression in the middle.
                # Romanizations are sometimes between word and gender marker,
                # e.g. wife/English/Tr/Yiddish.
                m = re.search(r"\s+\((([^()]|\([^()]+\))+)\)", text)
                if m:
                    par = m.group(1)
                    text = text[: m.start()] + text[m.end() :]
                else:
                    # No more parenthesized expressions - break out of the loop
                    break

        # Some cleanup of artifacts that may result from skipping some templates
        # in earlier stages
        if par.startswith(": "):
            par = par[2:]
        if par.endswith(","):
            par = par[:-1]
        if re.match(r'^[“"]([^“”"]*)[“”"]$', par):
            par = par[1:-1]
        par = par.strip()

        # Check for special script pronunciation followed by romanization,
        # used in many Asian languages.
        lst = par.split(", ")
        if len(lst) == 2:
            a, r = lst
            if classify_desc(a) == "other":
                cls = classify_desc(r)
                # print("parse_translation_desc: r={} cls={}".format(r, cls))
                if cls == "romanization" or (
                    cls == "english" and len(r.split()) == 1 and r[0].islower()
                ):
                    if tr.get("alt") and tr.get("alt") != a:
                        wxr.wtp.debug(
                            'more than one value in "alt": {} vs. {}'.format(
                                tr["alt"], a
                            ),
                            sortid="form_descriptions/1930",
                        )
                    tr["alt"] = a
                    if tr.get("roman") and tr.get("roman") != r:
                        wxr.wtp.debug(
                            'more than one value in "roman": {} vs. {}'.format(
                                tr["roman"], r
                            ),
                            sortid="form_descriptions/1936",
                        )
                    tr["roman"] = r
                    continue

        # Check for certain comma-separated tags combined with English text
        # at the beginning or end of a comma-separated parenthesized list
        while len(lst) > 1:
            cls = classify_desc(lst[0])
            if cls == "tags":
                tagsets, topics = decode_tags(lst[0])
                for t in tagsets:
                    data_extend(tr, "tags", t)
                data_extend(tr, "topics", topics)
                lst = lst[1:]
                continue
            cls = classify_desc(lst[-1])
            if cls == "tags":
                tagsets, topics = decode_tags(lst[-1])
                for t in tagsets:
                    data_extend(tr, "tags", t)
                data_extend(tr, "topics", topics)
                lst = lst[:-1]
                continue
            break
        par = ", ".join(lst)

        if not par:
            continue
        if re.search(tr_ignored_parens_re, par):
            continue
        if par.startswith("numeral:"):
            par = par[8:].strip()

        # Classify the part in parenthesis and process accordingly
        cls = classify_desc(par)
        # print("parse_translation_desc classify: {!r} -> {}"
        #       .format(par, cls))
        if par == "f":
            data_append(tr, "tags", "feminine")
        elif par == "m":
            data_append(tr, "tags", "masculine")
        elif cls == "tags":
            tagsets, topics = decode_tags(par)
            for tags in tagsets:
                data_extend(tr, "tags", tags)
            data_extend(tr, "topics", topics)
        elif cls == "english":
            # If the text contains any of certain grammatical words, treat it
            # as a "note" instead of "english"
            if re.search(tr_note_re, par):
                if par.endswith(":"):
                    par = par[:-1]
                if par not in ("see entry for forms",):
                    if note:
                        note = note + ";" + par
                    else:
                        note = par
            else:
                # There can be more than one parenthesized english item, see
                # e.g. Aunt/English/Translations/Tamil
                if "translation" in tr and "english" in tr:
                    tr["english"] += "; " + par  # DEPRECATED for "translation"
                    tr["translation"] += "; " + par
                else:
                    tr["english"] = par  # DEPRECATED for "translation"
                    tr["translation"] = par
        elif cls == "romanization":
            # print("roman text={!r} text cls={}"
            #       .format(text, classify_desc(text)))
            if classify_desc(text) in (
                "english",
                "romanization",
            ) and lang not in ("Egyptian",):
                # The remaining text is already in Latin script, so this is
                # more likely a part of the word than its romanization; put
                # it back on the side it was taken from.
                if beginning:
                    restore_beginning += "({}) ".format(par)
                else:
                    restore_end = " ({})".format(par) + restore_end
            else:
                if tr.get("roman"):
                    wxr.wtp.debug(
                        'more than one value in "roman": {} vs. {}'.format(
                            tr["roman"], par
                        ),
                        sortid="form_descriptions/2013",
                    )
                tr["roman"] = par
        elif cls == "taxonomic":
            if tr.get("taxonomic"):
                wxr.wtp.debug(
                    'more than one value in "taxonomic": {} vs. {}'.format(
                        tr["taxonomic"], par
                    ),
                    sortid="form_descriptions/2019",
                )
            if re.match(r"×[A-Z]", par):
                data_append(tr, "tags", "extinct")
                par = par[1:]
            tr["taxonomic"] = par
        elif cls == "other":
            if tr.get("alt"):
                wxr.wtp.debug(
                    'more than one value in "alt": {} vs. {}'.format(
                        tr["alt"], par
                    ),
                    sortid="form_descriptions/2028",
                )
            tr["alt"] = par
        else:
            wxr.wtp.debug(
                "parse_translation_desc unimplemented cls {}: {}".format(
                    cls, par
                ),
                sortid="form_descriptions/2033",
            )

    # Check for gender indications in suffix
    text, final_tags = parse_head_final_tags(wxr, lang, text)
    data_extend(tr, "tags", final_tags)

    # Restore those parts that we did not want to remove (they are often
    # optional words or words that are always used with the given translation)
    text = restore_beginning + text + restore_end

    if note:
        tr["note"] = note.strip()
    # Strip before testing so that surrounding whitespace can neither defeat
    # the ignored_translations lookup nor produce an empty "word".
    word = text.strip()
    if word and word not in ignored_translations:
        tr["word"] = word

    # Sometimes gender seems to be at the end of "roman" field, see e.g.
    # fire/English/Noun/Translations/Egyptian (for "oxidation reaction")
    roman = tr.get("roman")
    if roman:
        if roman.endswith(" f"):
            data_append(tr, "tags", "feminine")
            tr["roman"] = roman[:-2].strip()
        elif roman.endswith(" m"):
            data_append(tr, "tags", "masculine")
            tr["roman"] = roman[:-2].strip()

    # If the word now has "translation" field but no "roman" field, and
    # the word would be classified "other" (generally non-latin
    # characters), and the value in "translation" is only one lowercase
    # word, move it to "roman".  This happens semi-frequently when the
    # translation is transliterated the same as some English word.
    roman = tr.get("roman")
    english = tr.get("translation")
    if english and not roman and "word" in tr:
        cls = classify_desc(tr["word"])
        if cls == "other" and " " not in english and english[0].islower():
            del tr["translation"]
            if "english" in tr:  # DEPRECATED for "translation"
                del tr["english"]
            tr["roman"] = english

    # If the entry now has both tr["roman"] and tr["word"] and they have
    # the same value, delete tr["roman"] (e.g., man/English/Translations
    # Evenki)
    if tr.get("word") and tr.get("roman") == tr.get("word"):
        del tr["roman"]
def parse_alt_or_inflection_of(
    wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
    """Attempts to interpret ``gloss`` as an inflection-of or alt-of
    description.

    On success the result is (tags, list of alt-of/inflection-of dicts).  The
    result may also be (tags, None) when tags were extracted from the gloss
    but it names no alt-of/form-of/synonym-of word.  None is returned when
    the gloss cannot be parsed as such a description."""
    # print("parse_alt_or_inflection_of: {!r}".format(gloss))

    # A gloss that is (nearly) the page title itself must never be read as
    # a tag (e.g., instrumental/Romanian, instrumental/Spanish).
    title_lc = wxr.wtp.title.lower()  # type:ignore[union-attr]
    gloss_lc = gloss.lower()
    if gloss_lc == title_lc:
        return None
    if len(gloss) >= 5 and distw([gloss_lc], title_lc) < 0.2:
        return None

    # Try the gloss as written first; inflection_of/alt_of glosses
    # occasionally start with a capitalized word ("A(n) " etc.), so fall
    # back to a variant with the first character lowercased.
    candidates = [gloss]
    if gloss and gloss[0].isupper():
        candidates.append(gloss[0].lower() + gloss[1:])
    for candidate in candidates:
        parsed = parse_alt_or_inflection_of1(
            wxr, candidate, gloss_template_args
        )
        if parsed is not None:
            return parsed

    return None
# Tag sets containing any of these tags are rejected in
# alt-or-inflection-of parsing
alt_infl_disallowed: set[str] = {
    "error-unknown-tag",
    # "place" is not in inflected forms and causes problems e.g. house/
    # English
    "place",
}
def parse_alt_or_inflection_of1(
    wxr: WiktextractContext, gloss: str, gloss_template_args: set[str]
) -> Optional[tuple[list[str], Optional[list[AltOf]]]]:
    """Helper function for parse_alt_or_inflection_of.  This handles a single
    capitalization.

    The gloss is split at the last "of"/"for"/"by"/"as"/"letter"/"number"
    whose preceding text decodes as tags; what follows becomes the base
    word(s).  Returns None if nothing could be parsed, (tags, None) if tags
    were found but no base word remains, and otherwise (tags, entries) where
    each entry is a dict with "word" and, when extra text was found,
    "extra".  ``gloss_template_args`` holds strings that are accepted
    as-is as base words when deciding whether a trailing "." is part of the
    word."""
    if not gloss or not gloss.strip():
        return None

    # Prevent some common errors where we would parse something we shouldn't
    if re.search(r"(?i)form of address ", gloss):
        return None

    gloss = re.sub(r"only used in [^,]+, ", "", gloss)

    # First try all formats ending with "of" (or other known last words that
    # can end a form description)
    matches = list(re.finditer(r"\b(of|for|by|as|letter|number) ", gloss))
    m: Optional[re.Match]
    for m in reversed(matches):
        desc = gloss[: m.end()].strip()
        base = gloss[m.end() :].strip()
        tagsets, topics = decode_tags(desc, no_unknown_starts=True)
        if not topics and any(
            not (alt_infl_disallowed & set(ts)) for ts in tagsets
        ):
            # Successfully parsed, including "of" etc.
            tags: list[str] = []
            # If you have ("Western-Armenian", ..., "form-of") as your
            # tag set, it's most probable that it's something like
            # "Western Armenian form of խոսել (xosel)", which should
            # get "alt-of" instead of "form-of" (inflection).
            # խօսիլ/Armenian
            for ts_t in tagsets:
                if "form-of" in ts_t and any(
                    valid_tags.get(tk) == "dialect" for tk in ts_t
                ):
                    ts_s = (set(ts_t) - {"form-of"}) | {"alt-of"}
                else:
                    ts_s = set(ts_t)
                if not (alt_infl_disallowed & ts_s):
                    tags.extend(ts_s)
            if (
                "alt-of" in tags
                or "form-of" in tags
                or "synonym-of" in tags
                or "compound-of" in tags
            ):
                break
        if m.group(1) == "of":
            # Try parsing without the final "of".  This is commonly used in
            # various form-of expressions.
            desc = gloss[: m.start()]
            base = gloss[m.end() :]
            tagsets, topics = decode_tags(desc, no_unknown_starts=True)
            # print("ALT_OR_INFL: desc={!r} base={!r} tagsets={} topics={}"
            #       .format(desc, base, tagsets, topics))
            if not topics and any(
                not (alt_infl_disallowed & set(t)) for t in tagsets
            ):
                tags = []
                for t in tagsets:
                    if not (alt_infl_disallowed & set(t)):
                        tags.extend(t)
                # It must have at least one tag from form_of_tags
                if set(tags) & form_of_tags:
                    # Accept this as form-of
                    tags.append("form-of")
                    break
                if set(tags) & alt_of_tags:
                    # Accept this as alt-of
                    tags.append("alt-of")
                    break

    else:
        # Did not find a form description based on last word; see if the
        # whole description is tags
        tagsets, topics = decode_tags(gloss, no_unknown_starts=True)
        if not topics and any(
            not (alt_infl_disallowed & set(ts)) and form_of_tags & set(ts)
            for ts in tagsets
        ):
            tags = []
            for ts in tagsets:
                if not (alt_infl_disallowed & set(ts)) and form_of_tags & set(
                    ts
                ):
                    tags.extend(ts)
            base = ""
        else:
            return None

    # kludge for Spanish (again): 'x of [word] combined with [clitic]'
    m = re.search(r"combined with \w+$", base)
    if m:
        tagsets, topics = decode_tags(m.group(0), no_unknown_starts=True)
        if not topics:
            for ts in tagsets:
                tags.extend(ts)
            base = base[: m.start()]

    # It is fairly common for form_of glosses to end with something like
    # "ablative case" or "in instructive case".  Parse that ending.
    base = base.strip()
    lst = base.split()
    # print("parse_alt_or_inflection_of: lst={}".format(lst))
    if len(lst) >= 3 and lst[-1] in ("case", "case."):
        node = valid_sequences.children.get(lst[-2])
        if node and node.end:
            for s in node.tags:
                tags.extend(s.split(" "))
            lst = lst[:-2]
            if lst[-1] == "in" and len(lst) > 1:
                lst = lst[:-1]
            # The case description has been turned into tags, so it must
            # not stay in the base word.  (Previously only ``lst`` was
            # trimmed and the ending leaked into the extracted word.)
            base = " ".join(lst)

    # Eliminate empty and duplicate tags
    tags = sorted(set(t for t in tags if t))

    # Clean up some extra stuff from the linked word, separating the text
    # into ``base`` (the linked word) and ``extra`` (additional information,
    # such as English translation or clarifying word sense information).
    orig_base = base
    base = re.sub(alt_of_form_of_clean_re, "", orig_base)
    base = re.sub(r" [(⟨][^()]*[)⟩]", "", base)  # Remove all (...) groups
    extra = orig_base[len(base) :]
    # Strip leading separators; \uFF0C is the fullwidth comma
    extra = re.sub(r"^[- :;.,\uFF0C—]+", "", extra)
    if extra.endswith(".") and extra.count(".") == 1:
        extra = extra[:-1].strip()
    m = re.match(r"^\(([^()]*)\)$", extra)
    if m:
        extra = m.group(1)
    else:
        # These weird brackets are used in "slash mark"
        m = re.match(r"^⟨([^()]*)⟩$", extra)
        if m:
            extra = m.group(1)
    m = re.match(r'^[“"]([^"“”]*)["”]$', extra)
    if m:
        extra = m.group(1)
    # Note: base might still contain comma-separated values and values
    # separated by "and"
    base = base.strip()
    if base.endswith(",") and len(base) > 2:
        base = base[:-1].strip()
    # Drop trailing dots unless the dotted form is an existing page or was
    # given verbatim as a template argument
    while (
        base.endswith(".")
        and not wxr.wtp.page_exists(base)
        and base not in gloss_template_args
    ):
        base = base[:-1].strip()
    if base.endswith('(\u201cconjecture")'):
        base = base[:-14].strip()
        tags.append("conjecture")
        # Keep the tag list sorted and free of duplicates after the append
        tags = sorted(set(tags))
    while (
        base.endswith(".")
        and not wxr.wtp.page_exists(base)
        and base not in gloss_template_args
    ):
        base = base[:-1].strip()
    if (
        base.endswith(".")
        and base not in gloss_template_args
        and base[:-1] in gloss_template_args
    ):
        base = base[:-1]
    base = base.strip()
    if not base:
        return tags, None

    # Kludge: Spanish verb forms seem to have a dot added at the end.
    # Remove it; we know of no Spanish verbs ending with a dot.
    language = wxr.wtp.section
    pos = wxr.wtp.subsection
    # print("language={} pos={} base={}".format(language, pos, base))
    if (
        base.endswith(".")
        and len(base) > 1
        and base[-2].isalpha()
        and (language == "Spanish" and pos == "Verb")
    ):
        base = base[:-1]

    # Split base to alternatives when multiple alternatives provided
    parts = split_at_comma_semi(base, extra=[" / ", "/", r" \+ "])
    titleword = re.sub(r"^Reconstruction:[^/]*/", "", wxr.wtp.title or "")
    if (
        len(parts) <= 1
        or base.startswith("/")
        or base.endswith("/")
        or "/" in titleword
    ):
        parts = [base]
    # Split base to alternatives when of form "a or b" and "a" and "b" are
    # similar (generally spelling variants of the same word or similar words)
    if len(parts) == 1:
        pp = base.split()
        if len(pp) == 3 and pp[1] == "or" and distw([pp[0]], pp[2]) < 0.4:
            parts = [pp[0], pp[2]]

    # Create form-of/alt-of entries based on the extracted data
    dt_lst: list[AltOf] = []
    for p in parts:
        # Check for some suspicious base forms
        m = re.search(r"[.,] |[{}()]", p)
        if m and not wxr.wtp.page_exists(p):
            wxr.wtp.debug(
                "suspicious alt_of/form_of with {!r}: {}".format(m.group(0), p),
                sortid="form_descriptions/2278",
            )
        # Drop a leading "*" when it is directly followed by a letter
        if p.startswith("*") and len(p) >= 3 and p[1].isalpha():
            p = p[1:]
        dt: AltOf = {"word": p}
        if extra:
            dt["extra"] = extra
        dt_lst.append(dt)
    # print("alt_or_infl_of returning tags={} lst={} base={!r}"
    #      .format(tags, lst, base))
    return tags, dt_lst
def _classify_desc_token_is_english(
    token: str, accepted: Union[tuple[str, ...], frozenset[str]]
) -> bool:
    """Returns True if ``token`` counts as an English word for
    classify_desc().  ``accepted`` lists additional tokens to accept."""
    if token in not_english_words:
        return False
    if (
        token in english_words
        or token.lower() in english_words
        or token in known_firsts
        or token[0].isdigit()
        or token in accepted
    ):
        return True
    # Each rule is (suffix, minimum token length, replacement for suffix);
    # the token is accepted if swapping the suffix gives a known word.
    suffix_rules = (
        ("s", 4, ""),  # Plural
        ("ies", 5, "y"),  # E.g. lily - lilies
        ("ing", 5, ""),  # E.g. bring - bringing
        ("ing", 5, "e"),  # E.g., tone - toning
        ("ed", 5, ""),  # E.g. hang - hanged
        ("ed", 5, "e"),  # E.g. atone - atoned
        ("'s", 0, ""),
        ("s'", 0, ""),
        ("ise", 5, "ize"),
        ("ised", 6, "ized"),
        ("ising", 7, "izing"),
    )
    for suffix, min_len, replacement in suffix_rules:
        if (
            token.endswith(suffix)
            and len(token) >= min_len
            and token[: -len(suffix)] + replacement in english_words
        ):
            return True
    # Hyphen- or slash-separated compounds are accepted when every
    # non-empty part is a known word of more than two characters.
    if re.search(r"[-/]", token):
        return all(
            not part or (part in english_words and len(part) > 2)
            for part in re.split(r"[-/]", token)
        )
    return False


@functools.lru_cache(maxsize=65536)
def classify_desc(
    desc: str,
    allow_unknown_tags=False,
    no_unknown_starts=False,
    accepted: Union[tuple[str, ...], frozenset[str]] = tuple(),
) -> str:
    """Guesses what kind of text the given description is.  Returns one of
    "tags", "taxonomic", "english", "romanization", or "other"; empty and
    whitespace-only descriptions are "other".  If ``allow_unknown_tags`` is
    True, "tags" is returned even when every decoded tag contains a space.
    ``no_unknown_starts`` is passed on to decode_tags(), and ``accepted``
    lists extra tokens to count as English words."""
    assert isinstance(desc, str)
    desc = desc.strip()
    if not desc:
        return "other"

    normalized_desc = unicodedata.normalize("NFKD", desc)

    # Tags: some tagset decodes without errors
    tagsets, topics = decode_tags(desc, no_unknown_starts=no_unknown_starts)
    for tagset in tagsets:
        assert isinstance(tagset, (list, tuple, set))
        if "error-unknown-tag" in tagset:
            continue
        if topics or allow_unknown_tags or any(" " not in x for x in tagset):
            return "tags"

    # Taxonomic: a known species, or a short phrase starting with a known
    # first word and followed by lowercase, mostly non-English words
    if desc in known_species:
        return "taxonomic"
    taxon = re.sub(r"^×([A-Z])", r"\1", desc)
    taxon = re.sub(r"\s*×.*", "", taxon)
    words = taxon.split()
    if 1 < len(words) <= 5 and words[0] in known_firsts:
        have_non_english = 0 if words[0].lower() in english_words else 1
        for w in words[1:]:
            if w in ("A", "B", "C", "D", "E", "F", "I", "II", "III", "IV", "V"):
                continue
            if w[0].isupper():
                break
            if w not in english_words:
                have_non_english += 1
        else:
            # No uppercase words other than the allowed letters/numerals
            if have_non_english >= len(words) - 1 and have_non_english > 0:
                return "taxonomic"

    # English: only considered when all characters are printable ASCII
    # ([ -~] is the range from space to tilde) or the listed punctuation
    if re.match(r"[ -~―—“”…'‘’ʹ€]+$", normalized_desc) and len(desc) > 1:
        if desc in english_words and desc[0].isalpha():
            return "english"  # Handles ones containing whitespace
        fixed = re.sub(
            tokenizer_fixup_re, lambda m: tokenizer_fixup_map[m.group(0)], desc
        )
        tokens = tokenizer.tokenize(fixed)
        if not tokens:
            return "other"
        verdicts = [
            _classify_desc_token_is_english(tok, accepted) for tok in tokens
        ]
        num_tokens = len(verdicts)
        num_english = verdicts.count(True)
        rejected_words = [
            tok for tok, ok in zip(tokens, verdicts) if not ok
        ]
        has_real_word = any(
            ok and tok[0].isalpha() and len(tok) > 1
            for tok, ok in zip(tokens, verdicts)
        )
        if (
            has_real_word
            and not desc.startswith("-")
            and not desc.endswith("-")
            and re.search(r"\w+", desc)
        ):
            ratio = num_english / num_tokens
            if (
                num_english == num_tokens
                or (
                    num_english >= num_tokens - 1
                    and any(
                        ok and len(tok) > 3
                        for tok, ok in zip(tokens, verdicts)
                    )
                )
                or ratio >= 0.8
                or (
                    ratio >= 0.50
                    and all(
                        tok in potentially_english_words
                        for tok in rejected_words
                    )
                )
            ):
                return "english"

    # Some translations have apparent pronunciation descriptions in /.../
    # which we'll put in the romanization field (even though they probably are
    # not exactly romanizations).
    if desc.startswith("/") and desc.endswith("/"):
        return "romanization"

    # Romanization: every character must be of a kind that can occur in
    # romanizations.  These characters are always acceptable:
    always_ok = frozenset(
        (
            "-",
            ",",
            ":",
            "/",
            '"',
            "'",  # ' in Arabic, / in IPA-like parenthesized forms
            ".",  # e.g., "..." in translations
            ";",
            "!",
            "‘",
            "’",
            "“",
            "”",
            "?",
            "…",  # alternative to "..."
            "⁉",  # 見る/Japanese automatic transcriptions...
            "⁻",  # superscript -, used in some Cantonese roman, e.g. "we"
            "ʔ",
            "ʼ",
            "ʾ",
            "ʹ",  # ʹ e.g. in understand/English/verb Russian transl
        )
    )
    classes1 = []
    num_latin = 0
    num_greek = 0
    for ch in normalized_desc:
        if ch in always_ok:
            classes1.append("OK")
            continue
        cl = unicodedata.category(ch)
        if cl not in ("Ll", "Lu"):
            classes1.append(cl)
            continue
        # Cased letter: decide based on the script in its Unicode name
        try:
            name = unicodedata.name(ch)
        except ValueError:
            classes1.append("NO")  # Not acceptable in romanizations
            continue
        first = name.split()[0]
        if first == "LATIN":
            num_latin += 1
        elif first == "GREEK":
            num_greek += 1
        elif first == "COMBINING":  # Combining diacritic
            cl = "OK"
        elif re.match(non_latin_scripts_re, name):
            cl = "NO"  # Not acceptable in romanizations
        classes1.append(cl)
    # print("classify_desc: {!r} classes1: {}".format(desc, classes1))
    acceptable = ("Ll", "Lu", "Lt", "Lm", "Mn", "Mc", "Zs", "Nd", "OK")
    if all(x in acceptable for x in classes1):
        mostly_latin = num_latin >= num_greek + 2 or num_greek == 0
        only_ok = classes1.count("OK") >= len(classes1)
        only_digits = classes1.count("Nd") >= len(classes1)
        if mostly_latin and not only_ok and not only_digits:
            return "romanization"
    # Otherwise it is something else, such as hanji version of the word
    return "other"
def remove_text_in_parentheses(text: str) -> str:
    """Returns ``text`` with all parenthesized parts removed, including the
    parentheses themselves.  Nested parentheses are handled.  Text after an
    unclosed "(" is dropped; an unmatched ")" is removed but does not affect
    the text that follows it."""
    depth = 0
    kept: list[str] = []
    for c in text:
        if c == "(":
            depth += 1
        elif c == ")":
            # Never let the depth go negative: a stray ")" would otherwise
            # hide all following text and expose later parenthesized text.
            if depth > 0:
                depth -= 1
        elif depth == 0:
            kept.append(c)
    return "".join(kept)