Coverage for src/wiktextract/extractor/simple/tags.py: 89%
9 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 06:55 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 06:55 +0000
1from wiktextract.tags import uppercase_tags, valid_tags
3TAGS: dict[str, list[str]] = {
4 "no-gloss": ["no-gloss"],
5 "comparative": ["comparative"],
6 "Comparative": ["comparative"],
7 "determiner": ["determiner"],
8 "Negative": ["negative"],
9 "Past": ["past"],
10 "Past participle": ["past", "participle"],
11 "Past tense": ["past"],
12 "Plain form": ["canonical"],
13 "Plain present": ["present"],
14 "plural": ["plural"],
15 "Plural": ["plural"],
16 "Positive": ["positive"],
17 "Present": ["present"],
18 "Present participle": ["present", "participle"],
19 "Proper noun": ["proper-noun"],
20 "singular": ["singular"],
21 "superlative": ["superlative"],
22 "Superlative": ["superlative"],
23 "Third person singular": ["third-person", "singular"],
24 "Third-person singular": ["third-person", "singular"],
25 "stressed": ["stressed"],
26 "unstressed": ["unstressed"],
27 "UK": ["UK"],
28 "US": ["US"],
29 "United Kingdom": ["UK"],
30 "United States": ["US"],
31 "before a vowel": ["before-vowel"],
32 "before a consonant": ["before-consonant"],
33 "CA": ["Canada"],
34 "AU": ["Australia"],
35 "Australian": ["Australia"],
36 "California": ["California"],
37 "Canadian": ["Canada"],
38 "CA synth": [""],
39 "GB": ["UK"],
40 "India": ["India"],
41 "Indian English": ["Indian-English"],
42 "Kenya": ["Kenya"],
43 "Limbu": ["Limbu"],
44 "Massachusetts": ["Massachusetts"],
45 "Mid-Atlantic": ["Mid-Atlantic"],
46 "New York accent": ["New-York"],
47 "Northen England": ["Northern-England"],
48 "NZ": ["New-Zealand"],
49 "Rhode Island": ["Rhode-Island"],
50 "Southern England": ["Southern-England"],
51 "uk": ["UK"],
52 "Uk": ["UK"],
53 "UK male": ["UK"],
54 "US female": ["US"],
55 "US Inland North": ["Inland-Northern-American"],
56 "US-Inland North": ["Inland-Northern-American"],
57 "American": ["US"],
58 "Audio US": ["US"],
59}
62# Check validity.
63# valid_tags is imported from the lower-level wiktextract.tags module. It was
64# originally created for the English extractor but also applies to other
65# extractors: these are the tags that should be used for tagging. New tags can
66# be added when needed, but often an equivalent one exists under another name.
67for tags in TAGS.values():
68 for tag in tags:
69 if tag.islower() and tag.isalpha() and tag not in valid_tags: 69 ↛ 70line 69 didn't jump to line 70 because the condition on line 69 was never true
70 assert False, f"Invalid tag in simple_tag_map: {tag}"
72# uppercase_tags are tags with uppercase names, used for things such as
73# locations, dialects, and language names.
74for k in uppercase_tags:
75 if k not in TAGS:
76 TAGS[k] = [k.replace(" ", "-")]