Coverage for src/wiktextract/extractor/simple/tags.py: 89%

9 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 06:55 +0000

1from wiktextract.tags import uppercase_tags, valid_tags 

2 

# Lookup table from a label string to the list of tag strings it stands for.
# Several keys differ only in capitalisation or punctuation and deliberately
# resolve to the same tag list; one label may expand to more than one tag.
TAGS: dict[str, list[str]] = {
    "no-gloss": ["no-gloss"],
    "comparative": ["comparative"],
    "Comparative": ["comparative"],
    "determiner": ["determiner"],
    "Negative": ["negative"],
    "Past": ["past"],
    "Past participle": ["past", "participle"],
    "Past tense": ["past"],
    "Plain form": ["canonical"],
    "Plain present": ["present"],
    "plural": ["plural"],
    "Plural": ["plural"],
    "Positive": ["positive"],
    "Present": ["present"],
    "Present participle": ["present", "participle"],
    "Proper noun": ["proper-noun"],
    "singular": ["singular"],
    "superlative": ["superlative"],
    "Superlative": ["superlative"],
    "Third person singular": ["third-person", "singular"],
    "Third-person singular": ["third-person", "singular"],
    "stressed": ["stressed"],
    "unstressed": ["unstressed"],
    "UK": ["UK"],
    "US": ["US"],
    "United Kingdom": ["UK"],
    "United States": ["US"],
    "before a vowel": ["before-vowel"],
    "before a consonant": ["before-consonant"],
    "CA": ["Canada"],
    "AU": ["Australia"],
    "Australian": ["Australia"],
    "California": ["California"],
    "Canadian": ["Canada"],
    # NOTE(review): maps to a single empty-string tag rather than an empty
    # list -- presumably intentional, confirm against the code consuming TAGS.
    "CA synth": [""],
    "GB": ["UK"],
    "India": ["India"],
    "Indian English": ["Indian-English"],
    "Kenya": ["Kenya"],
    "Limbu": ["Limbu"],
    "Massachusetts": ["Massachusetts"],
    "Mid-Atlantic": ["Mid-Atlantic"],
    "New York accent": ["New-York"],
    # NOTE(review): key spelling ("Northen") kept exactly as found; presumably
    # it mirrors the label as it appears in the input data -- verify.
    "Northen England": ["Northern-England"],
    "NZ": ["New-Zealand"],
    "Rhode Island": ["Rhode-Island"],
    "Southern England": ["Southern-England"],
    "uk": ["UK"],
    "Uk": ["UK"],
    "UK male": ["UK"],
    "US female": ["US"],
    "US Inland North": ["Inland-Northern-American"],
    "US-Inland North": ["Inland-Northern-American"],
    "American": ["US"],
    "Audio US": ["US"],
}

60 

61 

# Check validity
# valid_tags is from the lower level, originally created for the English
# extractor but also applicable to other extractors: these are the tags
# that should be used for tagging. Can be added to when needed, but
# often there's already an equivalent tag with a slightly different name.
#
# Only tags that are entirely lowercase letters are checked here: anything
# containing a hyphen, an uppercase letter, a digit, or nothing at all (the
# empty string) fails `islower()`/`isalpha()` and is therefore skipped.
for tags in TAGS.values():
    for tag in tags:
        if tag.islower() and tag.isalpha() and tag not in valid_tags:
            # Raise explicitly instead of `assert False`: assert statements
            # are removed when Python runs with -O, which would silently
            # disable this import-time check. The exception type and the
            # message are unchanged.
            raise AssertionError(f"Invalid tag in simple_tag_map: {tag}")

71 

# uppercase_tags holds specific tags with uppercase names, used for things
# such as locations, dialects and language names. Each of them that is not
# already a key of TAGS is registered as its own label, with spaces in the
# resulting tag replaced by hyphens; existing entries are left untouched.
for k in uppercase_tags:
    TAGS.setdefault(k, [k.replace(" ", "-")])

76 TAGS[k] = [k.replace(" ", "-")]