Coverage for src/wiktextract/extractor/simple/models.py: 100%

62 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from pydantic import BaseModel, ConfigDict, Field 

2 

3# Pydantic models are basically classes that take the place of the dicts 

4# used in the main English extractor. They use more resources, but also do 

5# a lot of validation work and are easier for the type-checker. 

6 

7# Pydantic config stuff. 

class SimpleEnglishBaseModel(BaseModel):
    # Shared pydantic configuration for the models in this module.
    # Pydantic is used here mainly for its automatic validation.
    model_config = ConfigDict(
        # Reject any field that is not declared on the model.
        extra="forbid",
        # Strict mode: values are not coerced into the annotated type.
        strict=True,
        # Validate again when an attribute is assigned after
        # initialization; without this, assignments are not checked.
        validate_assignment=True,
        # Run validation on the declared default values as well.
        validate_default=True,
    )

18 

19# Not an example, this is for example entries next to glosses. 

class Example(SimpleEnglishBaseModel):
    # A usage example attached to a gloss. Every field defaults to "".
    text: str = Field(
        default="",
        description="Example usage sentence",
    )
    author: str = Field(
        default="",
        description="Author's name",
    )
    title: str = Field(
        default="",
        description="Title of the reference",
    )
    # Simple English Wiktionary example templates are simple and do not
    # seem to carry the data below, so these fields are left disabled.
    # date: str = Field(default="", description="Original date")
    # date_published: str = Field(default="", description="Date of publication")
    # collection: str = Field(
    #     default="",
    #     description="Name of the collection the example was taken from",
    # )
    # editor: str = Field(default="", description="Editor")
    # translator: str = Field(default="", description="Translator")
    # source: str = Field(
    #     default="",
    #     description="Source of reference",
    # )

38 

39# General class for "link to another related word", like synonym, antonym, etc. 

40# Instead of having classes for each, we have different fields of list[Linkage], 

41# like `synonyms: list[Linkage] = []`. 

class Linkage(SimpleEnglishBaseModel):
    # A single linked word. `word` has no default, so it must be supplied.
    word: str
    # Fields that are currently disabled:
    # sense_index: str = ""
    # note: str = ""
    # raw_tags: list[str] = []
    # tags: list[str] = []

48 

49# Basically a line or lines of gloss, a meaning of a word. These are collected 

50# under the POS as a list. 

class Sense(SimpleEnglishBaseModel):
    # One meaning of a word; every field defaults to an empty list.

    # Gloss lines, e.g. ["Gloss supercategory", "Specific gloss."]
    glosses: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    # topics: list[str] = []  # XXX do these.
    # Wikipedia category link data; not printed.
    categories: list[str] = []
    examples: list[Example] = []
    synonyms: list[Linkage] = []
    antonyms: list[Linkage] = []
    # ruby: list[tuple[str, ...]] = []

61 

62# An inflected form of the word, like `{ form: "bats", tags: ["plural"] }` 

class Form(SimpleEnglishBaseModel):
    # An inflected form together with the tags that describe it.
    form: str = ""
    tags: list[str] = []
    raw_tags: list[str] = []
    # Currently disabled:
    # sense_index: str = ""

68 

69 

70# A pronunciation or audio file. If you have a string of IPA or SAMPA or 

71# something else, that is extracted as its own Sound entry. 

class Sound(SimpleEnglishBaseModel):
    # One pronunciation or audio file entry.

    # Phonetic transcriptions.
    ipa: str = Field(
        default="",
        description="International Phonetic Alphabet",
    )
    enpr: str = Field(
        default="",
        description="American Heritage Dictionary",
    )
    sampa: str = Field(
        default="",
        description="Speech Assessment Methods Phonetic Alphabet",
    )

    # Audio file name and the per-format URL fields.
    audio: str = Field(
        default="",
        description="Audio file name",
    )
    wav_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""
    oga_url: str = ""
    flac_url: str = ""

    # Unlike WordEntry, the language fields here default to English.
    lang_code: str = Field(
        default="en",
        description="Wiktionary language code",
    )
    lang: str = Field(
        default="English",
        description="Localized language name",
    )

    raw_tags: list[str] = []
    tags: list[str] = []
    rhymes: list[str] = []
    homophones: list[str] = []
    # text: str = ""  # Use raw_tags instead
    # "Temporary" field used to sort out different sound data between
    # POSes when they are originally found in one combined pronunciation
    # section.
    poses: list[str] = []

94 

95# Sometimes we collect raw template arguments separately, like in the main 

96# line English extractor where we keep data from etymology templates. 

class TemplateData(SimpleEnglishBaseModel):
    # Raw data of one template call: its name, arguments and expansion.
    name: str = Field(
        default="",
        description="Template's name.",
    )
    args: dict[str, str] = Field(
        default={},
        description="Arguments given to the template, if any.",
    )
    expansion: str = Field(
        default="",
        description="The result of expanding the template.",
    )

106 

107# The highest level entry: This is returned from the program as a JSON object 

108# in the JSONL output. 

class WordEntry(SimpleEnglishBaseModel):
    # Top-level entry, emitted as one JSON object per line of the JSONL
    # output. Only `word` is required; every other field has a default.

    # Sets the model title; the remaining settings are inherited from
    # SimpleEnglishBaseModel.
    model_config = ConfigDict(title="Simple English Wiktionary")

    word: str = Field(description="Word string")
    forms: list[Form] = Field(
        default=[],
        description="Inflection forms list",
    )
    # For Simple English, the language is always English. "en" is still
    # not used as the default here, because all default values are
    # removed so that the output has no empty or meaningless fields.
    lang_code: str = Field(
        default="",
        description="Wiktionary language code",
    )
    lang: str = Field(
        default="",
        description="Localized language name",
    )
    pos: str = Field(
        default="",
        description="Part of speech type",
    )
    # Section heading text, e.g. `==Noun==`.
    pos_title: str = ""
    # Heading number, e.g. `==Noun 2==`. The default -1 gets removed.
    pos_num: int = -1
    etymology_text: str = Field(
        default="",
        description="Etymology section as cleaned text.",
    )
    etymology_templates: list[TemplateData] = Field(
        default=[],
        description=(
            "Templates and their arguments and expansions "
            "from the etymology section."
        ),
    )
    # Simple Wiktionary doesn't have numbered etymology sections.
    senses: list[Sense] = []
    title: str = Field(
        default="",
        description="Redirect page source title",
    )
    redirect: str = Field(
        default="",
        description="Redirect page target title",
    )
    categories: list[str] = []
    sounds: list[Sound] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    # Should be a list `hyphenations`.
    hyphenation: str = ""
    head_templates: list[TemplateData] = []