Coverage for src/wiktextract/extractor/el/models.py: 92%

87 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1from pydantic import BaseModel, ConfigDict, Field 

2 

3# Pydantic models are basically classes that take the place of the dicts 

4# used in the main English extractor. They use more resources, but also do 

5# a lot of validation work and are easier for the type-checker. 

6 

7 

8# Search and replace Greek with `Language Name` 

9# Pydantic config stuff. 

class GreekBaseModel(BaseModel):
    # Common base for every model in this module; it only carries the shared
    # pydantic configuration.
    model_config = ConfigDict(
        # Pydantic is used here mostly for its automatic validation. Without
        # this flag, values assigned after initialization are not checked
        # at all.
        validate_assignment=True,
        validate_default=True,
        strict=True,
        extra="forbid",
    )

20 

21 

22# Examples and quotations in glosses 

class Example(GreekBaseModel):
    # A single example sentence or quotation attached to a gloss.
    text: str = Field("", description="Example usage sentence")
    # Kind of entry: example or quotation etc.
    type: str = Field(default="")
    translation: str = Field(
        "", description="Greek Translation of the example sentence"
    )

    # Disabled fields, kept commented out so they can be switched on when
    # needed (presumably inherited from the extractor template -- confirm
    # before enabling).
    # author: str = Field(default="", description="Author's name")
    # title: str = Field(default="", description="Title of the reference")
    # ref: str = Field(default="", description="Raw reference string")
    # url: str = Field(
    #     default="", description="A web link. Not necessarily well-formatted."
    # )
    # date: str = Field(default="", description="Original date")
    # date_published: str = Field(default="", description="Date of publication")
    # collection: str = Field(
    #     default="",
    #     description="Name of the collection the example was taken from",
    # )
    # pages: str = Field(default="", description="Page numbers")
    # year: str = Field(default="", description="Year of publication")
    # publisher: str = Field(default="", description="Published by")
    # editor: str = Field(default="", description="Editor")
    # translator: str = Field(default="", description="Translator")
    # source: str = Field(
    #     default="",
    #     description="Source of reference",
    # )
    # collection: str = Field(
    #     default="",
    #     description="Name of collection that reference was published in",
    # )
    # volume: str = Field(default="", description="Volume number")
    # comment: str = Field(default="", description="Comment on the reference")
    # accessdate: str = Field(
    #     default="", description="Date of access of online reference"
    # )
    # date: str = Field(default="", description="Date of publication")
    # number: str = Field(default="", description="Issue number")
    # # chapter: Optional[str] = Field(default=None, description="Chapter name")
    # place: str = Field(default="", description="Place of publication")
    # edition: str = Field(default="", description="Edition number")
    # isbn: str = Field(default="", description="ISBN number")
    # literal_meaning: str = ""

67 

68 

class Translation(GreekBaseModel):
    # One translation of the entry's word into another language.
    sense: str = Field("", description="A gloss of the sense being translated")
    word: str = Field("", description="Translation term")
    lang_code: str = Field(
        "", description="Wiktionary language code of the translation term"
    )
    lang: str = Field("", description="Localized language name")
    # uncertain: bool = Field(
    #     default=False, description="Translation marked as uncertain"
    # )
    roman: str = Field("", description="Transliteration to Roman characters")
    sense_index: str = Field(default="")
    # note: str = ""
    # literal_meaning: str = ""
    raw_tags: list[str] = Field(default=[])
    tags: list[str] = Field(default=[])
    # notes: list[str] = Field(default=[], description="A list of notes")

91 

92 

93# General class for "link to another related word", like synonym, antonym, etc. 

94# Instead of having classes for each, we have different fields of list[Linkage], 

95# like `synonyms: list[Linkage] = []`. 

class Linkage(GreekBaseModel):
    # A link to a related word. `word` is the only required field.
    word: str
    # translation: str
    # extra: str
    # roman: str
    # sense: str
    # sense_index: str = ""
    # note: str = ""
    raw_tags: list[str] = Field(default=[])
    tags: list[str] = Field(default=[])
    # topics: list[str] = []
    # urls: list[str]
    examples: list[str] = Field(default=[])

109 

110 

class FormOf(GreekBaseModel):
    # Target of a form-of relation (used by `Sense.form_of`); `word` is
    # required and currently the only active field.
    word: str
    # Disabled for now:
    # extra: str
    # roman: str

115 

116 

117# Basically a line or lines of gloss, a meaning of a word. These are collected 

118# under the POS as a list. 

class Sense(GreekBaseModel):
    # One meaning of a word: its gloss line(s) plus the tags, examples and
    # linkages attached to that meaning.
    glosses: list[str] = []  # ["Gloss supercategory", "Specific gloss."]
    tags: list[str] = []
    raw_tags: list[str] = []
    form_of: list[FormOf] = []
    # alt_of : list[FormOf] = []
    # compound_of: list[FormOf] = []
    # topics: list[str] = []
    categories: list[str] = []  # Wikipedia category link data; not printed.
    examples: list[Example] = []
    synonyms: list[Linkage] = []
    antonyms: list[Linkage] = []
    # holonyms: list[Linkage] = []
    # hypernyms: list[Linkage] = []
    # hyponyms: list[Linkage] = []
    # instances: list[Linkage] = []
    # meronyms: list[Linkage] = []
    related: list[Linkage] = []
    # links: list[list[str]] = []
    # coordinate_terms: list[Linkage] = []
    # ruby: list[tuple[str, ...]] = []
    # sense_index: str = Field(default="", description="Sense number used in "
    #                          "Wiktionary")
    # head_nr: int = -1
    # wikidata: list[str] = []
    # wikipedia: list[str] = []

    def merge(self, other: "Sense") -> None:
        """Combine the fields of this Sense with another Sense.

        `tags`, `raw_tags` and `categories` are concatenated and
        de-duplicated, keeping the first occurrence of each value so the
        resulting order is stable between runs. `examples`, `synonyms`,
        `antonyms` and `related` are extended in place with the items from
        `other` (no de-duplication). `other` is left unmodified.
        """
        # dict.fromkeys() de-duplicates while preserving insertion order.
        # list(set(...)) would order the strings by their randomized hashes,
        # which makes the generated output differ from run to run.
        self.tags = list(dict.fromkeys(self.tags + other.tags))
        self.raw_tags = list(dict.fromkeys(self.raw_tags + other.raw_tags))
        self.categories = list(
            dict.fromkeys(self.categories + other.categories)
        )
        self.examples.extend(other.examples)
        self.synonyms.extend(other.synonyms)
        self.antonyms.extend(other.antonyms)
        self.related.extend(other.related)
        # NOTE(review): `glosses` and `form_of` are not merged here; confirm
        # with the callers whether that is intentional.

155 

156 

157# An inflected form of the word, like `{ form: "bats", tags: ["plural"] }` 

class Form(GreekBaseModel):
    # A single inflected form together with its tags.
    form: str = Field(default="")
    tags: list[str] = Field(default=[])
    raw_tags: list[str] = Field(default=[])
    topics: list[str] = Field(default=[])
    # head_nr: int = -1
    ipa: str = Field(default="")
    # roman: str = ""
    # ruby: list[tuple[str, str]] = []
    source: str = Field(default="")
    # sense_index: str = ""

169 

170 

171# A pronunciation or audio file. If you have a string of IPA or SAMPA or 

172# something else, that is extracted as its own Sound entry. 

class Sound(GreekBaseModel):
    # One pronunciation item: an IPA string and/or an audio file name.
    ipa: str = Field("", description="International Phonetic Alphabet")
    # enpr: str = Field(default="", description="American Heritage Dictionary")
    # sampa: str = Field(
    #     default="", description="Speech Assessment Methods Phonetic Alphabet"
    # )
    audio: str = Field("", description="Audio file name")
    # wav_url: str = Field(default="")
    # ogg_url: str = Field(default="")
    # mp3_url: str = Field(default="")
    # oga_url: str = Field(default="")
    # flac_url: str = Field(default="")
    lang_code: str = Field("", description="Wiktionary language code")
    lang: str = Field("", description="Localized language name")
    raw_tags: list[str] = Field(default=[])
    tags: list[str] = Field(default=[])
    # rhymes: list[str] = []
    homophones: list[str] = Field(default=[])
    # text: str = ""  # Use raw_tags instead
    # "Temporary" field: when several POSes share one combined pronunciation
    # section, this is used to sort out which sound data belongs to which POS.
    poses: list[str] = Field(default=[])

195 

196 

197# Sometimes we collect raw template arguments separately, like in the main 

198# line English extractor where we keep data from etymology templates. 

class TemplateData(GreekBaseModel):
    # Raw record of one template call: its name, its arguments and the text
    # it expanded to.
    name: str = Field("", description="Template's name.")
    args: dict[str, str] = Field(
        {}, description="Arguments given to the template, if any."
    )
    expansion: str = Field(
        "", description="The result of expanding the template."
    )

208 

209 

210# The highest level entry: This is returned from the program as a JSON object 

211# in the JSONL output. These are prototypically Part of Speech sections, 

212# like "Noun" under a higher level section like "Etymology". 

class WordEntry(GreekBaseModel):
    # Top-level entry model; `word` is the only required field.
    model_config = ConfigDict(title="Greek Wiktionary")

    word: str = Field(description="Word string")
    # original_title: str = ""
    forms: list[Form] = Field([], description="Inflection forms list")
    lang_code: str = Field("", description="Wiktionary language code")
    lang: str = Field("", description="Localized language name")
    pos: str = Field("", description="Part of speech type")
    # Section heading text, e.g. `==Noun==`
    pos_title: str = Field(default="")
    # Heading number, e.g. `==Noun 2==`. Default -1 gets removed.
    pos_num: int = Field(default=-1)
    etymology_text: str = Field(
        "", description="Etymology section as cleaned text."
    )
    etymology_templates: list[TemplateData] = Field(
        [],
        description=(
            "Templates and their arguments and expansions from the "
            "etymology section."
        ),
    )
    # For sections like "Etymology 1"
    etymology_number: int = Field(default=-1)
    senses: list[Sense] = Field(default=[])
    title: str = Field("", description="Redirect page source title")
    redirect: str = Field("", description="Redirect page target title")
    categories: list[str] = Field(default=[])
    sounds: list[Sound] = Field(default=[])
    tags: list[str] = Field(default=[])
    raw_tags: list[str] = Field(default=[])
    # Should be a list `hyphenations`.
    hyphenation: str = Field(default="")
    head_templates: list[TemplateData] = Field(default=[])
    # alt_of: list[FormOf] = []
    # form_of: list[FormOf] = []
    antonyms: list[Linkage] = Field(default=[])
    # coordinate_terms: list[Linkage] = []
    derived: list[Linkage] = Field(default=[])
    # descendants: list[Linkage] = []
    # holonyms: list[Linkage] = []
    # hypernyms: list[Linkage] = []
    # hyponyms: list[Linkage] = []
    # meronyms: list[Linkage] = []
    # instances: list[Linkage] = []
    # troponyms: list[Linkage] = []
    # inflection_templates: list[TemplateData] = []
    # info_template: list[TemplateData] = []
    # literal_meaning: str = ""
    related: list[Linkage] = Field(default=[])
    synonyms: list[Linkage] = Field(default=[])
    translations: list[Translation] = Field(default=[])
    # wikidata: list[str] = []
    # wikipedia: list[str] = []