Coverage for src/wiktextract/extractor/el/models.py: 93%

94 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1from typing import Literal 

2 

3from pydantic import BaseModel, ConfigDict, Field 

4 

5from wiktextract.extractor.el.section_titles import POSName 

6 

7# Pydantic models are basically classes that take the place of the dicts 

8# used in the main English extractor. They use more resources, but also do 

9# a lot of validation work and are easier for the type-checker. 

10 

11 

12# Search and replace Greek with `Language Name` 

13# Pydantic config stuff. 

class GreekBaseModel(BaseModel):
    # Common base class for every model in this module; it only carries the
    # shared pydantic configuration.
    model_config = ConfigDict(
        # Undeclared fields are an error instead of being silently accepted.
        extra="forbid",
        # Strict mode: values are not coerced between types.
        strict=True,
        # Automatic validation is the main reason pydantic is used here, so
        # it must also run when an attribute is assigned after the model has
        # been created; without this flag such assignments are not checked.
        validate_assignment=True,
        # Declared default values go through validation as well.
        validate_default=True,
    )

24 

25 

26# Examples and quotations in glosses 

class Example(GreekBaseModel):
    # A usage example or quotation attached to a gloss.
    text: str = Field(default="", description="Example usage sentence")
    type: str = ""  # example or quotation etc.
    translation: str = Field(
        default="",
        description="Greek Translation of the example sentence",
    )
    # Fields that are currently disabled (all plain strings defaulting to ""
    # unless noted); re-enable them one by one when the extractor needs them:
    #   author, title, ref, url, date, date_published, collection, pages,
    #   year, publisher, editor, translator, source, volume, comment,
    #   accessdate, number, chapter (Optional[str], default None), place,
    #   edition, isbn, literal_meaning

71 

72 

class Translation(GreekBaseModel):
    # One translation of the word into another language.
    sense: str = Field(
        default="",
        description="A gloss of the sense being translated",
    )
    word: str = Field(default="", description="Translation term")
    lang_code: str = Field(
        default="",
        description="Wiktionary language code of the translation term",
    )
    lang: str = Field(default="", description="Localized language name")
    roman: str = Field(
        default="",
        description="Transliteration to Roman characters",
    )
    sense_index: str = ""
    raw_tags: list[str] = []
    tags: list[str] = []
    # Currently disabled fields: uncertain (bool, default False), note,
    # literal_meaning, notes (list[str]).

95 

96 

97# General class for "link to another related word", like synonym, antonym, etc. 

98# Instead of having classes for each, we have different fields of list[Linkage], 

99# like `synonyms: list[Linkage] = []`. 

class Linkage(GreekBaseModel):
    # A link to another word; `word` is the only required field.
    word: str
    raw_tags: list[str] = []
    tags: list[str] = []
    topics: list[str] = []
    examples: list[str] = []
    # Currently disabled fields: translation, extra, roman, sense,
    # sense_index, note, urls (list[str]).

113 

114 

class FormOf(GreekBaseModel):
    # Target of a `form_of` entry; `word` is required.
    word: str
    # Currently disabled fields: extra, roman.

119 

120 

121# Basically a line or lines of gloss, a meaning of a word. These are collected 

122# under the POS as a list. 

class Sense(GreekBaseModel):
    # One meaning of a word: its gloss line(s) plus the labels, examples and
    # linkages collected for that meaning.
    glosses: list[str] = []  # ["Gloss supercategory", "Specific gloss."]
    tags: list[str] = []
    raw_tags: list[str] = []
    topics: list[str] = []
    form_of: list[FormOf] = []
    categories: list[str] = []  # Wikipedia category link data; not printed.
    examples: list[Example] = []
    synonyms: list[Linkage] = []
    antonyms: list[Linkage] = []
    related: list[Linkage] = []
    # Currently disabled fields: alt_of, compound_of, holonyms, hypernyms,
    # hyponyms, instances, meronyms, links, coordinate_terms, ruby,
    # sense_index, head_nr, wikidata, wikipedia.

    def merge(self, other: "Sense") -> None:
        """Combine the fields of this Sense with another Sense.

        The label lists (`tags`, `raw_tags`, `topics`, `categories`) are
        replaced by the sorted, de-duplicated union of both senses. The
        `examples`, `synonyms`, `antonyms` and `related` lists of `other`
        are appended to this sense's lists as they are, without
        de-duplication. `glosses` and `form_of` are left untouched, and
        `other` itself is not modified.
        """
        self.tags = sorted(set(self.tags + other.tags))
        self.raw_tags = sorted(set(self.raw_tags + other.raw_tags))
        # `topics` is a label list just like `tags`; it used to be dropped
        # silently when two senses were merged.
        self.topics = sorted(set(self.topics + other.topics))
        self.categories = sorted(set(self.categories + other.categories))
        self.examples.extend(other.examples)
        self.synonyms.extend(other.synonyms)
        self.antonyms.extend(other.antonyms)
        self.related.extend(other.related)

159 

160 

# Allowed values for the `source` of a form. "inflection" can be further
# narrowed to "conjugation" / "declension"; "" is also an accepted value.
FormSource = Literal[
    "conjugation", "declension", "header", "inflection", "linkage", ""
]

169 

170 

171# An inflected form of the word, like `{ form: "bats", tags: ["plural"] }` 

class Form(GreekBaseModel):
    # One inflected form together with its labels.
    form: str = ""
    tags: list[str] = []
    raw_tags: list[str] = []
    topics: list[str] = []
    ipa: str = ""
    source: FormSource = ""
    # Currently disabled fields: head_nr, roman, ruby, sense_index.

183 

184 

185# A pronunciation or audio file. If you have a string of IPA or SAMPA or 

186# something else, that is extracted as its own Sound entry. 

class Sound(GreekBaseModel):
    # One pronunciation entry (an IPA string and/or an audio file).
    ipa: str = Field(default="", description="International Phonetic Alphabet")
    audio: str = Field(default="", description="Audio file name")
    lang_code: str = Field(default="", description="Wiktionary language code")
    lang: str = Field(default="", description="Localized language name")
    raw_tags: list[str] = []
    tags: list[str] = []
    homophones: list[str] = []
    # "Temporary" field: when several POSes share one combined pronunciation
    # section, this is used to sort the sound data out between them.
    poses: list[str] = []
    # Currently disabled fields: enpr, sampa, wav_url, ogg_url, mp3_url,
    # oga_url, flac_url, rhymes, text (use raw_tags instead).

209 

210 

211# Sometimes we collect raw template arguments separately, like in the main 

212# line English extractor where we keep data from etymology templates. 

class TemplateData(GreekBaseModel):
    # Raw data of one template call: its name, its arguments and the text
    # it expanded to.
    name: str = Field(default="", description="Template's name.")
    args: dict[str, str] = Field(
        default={},
        description="Arguments given to the template, if any.",
    )
    expansion: str = Field(
        default="", description="The result of expanding the template."
    )

222 

223 

224# The highest level entry: This is returned from the program as a JSON object 

225# in the JSONL output. These are prototypically Part of Speech sections, 

226# like "Noun" under a higher level section like "Etymology". 

class WordEntry(GreekBaseModel):
    # Top-level entry of the JSONL output; `word` is the only required field.
    model_config = ConfigDict(title="Greek Wiktionary")

    word: str = Field(description="Word string")
    forms: list[Form] = Field(default=[], description="Inflection forms list")
    lang_code: str = Field(default="", description="Wiktionary language code")
    lang: str = Field(default="", description="Localized language name")
    pos: POSName = Field(default="", description="Part of speech type")
    pos_title: str = ""  # `==Noun==`
    pos_num: int = -1  # `==Noun 2==` Default -1 gets removed.
    etymology_text: str = Field(
        default="",
        description="Etymology section as cleaned text.",
    )
    etymology_templates: list[TemplateData] = Field(
        default=[],
        description=(
            "Templates and their arguments and expansions from the "
            "etymology section."
        ),
    )
    # Number of the section for headings like "Etymology 1".
    etymology_number: int = -1
    senses: list[Sense] = []
    title: str = Field(default="", description="Redirect page source title")
    redirect: str = Field(default="", description="Redirect page target title")
    categories: list[str] = []
    sounds: list[Sound] = []
    tags: list[str] = []
    topics: list[str] = []
    raw_tags: list[str] = []
    hyphenation: str = ""  # Should be a list `hyphenations`.
    head_templates: list[TemplateData] = []
    form_of: list[FormOf] = []
    antonyms: list[Linkage] = []
    derived: list[Linkage] = []
    related: list[Linkage] = []
    synonyms: list[Linkage] = []
    translations: list[Translation] = []
    # Currently disabled fields: original_title, alt_of, coordinate_terms,
    # descendants, holonyms, hypernyms, hyponyms, meronyms, instances,
    # troponyms, inflection_templates, info_template, literal_meaning,
    # wikidata, wikipedia.