Coverage for src/wiktextract/extractor/el/models.py: 92%

1from pydantic import BaseModel, ConfigDict, Field

3# Pydantic models are basically classes that take the place of the dicts

4# used in the main English extractor. They use more resources, but also do

5# a lot of validation work and are easier for the type-checker.

8# Search and replace Greek with `Language Name`

# Shared pydantic configuration: every model in this module derives from
# GreekBaseModel so that they all validate their data the same way.
class GreekBaseModel(BaseModel):
    model_config = ConfigDict(
        # Reject fields that are not declared on the model.
        extra="forbid",
        # Do not coerce values between types.
        strict=True,
        # We use pydantic mainly for the automatic validation. Without this
        # setting nothing is checked when a field is assigned to after the
        # model has been initialized.
        validate_assignment=True,
        # Run the declared default values through validation as well.
        validate_default=True,
    )

# Examples and quotations attached to glosses.
# The commented-out fields below are not populated by this extractor; they
# are kept as a reference for fields that may be enabled later.
class Example(GreekBaseModel):
    text: str = Field(default="", description="Example usage sentence")
    # Kind of entry: example or quotation etc.
    type: str = ""
    translation: str = Field(
        default="", description="Greek Translation of the example sentence"
    )
    # author: str = Field(default="", description="Author's name")
    # title: str = Field(default="", description="Title of the reference")
    # ref: str = Field(default="", description="Raw reference string")
    # url: str = Field(
    #     default="", description="A web link. Not necessarily well-formatted."
    # )
    # date: str = Field(default="", description="Original date")
    # date_published: str = Field(default="", description="Date of publication")
    # collection: str = Field(
    #     default="",
    #     description="Name of the collection the example was taken from",
    # )
    # pages: str = Field(default="", description="Page numbers")
    # year: str = Field(default="", description="Year of publication")
    # publisher: str = Field(default="", description="Published by")
    # editor: str = Field(default="", description="Editor")
    # translator: str = Field(default="", description="Translator")
    # source: str = Field(
    #     default="",
    #     description="Source of reference",
    # )
    # collection: str = Field(
    #     default="",
    #     description="Name of collection that reference was published in",
    # )
    # volume: str = Field(default="", description="Volume number")
    # comment: str = Field(default="", description="Comment on the reference")
    # accessdate: str = Field(
    #     default="", description="Date of access of online reference"
    # )
    # date: str = Field(default="", description="Date of publication")
    # number: str = Field(default="", description="Issue number")
    # # chapter: Optional[str] = Field(default=None, description="Chapter name")
    # place: str = Field(default="", description="Place of publication")
    # edition: str = Field(default="", description="Edition number")
    # isbn: str = Field(default="", description="ISBN number")
    # literal_meaning: str = ""

# A translation of the word into another language. Every field is optional
# and defaults to an empty string or an empty list.
class Translation(GreekBaseModel):
    sense: str = Field(
        default="",
        description="A gloss of the sense being translated",
    )
    word: str = Field(default="", description="Translation term")
    lang_code: str = Field(
        default="",
        description="Wiktionary language code of the translation term",
    )
    lang: str = Field(default="", description="Localized language name")
    # uncertain: bool = Field(
    #     default=False, description="Translation marked as uncertain"
    # )
    roman: str = Field(
        default="",
        description="Transliteration to Roman characters",
    )
    sense_index: str = ""
    # note: str = ""
    # literal_meaning: str = ""
    raw_tags: list[str] = []
    tags: list[str] = []
    # notes: list[str] = Field(default=[], description="A list of notes")

# General class for "link to another related word", like synonym, antonym,
# etc. Instead of having a class for each kind of link, other models declare
# different fields of type list[Linkage], like
# `synonyms: list[Linkage] = []`.
class Linkage(GreekBaseModel):
    # The linked word; this is the only required field.
    word: str
    # translation: str
    # extra: str
    # roman: str
    # sense: str
    # sense_index: str = ""
    # note: str = ""
    raw_tags: list[str] = []
    tags: list[str] = []
    # topics: list[str] = []
    # urls: list[str]
    examples: list[str] = []

# Reference to another word, used as the element type of `Sense.form_of`.
class FormOf(GreekBaseModel):
    # The referenced word; required.
    word: str
    # extra: str
    # roman: str

# Basically a line or lines of gloss, a meaning of a word. These are collected
# under the POS as a list.
class Sense(GreekBaseModel):
    glosses: list[str] = []  # ["Gloss supercategory", "Specific gloss."]
    tags: list[str] = []
    raw_tags: list[str] = []
    form_of: list[FormOf] = []
    # alt_of : list[FormOf] = []
    # compound_of: list[FormOf] = []
    # topics: list[str] = []
    categories: list[str] = []  # Wikipedia category link data; not printed.
    examples: list[Example] = []
    synonyms: list[Linkage] = []
    antonyms: list[Linkage] = []
    # holonyms: list[Linkage] = []
    # hypernyms: list[Linkage] = []
    # hyponyms: list[Linkage] = []
    # instances: list[Linkage] = []
    # meronyms: list[Linkage] = []
    related: list[Linkage] = []
    # links: list[list[str]] = []
    # coordinate_terms: list[Linkage] = []
    # ruby: list[tuple[str, ...]] = []
    # sense_index: str = Field(default="", description="Sense number used in "
    #                          "Wiktionary")
    # head_nr: int = -1
    # wikidata: list[str] = []
    # wikipedia: list[str] = []

    def merge(self, other: "Sense") -> None:
        """Combine the fields of this Sense with another Sense.

        `tags`, `raw_tags` and `categories` become the de-duplicated union
        of the values of both senses, in first-seen order (this sense's
        values first). `examples`, `synonyms`, `antonyms` and `related` of
        `other` are appended to this sense's lists. `glosses` and `form_of`
        are left untouched.

        Args:
            other: The sense whose data is folded into this one. It is not
                modified.
        """
        # dict.fromkeys() de-duplicates while keeping insertion order. Going
        # through a set() instead would produce an order that changes from
        # run to run (string hashing is randomized per process), making the
        # extractor's output nondeterministic.
        self.tags = list(dict.fromkeys(self.tags + other.tags))
        self.raw_tags = list(dict.fromkeys(self.raw_tags + other.raw_tags))
        self.categories = list(
            dict.fromkeys(self.categories + other.categories)
        )
        self.examples.extend(other.examples)
        self.synonyms.extend(other.synonyms)
        self.antonyms.extend(other.antonyms)
        self.related.extend(other.related)

# An inflected form of the word, like `{ form: "bats", tags: ["plural"] }`.
# Every field is optional.
class Form(GreekBaseModel):
    form: str = ""
    tags: list[str] = []
    raw_tags: list[str] = []
    topics: list[str] = []
    # head_nr: int = -1
    ipa: str = ""
    # roman: str = ""
    # ruby: list[tuple[str, str]] = []
    source: str = ""
    # sense_index: str = ""

# A pronunciation or audio file. If you have a string of IPA or SAMPA or
# something else, that is extracted as its own Sound entry.
class Sound(GreekBaseModel):
    ipa: str = Field(
        default="",
        description="International Phonetic Alphabet",
    )
    # enpr: str = Field(default="", description="American Heritage Dictionary")
    # sampa: str = Field(
    #     default="", description="Speech Assessment Methods Phonetic Alphabet"
    # )
    audio: str = Field(default="", description="Audio file name")
    # wav_url: str = Field(default="")
    # ogg_url: str = Field(default="")
    # mp3_url: str = Field(default="")
    # oga_url: str = Field(default="")
    # flac_url: str = Field(default="")
    lang_code: str = Field(
        default="",
        description="Wiktionary language code",
    )
    lang: str = Field(default="", description="Localized language name")
    raw_tags: list[str] = []
    tags: list[str] = []
    # rhymes: list[str] = []
    homophones: list[str] = []
    # text: str = ""  # Use raw_tags instead
    # "Temporary" field used to sort out different sound data between POSes
    # when they are originally found in one combined pronunciation section.
    poses: list[str] = []

# Sometimes we collect raw template arguments separately, like in the main
# line English extractor where we keep data from etymology templates.
# One TemplateData holds a template's name, its arguments and its expansion.
class TemplateData(GreekBaseModel):
    name: str = Field(default="", description="Template's name.")
    args: dict[str, str] = Field(
        default={},
        description="Arguments given to the template, if any.",
    )
    expansion: str = Field(
        default="", description="The result of expanding the template."
    )

# The highest level entry: This is returned from the program as a JSON object
# in the JSONL output. These are prototypically Part of Speech sections,
# like "Noun" under a higher level section like "Etymology".
# `word` is the only required field; everything else has a default.
class WordEntry(GreekBaseModel):
    model_config = ConfigDict(title="Greek Wiktionary")

    word: str = Field(description="Word string")
    # original_title: str = ""
    forms: list[Form] = Field(
        default=[],
        description="Inflection forms list",
    )
    lang_code: str = Field(
        default="",
        description="Wiktionary language code",
    )
    lang: str = Field(default="", description="Localized language name")
    pos: str = Field(default="", description="Part of speech type")
    # Section heading of the POS, e.g. `==Noun==`
    pos_title: str = ""
    # Number in a heading like `==Noun 2==`. Default -1 gets removed.
    pos_num: int = -1
    etymology_text: str = Field(
        default="",
        description="Etymology section as cleaned text.",
    )
    etymology_templates: list[TemplateData] = Field(
        default=[],
        description="Templates and their arguments and expansions from the "
        "etymology section.",
    )
    # For sections like "Etymology 1"
    etymology_number: int = -1
    senses: list[Sense] = []
    title: str = Field(default="", description="Redirect page source title")
    redirect: str = Field(
        default="",
        description="Redirect page target title",
    )
    categories: list[str] = []
    sounds: list[Sound] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    # Should be a list `hyphenations`.
    hyphenation: str = ""
    head_templates: list[TemplateData] = []
    # alt_of: list[FormOf] = []
    # form_of: list[FormOf] = []
    antonyms: list[Linkage] = []
    # coordinate_terms: list[Linkage] = []
    derived: list[Linkage] = []
    # descendants: list[Linkage] = []
    # holonyms: list[Linkage] = []
    # hypernyms: list[Linkage] = []
    # hyponyms: list[Linkage] = []
    # meronyms: list[Linkage] = []
    # instances: list[Linkage] = []
    # troponyms: list[Linkage] = []
    # inflection_templates: list[TemplateData] = []
    # info_template: list[TemplateData] = []
    # literal_meaning: str = ""
    related: list[Linkage] = []
    synonyms: list[Linkage] = []
    translations: list[Translation] = []
    # wikidata: list[str] = []
    # wikipedia: list[str] = []