Coverage for src/wiktextract/extractor/el/models.py: 92%
87 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from pydantic import BaseModel, ConfigDict, Field
3# Pydantic models are basically classes that take the place of the dicts
4# used in the main English extractor. They use more resources, but also do
5# a lot of validation work and are easier for the type-checker.
8# Search and replace Greek with `Language Name`
9# Pydantic config stuff.
class GreekBaseModel(BaseModel):
    # Common parent of the models in this file. It declares no fields of its
    # own and only carries the shared pydantic configuration.
    model_config = ConfigDict(
        # Input keys that do not match a declared field are an error.
        extra="forbid",
        # Validate in strict mode (no lenient type coercion).
        strict=True,
        # Pydantic is used here mostly for its automatic validation. Without
        # this flag nothing would be checked when a value is assigned to a
        # field after the model has been constructed.
        validate_assignment=True,
        # Declared default values are validated as well.
        validate_default=True,
    )
22# Examples and quotations in glosses
class Example(GreekBaseModel):
    # A single example sentence or quotation belonging to a gloss.
    # Every active field defaults to the empty string.
    text: str = Field(default="", description="Example usage sentence")
    type: str = ""  # example or quotation etc.
    translation: str = Field(
        default="", description="Greek Translation of the example sentence"
    )

    # Everything below is commented out and therefore NOT part of the model.
    # Note that `collection` and `date` each appear twice in this list.
    # author: str = Field(default="", description="Author's name")
    # title: str = Field(default="", description="Title of the reference")
    # ref: str = Field(default="", description="Raw reference string")
    # url: str = Field(
    #     default="", description="A web link. Not necessarily well-formatted."
    # )
    # date: str = Field(default="", description="Original date")
    # date_published: str = Field(default="", description="Date of publication")
    # collection: str = Field(
    #     default="",
    #     description="Name of the collection the example was taken from",
    # )
    # pages: str = Field(default="", description="Page numbers")
    # year: str = Field(default="", description="Year of publication")
    # publisher: str = Field(default="", description="Published by")
    # editor: str = Field(default="", description="Editor")
    # translator: str = Field(default="", description="Translator")
    # source: str = Field(
    #     default="",
    #     description="Source of reference",
    # )
    # collection: str = Field(
    #     default="",
    #     description="Name of collection that reference was published in",
    # )
    # volume: str = Field(default="", description="Volume number")
    # comment: str = Field(default="", description="Comment on the reference")
    # accessdate: str = Field(
    #     default="", description="Date of access of online reference"
    # )
    # date: str = Field(default="", description="Date of publication")
    # number: str = Field(default="", description="Issue number")
    # # chapter: Optional[str] = Field(default=None, description="Chapter name")
    # place: str = Field(default="", description="Place of publication")
    # edition: str = Field(default="", description="Edition number")
    # isbn: str = Field(default="", description="ISBN number")
    # literal_meaning: str = ""
class Translation(GreekBaseModel):
    # One translation of the word into another language. All fields are
    # optional: strings default to "" and the tag lists default to empty.
    sense: str = Field(
        default="",
        description="A gloss of the sense being translated",
    )
    word: str = Field(default="", description="Translation term")
    lang_code: str = Field(
        default="",
        description="Wiktionary language code of the translation term",
    )
    lang: str = Field(default="", description="Localized language name")
    # uncertain: bool = Field(
    #     default=False, description="Translation marked as uncertain"
    # )
    roman: str = Field(
        default="",
        description="Transliteration to Roman characters",
    )
    sense_index: str = ""
    # note: str = ""
    # literal_meaning: str = ""
    raw_tags: list[str] = []
    tags: list[str] = []
    # notes: list[str] = Field(default=[], description="A list of notes")
93# General class for "link to another related word", like synonym, antonym, etc.
94# Instead of having classes for each, we have different fields of list[Linkage],
95# like `synonyms: list[Linkage] = []`.
class Linkage(GreekBaseModel):
    # `word` has no default and must always be supplied; the list fields
    # default to empty lists.
    word: str
    # translation: str
    # extra: str
    # roman: str
    # sense: str
    # sense_index: str = ""
    # note: str = ""
    raw_tags: list[str] = []
    tags: list[str] = []
    # topics: list[str] = []
    # urls: list[str]
    examples: list[str] = []
class FormOf(GreekBaseModel):
    # Element type of `Sense.form_of`. The only active field is the required
    # `word` string.
    word: str
    # extra: str
    # roman: str
117# Basically a line or lines of gloss, a meaning of a word. These are collected
118# under the POS as a list.
class Sense(GreekBaseModel):
    # One meaning of a word. Every field is a list that defaults to empty.
    glosses: list[str] = []  # ["Gloss supercategory", "Specific gloss."]
    tags: list[str] = []
    raw_tags: list[str] = []
    form_of: list[FormOf] = []
    # alt_of : list[FormOf] = []
    # compound_of: list[FormOf] = []
    # topics: list[str] = []
    categories: list[str] = []  # Wikipedia category link data; not printed.
    examples: list[Example] = []
    synonyms: list[Linkage] = []
    antonyms: list[Linkage] = []
    # holonyms: list[Linkage] = []
    # hypernyms: list[Linkage] = []
    # hyponyms: list[Linkage] = []
    # instances: list[Linkage] = []
    # meronyms: list[Linkage] = []
    related: list[Linkage] = []
    # links: list[list[str]] = []
    # coordinate_terms: list[Linkage] = []
    # ruby: list[tuple[str, ...]] = []
    # sense_index: str = Field(default="", description="Sense number used in "
    # "Wiktionary")
    # head_nr: int = -1
    # wikidata: list[str] = []
    # wikipedia: list[str] = []

    def merge(self, other: "Sense") -> None:
        """Combine the fields of this Sense with another Sense.

        `tags`, `raw_tags` and `categories` become the union of both
        senses' values with duplicates removed; the first occurrence of
        each value keeps its position, so the result is the same on every
        run. `examples`, `synonyms`, `antonyms` and `related` of `other`
        are appended to this sense's lists without de-duplication.

        `glosses` and `form_of` are not touched, and `other` itself is
        not modified. Returns None; `self` is updated in place.
        """
        # dict.fromkeys() de-duplicates while keeping insertion order.
        # list(set(...)) would also de-duplicate, but the iteration order of
        # a set of strings can change between interpreter runs (string hash
        # randomization), which made the merged lists nondeterministic.
        self.tags = list(dict.fromkeys(self.tags + other.tags))
        self.raw_tags = list(dict.fromkeys(self.raw_tags + other.raw_tags))
        self.categories = list(
            dict.fromkeys(self.categories + other.categories)
        )
        self.examples.extend(other.examples)
        self.synonyms.extend(other.synonyms)
        self.antonyms.extend(other.antonyms)
        self.related.extend(other.related)
157# An inflected form of the word, like `{ form: "bats", tags: ["plural"] }`
class Form(GreekBaseModel):
    # All fields are optional: strings default to "" and lists to empty.
    form: str = ""
    tags: list[str] = []
    raw_tags: list[str] = []
    topics: list[str] = []
    # head_nr: int = -1
    ipa: str = ""
    # roman: str = ""
    # ruby: list[tuple[str, str]] = []
    source: str = ""
    # sense_index: str = ""
171# A pronunciation or audio file. If you have a string of IPA or SAMPA or
172# something else, that is extracted as its own Sound entry.
class Sound(GreekBaseModel):
    # All fields are optional: strings default to "" and lists to empty.
    ipa: str = Field(
        default="",
        description="International Phonetic Alphabet",
    )
    # enpr: str = Field(default="", description="American Heritage Dictionary")
    # sampa: str = Field(
    #     default="", description="Speech Assessment Methods Phonetic Alphabet"
    # )
    audio: str = Field(default="", description="Audio file name")
    # wav_url: str = Field(default="")
    # ogg_url: str = Field(default="")
    # mp3_url: str = Field(default="")
    # oga_url: str = Field(default="")
    # flac_url: str = Field(default="")
    lang_code: str = Field(
        default="",
        description="Wiktionary language code",
    )
    lang: str = Field(default="", description="Localized language name")
    raw_tags: list[str] = []
    tags: list[str] = []
    # rhymes: list[str] = []
    homophones: list[str] = []
    # text: str = ""  # Use raw_tags instead
    # "Temporary" field used to sort out different sound data between POSes
    # when they are originally found in one combined pronunciation section
    poses: list[str] = []
197# Sometimes we collect raw template arguments separately, like in the main
198# line English extractor where we keep data from etymology templates.
class TemplateData(GreekBaseModel):
    # Name, arguments and expansion of one template; all three are optional.
    name: str = Field(default="", description="Template's name.")
    args: dict[str, str] = Field(
        default={},
        description="Arguments given to the template, if any.",
    )
    expansion: str = Field(
        default="", description="The result of expanding the template."
    )
210# The highest level entry: This is returned from the program as a JSON object
211# in the JSONL output. These are prototypically Part of Speech sections,
212# like "Noun" under a higher level section like "Etymology".
class WordEntry(GreekBaseModel):
    # `word` is the only field without a default; everything else falls back
    # to "", -1 or an empty list.
    # Only the title is set here; see the pydantic docs for how a subclass
    # `model_config` combines with the one on GreekBaseModel.
    model_config = ConfigDict(title="Greek Wiktionary")

    word: str = Field(description="Word string")
    # original_title: str = ""
    forms: list[Form] = Field(default=[], description="Inflection forms list")
    lang_code: str = Field(
        default="",
        description="Wiktionary language code",
    )
    lang: str = Field(default="", description="Localized language name")
    pos: str = Field(default="", description="Part of speech type")
    pos_title: str = ""  # `==Noun==`
    pos_num: int = -1  # `==Noun 2==` Default -1 gets removed.
    etymology_text: str = Field(
        default="",
        description="Etymology section as cleaned text.",
    )
    etymology_templates: list[TemplateData] = Field(
        default=[],
        description=(
            "Templates and their arguments and expansions "
            "from the etymology section."
        ),
    )
    # For sections like "Etymology 1"
    etymology_number: int = -1
    senses: list[Sense] = []
    title: str = Field(default="", description="Redirect page source title")
    redirect: str = Field(
        default="",
        description="Redirect page target title",
    )
    categories: list[str] = []
    sounds: list[Sound] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    hyphenation: str = ""  # Should be a list `hyphenations`.
    head_templates: list[TemplateData] = []
    # alt_of: list[FormOf] = []
    # form_of: list[FormOf] = []
    antonyms: list[Linkage] = []
    # coordinate_terms: list[Linkage] = []
    derived: list[Linkage] = []
    # descendants: list[Linkage] = []
    # holonyms: list[Linkage] = []
    # hypernyms: list[Linkage] = []
    # hyponyms: list[Linkage] = []
    # meronyms: list[Linkage] = []
    # instances: list[Linkage] = []
    # troponyms: list[Linkage] = []
    # inflection_templates: list[TemplateData] = []
    # info_template: list[TemplateData] = []
    # literal_meaning: str = ""
    related: list[Linkage] = []
    synonyms: list[Linkage] = []
    translations: list[Translation] = []
    # wikidata: list[str] = []
    # wikipedia: list[str] = []