Coverage for src/wiktextract/extractor/el/models.py: 93%
94 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1from typing import Literal
3from pydantic import BaseModel, ConfigDict, Field
5from wiktextract.extractor.el.section_titles import POSName
7# Pydantic models are basically classes that take the place of the dicts
8# used in the main English extractor. They use more resources, but also do
9# a lot of validation work and are easier for the type-checker.
12# Search and replace Greek with `Language Name`
# Shared pydantic configuration; the models below all derive from this class.
class GreekBaseModel(BaseModel):
    # Pydantic is used here mainly for its automatic validation, so the
    # configuration is kept tight: unknown fields are forbidden and strict
    # mode is on. `validate_assignment` makes pydantic re-check a value when
    # it is assigned after initialization (otherwise nothing is checked at
    # that point), and `validate_default` applies validation to the declared
    # default values as well.
    model_config = ConfigDict(
        validate_default=True,
        validate_assignment=True,
        strict=True,
        extra="forbid",
    )
# An example sentence or quotation attached to a gloss.
class Example(GreekBaseModel):
    text: str = Field("", description="Example usage sentence")
    # Kind of entry: example, quotation, etc.
    type: str = Field(default="")
    translation: str = Field(
        "", description="Greek Translation of the example sentence"
    )

    # Disabled fields, kept commented out:
    # author: str = Field(default="", description="Author's name")
    # title: str = Field(default="", description="Title of the reference")
    # ref: str = Field(default="", description="Raw reference string")
    # url: str = Field(
    #     default="", description="A web link. Not necessarily well-formatted."
    # )
    # date: str = Field(default="", description="Original date")
    # date_published: str = Field(default="", description="Date of publication")
    # collection: str = Field(
    #     default="",
    #     description="Name of the collection the example was taken from",
    # )
    # pages: str = Field(default="", description="Page numbers")
    # year: str = Field(default="", description="Year of publication")
    # publisher: str = Field(default="", description="Published by")
    # editor: str = Field(default="", description="Editor")
    # translator: str = Field(default="", description="Translator")
    # source: str = Field(
    #     default="",
    #     description="Source of reference",
    # )
    # collection: str = Field(
    #     default="",
    #     description="Name of collection that reference was published in",
    # )
    # volume: str = Field(default="", description="Volume number")
    # comment: str = Field(default="", description="Comment on the reference")
    # accessdate: str = Field(
    #     default="", description="Date of access of online reference"
    # )
    # date: str = Field(default="", description="Date of publication")
    # number: str = Field(default="", description="Issue number")
    # # chapter: Optional[str] = Field(default=None, description="Chapter name")
    # place: str = Field(default="", description="Place of publication")
    # edition: str = Field(default="", description="Edition number")
    # isbn: str = Field(default="", description="ISBN number")
    # literal_meaning: str = ""
class Translation(GreekBaseModel):
    sense: str = Field(
        "", description="A gloss of the sense being translated"
    )
    word: str = Field("", description="Translation term")
    lang_code: str = Field(
        "", description="Wiktionary language code of the translation term"
    )
    lang: str = Field("", description="Localized language name")
    # uncertain: bool = Field(
    #     default=False, description="Translation marked as uncertain"
    # )
    roman: str = Field(
        "", description="Transliteration to Roman characters"
    )
    sense_index: str = Field(default="")
    # note: str = ""
    # literal_meaning: str = ""
    raw_tags: list[str] = Field(default=[])
    tags: list[str] = Field(default=[])
    # notes: list[str] = Field(default=[], description="A list of notes")
# General class for a "link to another related word": synonym, antonym, etc.
# There is no dedicated class per relation; instead the owning model has one
# field of type list[Linkage] for each relation,
# like `synonyms: list[Linkage] = []`.
class Linkage(GreekBaseModel):
    # Required: this field has no default value.
    word: str
    # translation: str
    # extra: str
    # roman: str
    # sense: str
    # sense_index: str = ""
    # note: str = ""
    raw_tags: list[str] = Field(default=[])
    tags: list[str] = Field(default=[])
    topics: list[str] = Field(default=[])
    # urls: list[str]
    examples: list[str] = Field(default=[])
# Item type of the `form_of` lists declared on Sense and WordEntry.
class FormOf(GreekBaseModel):
    # Required: this field has no default value.
    word: str
    # extra: str
    # roman: str
# Basically a line or lines of gloss, a meaning of a word. These are collected
# under the POS as a list.
class Sense(GreekBaseModel):
    glosses: list[str] = []  # ["Gloss supercategory", "Specific gloss."]
    tags: list[str] = []
    raw_tags: list[str] = []
    topics: list[str] = []
    form_of: list[FormOf] = []
    # alt_of : list[FormOf] = []
    # compound_of: list[FormOf] = []
    categories: list[str] = []  # Wikipedia category link data; not printed.
    examples: list[Example] = []
    synonyms: list[Linkage] = []
    antonyms: list[Linkage] = []
    # holonyms: list[Linkage] = []
    # hypernyms: list[Linkage] = []
    # hyponyms: list[Linkage] = []
    # instances: list[Linkage] = []
    # meronyms: list[Linkage] = []
    related: list[Linkage] = []
    # links: list[list[str]] = []
    # coordinate_terms: list[Linkage] = []
    # ruby: list[tuple[str, ...]] = []
    # sense_index: str = Field(default="", description="Sense number used in "
    # "Wiktionary")
    # head_nr: int = -1
    # wikidata: list[str] = []
    # wikipedia: list[str] = []

    def merge(self, other: "Sense") -> None:
        """Combine the fields of this Sense with another Sense.

        The plain string lists (`tags`, `raw_tags`, `topics`, `categories`)
        become the sorted, de-duplicated union of both senses. `form_of`
        gains every item of `other` that is not already present. `examples`,
        `synonyms`, `antonyms` and `related` are simply extended with the
        items of `other`. `glosses` is not touched.

        Args:
            other: The Sense whose data is folded into this one; it is not
                modified.
        """
        # NOTE(review): `glosses` is left alone, presumably because only
        # senses with matching glosses get merged -- confirm with callers.
        self.tags = sorted(set(self.tags + other.tags))
        self.raw_tags = sorted(set(self.raw_tags + other.raw_tags))
        # Previously dropped on merge; treated the same way as `tags`.
        self.topics = sorted(set(self.topics + other.topics))
        self.categories = sorted(set(self.categories + other.categories))
        # Previously dropped on merge. A set() cannot be used here as for the
        # string lists above, so duplicates are filtered by equality.
        for form_of in other.form_of:
            if form_of not in self.form_of:
                self.form_of.append(form_of)
        self.examples.extend(other.examples)
        self.synonyms.extend(other.synonyms)
        self.antonyms.extend(other.antonyms)
        self.related.extend(other.related)
# Closed set of values accepted by `Form.source`; "" is the default used
# there. "inflection" can be further narrowed to "conjugation" or
# "declension".
FormSource = Literal[
    "conjugation", "declension", "header", "inflection", "linkage", ""
]
# One inflected form of the word, e.g. `{ form: "bats", tags: ["plural"] }`
class Form(GreekBaseModel):
    form: str = Field(default="")
    tags: list[str] = Field(default=[])
    raw_tags: list[str] = Field(default=[])
    topics: list[str] = Field(default=[])
    # head_nr: int = -1
    ipa: str = Field(default="")
    # roman: str = ""
    # ruby: list[tuple[str, str]] = []
    source: FormSource = Field(default="")
    # sense_index: str = ""
# A pronunciation or an audio file. A string of IPA, SAMPA or something else
# is extracted as a Sound entry of its own.
class Sound(GreekBaseModel):
    ipa: str = Field("", description="International Phonetic Alphabet")
    # enpr: str = Field(default="", description="American Heritage Dictionary")
    # sampa: str = Field(
    #     default="", description="Speech Assessment Methods Phonetic Alphabet"
    # )
    audio: str = Field("", description="Audio file name")
    # wav_url: str = Field(default="")
    # ogg_url: str = Field(default="")
    # mp3_url: str = Field(default="")
    # oga_url: str = Field(default="")
    # flac_url: str = Field(default="")
    lang_code: str = Field("", description="Wiktionary language code")
    lang: str = Field("", description="Localized language name")
    raw_tags: list[str] = Field(default=[])
    tags: list[str] = Field(default=[])
    # rhymes: list[str] = []
    homophones: list[str] = Field(default=[])
    # text: str = ""  # Use raw_tags instead
    # "Temporary" field: when sound data is originally found in one combined
    # pronunciation section, this is used to sort it out between the POSes.
    poses: list[str] = Field(default=[])
# Raw template arguments are sometimes collected separately, as the main-line
# English extractor does when it keeps data from etymology templates.
class TemplateData(GreekBaseModel):
    name: str = Field("", description="Template's name.")
    args: dict[str, str] = Field(
        {}, description="Arguments given to the template, if any."
    )
    expansion: str = Field(
        "", description="The result of expanding the template."
    )
# Top-level entry: each one is returned from the program as a JSON object in
# the JSONL output. Prototypically an entry is a Part of Speech section such
# as "Noun", found under a higher level section like "Etymology".
class WordEntry(GreekBaseModel):
    model_config = ConfigDict(title="Greek Wiktionary")

    # Required: this field has no default value.
    word: str = Field(description="Word string")
    # original_title: str = ""
    forms: list[Form] = Field([], description="Inflection forms list")
    lang_code: str = Field("", description="Wiktionary language code")
    lang: str = Field("", description="Localized language name")
    pos: POSName = Field("", description="Part of speech type")
    # Section heading, e.g. `==Noun==`
    pos_title: str = Field(default="")
    # Number from a heading like `==Noun 2==`; the default -1 gets removed.
    pos_num: int = Field(default=-1)
    etymology_text: str = Field(
        "", description="Etymology section as cleaned text."
    )
    etymology_templates: list[TemplateData] = Field(
        [],
        description="Templates and their arguments and expansions "
        "from the etymology section.",
    )
    # For sections like "Etymology 1"
    etymology_number: int = Field(default=-1)
    senses: list[Sense] = Field(default=[])
    title: str = Field("", description="Redirect page source title")
    redirect: str = Field("", description="Redirect page target title")
    categories: list[str] = Field(default=[])
    sounds: list[Sound] = Field(default=[])
    tags: list[str] = Field(default=[])
    topics: list[str] = Field(default=[])
    raw_tags: list[str] = Field(default=[])
    # Should be a list `hyphenations`.
    hyphenation: str = Field(default="")
    head_templates: list[TemplateData] = Field(default=[])
    # alt_of: list[FormOf] = []
    form_of: list[FormOf] = Field(default=[])
    antonyms: list[Linkage] = Field(default=[])
    # coordinate_terms: list[Linkage] = []
    derived: list[Linkage] = Field(default=[])
    # descendants: list[Linkage] = []
    # holonyms: list[Linkage] = []
    # hypernyms: list[Linkage] = []
    # hyponyms: list[Linkage] = []
    # meronyms: list[Linkage] = []
    # instances: list[Linkage] = []
    # troponyms: list[Linkage] = []
    # inflection_templates: list[TemplateData] = []
    # info_template: list[TemplateData] = []
    # literal_meaning: str = ""
    related: list[Linkage] = Field(default=[])
    synonyms: list[Linkage] = Field(default=[])
    translations: list[Translation] = Field(default=[])
    # wikidata: list[str] = []
    # wikipedia: list[str] = []