Coverage for src / wiktextract / extractor / simple / models.py: 100%
62 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1from pydantic import BaseModel, ConfigDict, Field
3# Pydantic models are basically classes that take the place of the dicts
4# used in the main English extractor. They use more resources, but also do
5# a lot of validation work and are easier for the type-checker.
8# Pydantic config stuff.
class SimpleEnglishBaseModel(BaseModel):
    # Base class for the models in this file. Pydantic is used mainly for
    # its automatic validation, so the configuration below is kept strict.
    model_config = ConfigDict(
        # Also validate default values.
        validate_default=True,
        # Validate when something is assigned after initialization as well;
        # otherwise nothing is checked at that point.
        validate_assignment=True,
        strict=True,
        extra="forbid",
    )
21# Not sample code: this models the usage-example entries shown next to glosses.
class Example(SimpleEnglishBaseModel):
    # A usage example with its attribution; each field defaults to "".
    text: str = Field(description="Example usage sentence", default="")
    author: str = Field(description="Author's name", default="")
    title: str = Field(description="Title of the reference", default="")
    # The remaining fields are disabled: SEW example templates are simple
    # and don't seem to have this data.
    # date: str = Field(default="", description="Original date")
    # date_published: str = Field(default="", description="Date of publication")
    # collection: str = Field(
    #     default="",
    #     description="Name of the collection the example was taken from",
    # )
    # editor: str = Field(default="", description="Editor")
    # translator: str = Field(default="", description="Translator")
    # source: str = Field(
    #     default="",
    #     description="Source of reference",
    # )
42# General class for "link to another related word", like synonym, antonym, etc.
43# Instead of having classes for each, we have different fields of list[Linkage],
44# like `synonyms: list[Linkage] = []`.
class Linkage(SimpleEnglishBaseModel):
    # The only active field; it has no default, so it must be supplied.
    word: str = Field()
    # Disabled fields, kept for reference:
    # sense_index: str = ""
    # note: str = ""
    # raw_tags: list[str] = []
    # tags: list[str] = []
53# Basically a line or lines of gloss, a meaning of a word. These are collected
54# under the POS as a list.
class Sense(SimpleEnglishBaseModel):
    # Gloss lines, e.g. ["Gloss supercategory", "Specific gloss."]
    glosses: list[str] = Field(default=[])
    tags: list[str] = Field(default=[])
    raw_tags: list[str] = Field(default=[])
    # topics: list[str] = []  # XXX do these.
    # Wikipedia category link data; not printed.
    categories: list[str] = Field(default=[])
    examples: list[Example] = Field(default=[])
    synonyms: list[Linkage] = Field(default=[])
    antonyms: list[Linkage] = Field(default=[])
    # ruby: list[tuple[str, ...]] = []
67# An inflected form of the word, like `{ form: "bats", tags: ["plural"] }`
class Form(SimpleEnglishBaseModel):
    # The form's text plus the tags describing it; all fields are optional.
    form: str = Field(default="")
    tags: list[str] = Field(default=[])
    raw_tags: list[str] = Field(default=[])
    # sense_index: str = ""
75# A pronunciation or audio file. If you have a string of IPA or SAMPA or
76# something else, that is extracted as its own Sound entry.
class Sound(SimpleEnglishBaseModel):
    # Transcriptions in the different notations.
    ipa: str = Field(description="International Phonetic Alphabet", default="")
    enpr: str = Field(description="American Heritage Dictionary", default="")
    sampa: str = Field(
        description="Speech Assessment Methods Phonetic Alphabet", default=""
    )
    # Audio file name and one URL field per audio format.
    audio: str = Field(description="Audio file name", default="")
    wav_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""
    oga_url: str = ""
    flac_url: str = ""
    lang_code: str = Field(description="Wiktionary language code", default="en")
    lang: str = Field(description="Localized language name", default="English")
    raw_tags: list[str] = Field(default=[])
    tags: list[str] = Field(default=[])
    rhymes: list[str] = Field(default=[])
    homophones: list[str] = Field(default=[])
    # text: str = ""  # Use raw_tags instead
    # "Temporary" field: when several POSes share one combined pronunciation
    # section, this is used to sort the sound data out between them.
    poses: list[str] = Field(default=[])
101# Sometimes we collect raw template arguments separately, like in the main
102# line English extractor where we keep data from etymology templates.
class TemplateData(SimpleEnglishBaseModel):
    # A template's name, its raw arguments and its expanded result.
    name: str = Field(description="Template's name.", default="")
    args: dict[str, str] = Field(
        description="Arguments given to the template, if any.", default={}
    )
    expansion: str = Field(
        description="The result of expanding the template.",
        default="",
    )
114# The highest level entry: This is returned from the program as a JSON object
115# in the JSONL output.
class WordEntry(SimpleEnglishBaseModel):
    model_config = ConfigDict(title="Simple English Wiktionary")

    # The only field without a default.
    word: str = Field(description="Word string")
    forms: list[Form] = Field(description="Inflection forms list", default=[])
    # For Simple English, the language is always English. Still, "en" is
    # not used as the default value here: all default values are removed
    # from the output so that it has no empty or meaningless fields.
    lang_code: str = Field(description="Wiktionary language code", default="")
    lang: str = Field(description="Localized language name", default="")
    pos: str = Field(description="Part of speech type", default="")
    # `==Noun==`
    pos_title: str = Field(default="")
    # `==Noun 2==`; the default -1 gets removed.
    pos_num: int = Field(default=-1)
    etymology_text: str = Field(
        description="Etymology section as cleaned text.", default=""
    )
    etymology_templates: list[TemplateData] = Field(
        description="Templates and their arguments and expansions "
        "from the etymology section.",
        default=[],
    )
    # Simple Wiktionary doesn't have numbered etymology sections
    senses: list[Sense] = Field(default=[])
    title: str = Field(description="Redirect page source title", default="")
    redirect: str = Field(description="Redirect page target title", default="")
    categories: list[str] = Field(default=[])
    sounds: list[Sound] = Field(default=[])
    tags: list[str] = Field(default=[])
    raw_tags: list[str] = Field(default=[])
    # Should be a list `hyphenations`.
    hyphenation: str = Field(default="")
    head_templates: list[TemplateData] = Field(default=[])