Coverage for src/wiktextract/extractor/simple/models.py: 100%
62 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from pydantic import BaseModel, ConfigDict, Field
3# Pydantic models are basically classes that take the place of the dicts
4# used in the main English extractor. They use more resources, but also do
5# a lot of validation work and are easier for the type-checker.
7# Pydantic config stuff.
class SimpleEnglishBaseModel(BaseModel):
    # Common base class for every model in this module. It declares no
    # fields of its own and only carries the shared pydantic configuration.
    model_config = ConfigDict(
        # Run validation on default values as well.
        validate_default=True,
        # Pydantic is used here mainly for its automatic validation; without
        # this setting nothing is checked when a value is assigned after
        # initialization.
        validate_assignment=True,
        strict=True,
        extra="forbid",
    )
# Not a sample class: this models the usage examples shown next to glosses.
class Example(SimpleEnglishBaseModel):
    # A single usage example; `Sense.examples` holds a list of these.
    text: str = Field(description="Example usage sentence", default="")
    author: str = Field(description="Author's name", default="")
    title: str = Field(description="Title of the reference", default="")
    # Simple English Wiktionary example templates are simple and do not seem
    # to provide the remaining reference data, so these fields stay disabled:
    # date: str = Field(default="", description="Original date")
    # date_published: str = Field(
    #     default="", description="Date of publication"
    # )
    # collection: str = Field(
    #     default="",
    #     description="Name of the collection the example was taken from",
    # )
    # editor: str = Field(default="", description="Editor")
    # translator: str = Field(default="", description="Translator")
    # source: str = Field(
    #     default="",
    #     description="Source of reference",
    # )
# General class for "link to another related word", like synonym, antonym, etc.
# Instead of having classes for each, we have different fields of list[Linkage],
# like `synonyms: list[Linkage] = []`.
class Linkage(SimpleEnglishBaseModel):
    # The linked word. This is the only active field, and it has no default,
    # so it must always be supplied.
    word: str
    # Disabled fields, kept as a reminder of possible extensions:
    # sense_index: str = ""
    # note: str = ""
    # raw_tags: list[str] = []
    # tags: list[str] = []
49# Basically a line or lines of gloss, a meaning of a word. These are collected
50# under the POS as a list.
class Sense(SimpleEnglishBaseModel):
    # Gloss lines, e.g. ["Gloss supercategory", "Specific gloss."]
    glosses: list[str] = Field(default=[])
    tags: list[str] = Field(default=[])
    raw_tags: list[str] = Field(default=[])
    # topics: list[str] = []  # XXX do these.
    # Wikipedia category link data; not printed.
    categories: list[str] = Field(default=[])
    examples: list[Example] = Field(default=[])
    synonyms: list[Linkage] = Field(default=[])
    antonyms: list[Linkage] = Field(default=[])
    # ruby: list[tuple[str, ...]] = []
62# An inflected form of the word, like `{ form: "bats", tags: ["plural"] }`
class Form(SimpleEnglishBaseModel):
    # The inflected form itself, together with the tags describing it.
    form: str = Field(default="")
    tags: list[str] = Field(default=[])
    raw_tags: list[str] = Field(default=[])
    # sense_index: str = ""
70# A pronunciation or audio file. If you have a string of IPA or SAMPA or
71# something else, that is extracted as its own Sound entry.
class Sound(SimpleEnglishBaseModel):
    # Phonetic transcriptions.
    ipa: str = Field(description="International Phonetic Alphabet", default="")
    enpr: str = Field(description="American Heritage Dictionary", default="")
    sampa: str = Field(
        description="Speech Assessment Methods Phonetic Alphabet",
        default="",
    )
    # Audio file name and one URL field per audio format.
    audio: str = Field(description="Audio file name", default="")
    wav_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""
    oga_url: str = ""
    flac_url: str = ""
    lang_code: str = Field(description="Wiktionary language code", default="en")
    lang: str = Field(description="Localized language name", default="English")
    raw_tags: list[str] = Field(default=[])
    tags: list[str] = Field(default=[])
    rhymes: list[str] = Field(default=[])
    homophones: list[str] = Field(default=[])
    # text: str = ""  # Use raw_tags instead
    # "Temporary" field used to sort out different sound data between POSes
    # when they are originally found in one combined pronunciation section.
    poses: list[str] = Field(default=[])
95# Sometimes we collect raw template arguments separately, like in the main
96# line English extractor where we keep data from etymology templates.
class TemplateData(SimpleEnglishBaseModel):
    # Raw record of one template call: its name, its arguments and the text
    # it expanded to.
    name: str = Field(description="Template's name.", default="")
    args: dict[str, str] = Field(
        description="Arguments given to the template, if any.",
        default={},
    )
    expansion: str = Field(
        description="The result of expanding the template.", default=""
    )
107# The highest level entry: This is returned from the program as a JSON object
108# in the JSONL output.
class WordEntry(SimpleEnglishBaseModel):
    # Sets a title for this model on top of the configuration declared on
    # SimpleEnglishBaseModel.
    model_config = ConfigDict(title="Simple English Wiktionary")

    word: str = Field(description="Word string")
    # For Simple English, the language is always English.
    forms: list[Form] = Field(description="Inflection forms list", default=[])
    # "en" is deliberately not used as the default here: all default values
    # are also removed so that the output has no empty or meaningless fields.
    lang_code: str = Field(description="Wiktionary language code", default="")
    lang: str = Field(description="Localized language name", default="")
    pos: str = Field(description="Part of speech type", default="")
    pos_title: str = Field(default="")  # `==Noun==`
    pos_num: int = Field(default=-1)  # `==Noun 2==` Default -1 gets removed.
    etymology_text: str = Field(
        description="Etymology section as cleaned text.", default=""
    )
    etymology_templates: list[TemplateData] = Field(
        description=(
            "Templates and their arguments and expansions "
            "from the etymology section."
        ),
        default=[],
    )
    # Simple Wiktionary doesn't have numbered etymology sections.
    senses: list[Sense] = Field(default=[])
    title: str = Field(description="Redirect page source title", default="")
    redirect: str = Field(description="Redirect page target title", default="")
    categories: list[str] = Field(default=[])
    sounds: list[Sound] = Field(default=[])
    tags: list[str] = Field(default=[])
    raw_tags: list[str] = Field(default=[])
    hyphenation: str = Field(default="")  # Should be a list `hyphenations`.
    head_templates: list[TemplateData] = Field(default=[])