Coverage for src/wiktextract/extractor/simple/models.py: 100%

1from pydantic import BaseModel, ConfigDict, Field

3# Pydantic models are basically classes that take the place of the dicts

4# used in the main English extractor. They use more resources, but also do

5# a lot of validation work and are easier for the type-checker.

7# Pydantic config stuff.

class SimpleEnglishBaseModel(BaseModel):
    # Common base for every model in this module; it only carries the
    # shared pydantic configuration.
    model_config = ConfigDict(
        # Strict mode: input values are not coerced to the declared types.
        strict=True,
        # Undeclared fields are rejected.
        extra="forbid",
        # Declared default values are validated too.
        validate_default=True,
        # We use pydantic mainly for the automatic validation; this
        # setting makes sure validation also happens when something is
        # assigned after initialization, otherwise nothing is checked then.
        validate_assignment=True,
    )

# Not a sample model: this holds the example usage entries shown next to glosses.

class Example(SimpleEnglishBaseModel):
    # One usage example attached to a gloss.
    text: str = Field(
        default="",
        description="Example usage sentence",
    )
    author: str = Field(
        default="",
        description="Author's name",
    )
    title: str = Field(
        default="",
        description="Title of the reference",
    )
    # Simple English Wiktionary example templates are simple and don't seem
    # to provide the data below, so these fields are kept disabled:
    # date: str = Field(default="", description="Original date")
    # date_published: str = Field(default="", description="Date of publication")
    # collection: str = Field(
    #     default="",
    #     description="Name of the collection the example was taken from",
    # )
    # editor: str = Field(default="", description="Editor")
    # translator: str = Field(default="", description="Translator")
    # source: str = Field(
    #     default="",
    #     description="Source of reference",
    # )

# General class for "link to another related word", like synonym, antonym, etc.
# Instead of having classes for each, we have different fields of list[Linkage],
# like `synonyms: list[Linkage] = []`.

class Linkage(SimpleEnglishBaseModel):
    # A link to one related word; `word` is the only field and is required.
    word: str
    # Fields currently disabled:
    # sense_index: str = ""
    # note: str = ""
    # raw_tags: list[str] = []
    # tags: list[str] = []

49# Basically a line or lines of gloss, a meaning of a word. These are collected

50# under the POS as a list.

class Sense(SimpleEnglishBaseModel):
    # One meaning of a word together with the data attached to it.

    # Gloss lines, e.g. ["Gloss supercategory", "Specific gloss."]
    glosses: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    # topics: list[str] = []  # XXX do these.
    # Wikipedia category link data; not printed.
    categories: list[str] = []
    examples: list[Example] = []
    synonyms: list[Linkage] = []
    antonyms: list[Linkage] = []
    # ruby: list[tuple[str, ...]] = []

62# An inflected form of the word, like `{ form: "bats", tags: ["plural"] }`

class Form(SimpleEnglishBaseModel):
    # A single inflected form plus the tags describing it.
    form: str = ""
    tags: list[str] = []
    raw_tags: list[str] = []
    # sense_index: str = ""

70# A pronunciation or audio file. If you have a string of IPA or SAMPA or

71# something else, that is extracted as its own Sound entry.

class Sound(SimpleEnglishBaseModel):
    # Pronunciation transcriptions.
    ipa: str = Field(
        default="",
        description="International Phonetic Alphabet",
    )
    enpr: str = Field(
        default="",
        description="American Heritage Dictionary",
    )
    sampa: str = Field(
        default="",
        description="Speech Assessment Methods Phonetic Alphabet",
    )

    # Audio file name and the per-format URL fields.
    audio: str = Field(
        default="",
        description="Audio file name",
    )
    wav_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""
    oga_url: str = ""
    flac_url: str = ""

    lang_code: str = Field(
        default="en",
        description="Wiktionary language code",
    )
    lang: str = Field(
        default="English",
        description="Localized language name",
    )
    raw_tags: list[str] = []
    tags: list[str] = []
    rhymes: list[str] = []
    homophones: list[str] = []
    # text: str = ""  # Use raw_tags instead

    # "Temporary" field used to sort out different sound data between POSes
    # when they are originally found in one combined pronunciation section.
    poses: list[str] = []

95# Sometimes we collect raw template arguments separately, like in the main

96# line English extractor where we keep data from etymology templates.

class TemplateData(SimpleEnglishBaseModel):
    # Raw record of one template call: its name, arguments and expansion.
    name: str = Field(
        default="",
        description="Template's name.",
    )
    args: dict[str, str] = Field(
        default={},
        description="Arguments given to the template, if any.",
    )
    expansion: str = Field(
        default="",
        description="The result of expanding the template.",
    )

106

107# The highest level entry: This is returned from the program as a JSON object

108# in the JSONL output.

class WordEntry(SimpleEnglishBaseModel):
    # Sets the title used for this model.
    model_config = ConfigDict(title="Simple English Wiktionary")

    word: str = Field(description="Word string")
    forms: list[Form] = Field(
        default=[],
        description="Inflection forms list",
    )
    # For Simple English, the language is always English. We still do not
    # use "en" as the default value here, because we also remove all default
    # values so that we don't have empty or meaningless fields in the output.
    lang_code: str = Field(
        default="",
        description="Wiktionary language code",
    )
    lang: str = Field(
        default="",
        description="Localized language name",
    )
    pos: str = Field(
        default="",
        description="Part of speech type",
    )
    # Section heading text, e.g. `==Noun==`
    pos_title: str = ""
    # Number from a heading like `==Noun 2==`; the default -1 gets removed.
    pos_num: int = -1
    etymology_text: str = Field(
        default="",
        description="Etymology section as cleaned text.",
    )
    etymology_templates: list[TemplateData] = Field(
        default=[],
        description="Templates and their arguments and expansions from the "
        "etymology section.",
    )
    # Simple Wiktionary doesn't have numbered etymology sections
    senses: list[Sense] = []
    title: str = Field(
        default="",
        description="Redirect page source title",
    )
    redirect: str = Field(
        default="",
        description="Redirect page target title",
    )
    categories: list[str] = []
    sounds: list[Sound] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    # Should be a list `hyphenations`.
    hyphenation: str = ""
    head_templates: list[TemplateData] = []

140 head_templates: list[TemplateData] = []