Coverage for src / wiktextract / extractor / simple / models.py: 100%

62 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1from pydantic import BaseModel, ConfigDict, Field 

2 

3# Pydantic models are basically classes that take the place of the dicts 

4# used in the main English extractor. They use more resources, but also do 

5# a lot of validation work and are easier for the type-checker. 

6 

7 

8# Pydantic config stuff. 

class SimpleEnglishBaseModel(BaseModel):
    # Common parent of every model in this module. It defines no fields of
    # its own, only the pydantic configuration.
    model_config = ConfigDict(
        extra="forbid",
        strict=True,
        # We use pydantic mainly for the automatic validation; this
        # setting ensures the validation is also done when something is
        # assigned after initialization, otherwise nothing gets checked then.
        validate_assignment=True,
        validate_default=True,
    )

19 

20 

21# Not an example, this is for example entries next to glosses. 

class Example(SimpleEnglishBaseModel):
    # A usage example shown next to a gloss, with the author and title of
    # the work it comes from.
    text: str = Field(
        default="",
        description="Example usage sentence",
    )
    author: str = Field(
        default="",
        description="Author's name",
    )
    title: str = Field(
        default="",
        description="Title of the reference",
    )

    # The remaining fields are disabled: SEW example templates are simple
    # and don't seem to have this data.
    #
    # date: str = Field(default="", description="Original date")
    # date_published: str = Field(
    #     default="", description="Date of publication"
    # )
    # collection: str = Field(
    #     default="",
    #     description="Name of the collection the example was taken from",
    # )
    # editor: str = Field(default="", description="Editor")
    # translator: str = Field(default="", description="Translator")
    # source: str = Field(
    #     default="",
    #     description="Source of reference",
    # )

40 

41 

42# General class for "link to another related word", like synonym, antonym, etc. 

43# Instead of having classes for each, we have different fields of list[Linkage], 

44# like `synonyms: list[Linkage] = []`. 

class Linkage(SimpleEnglishBaseModel):
    # A link to one related word. `word` has no default, so it must always
    # be supplied.
    word: str

    # Fields that are currently disabled:
    #
    # sense_index: str = ""
    # note: str = ""
    # raw_tags: list[str] = []
    # tags: list[str] = []

51 

52 

53# Basically a line or lines of gloss, a meaning of a word. These are collected 

54# under the POS as a list. 

class Sense(SimpleEnglishBaseModel):
    # One meaning of a word, together with the data attached to it.

    # Gloss lines, e.g. ["Gloss supercategory", "Specific gloss."]
    glosses: list[str] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    # XXX do these.
    # topics: list[str] = []

    # Wikipedia category link data; not printed.
    categories: list[str] = []
    examples: list[Example] = []
    synonyms: list[Linkage] = []
    antonyms: list[Linkage] = []
    # ruby: list[tuple[str, ...]] = []

65 

66 

67# An inflected form of the word, like `{ form: "bats", tags: ["plural"] }` 

class Form(SimpleEnglishBaseModel):
    # A single inflected form and the tags describing it; every field is
    # optional and defaults to an empty value.
    form: str = ""
    tags: list[str] = []
    raw_tags: list[str] = []

    # Disabled field:
    # sense_index: str = ""

73 

74 

75# A pronunciation or audio file. If you have a string of IPA or SAMPA or 

76# something else, that is extracted as its own Sound entry. 

class Sound(SimpleEnglishBaseModel):
    # One pronunciation transcription or audio file, plus related data such
    # as rhymes and homophones.

    # Transcriptions.
    ipa: str = Field(
        default="",
        description="International Phonetic Alphabet",
    )
    enpr: str = Field(
        default="",
        description="American Heritage Dictionary",
    )
    sampa: str = Field(
        default="",
        description="Speech Assessment Methods Phonetic Alphabet",
    )

    # Audio file and its URLs, one field per format.
    audio: str = Field(default="", description="Audio file name")
    wav_url: str = ""
    ogg_url: str = ""
    mp3_url: str = ""
    oga_url: str = ""
    flac_url: str = ""

    lang_code: str = Field(
        default="en",
        description="Wiktionary language code",
    )
    lang: str = Field(
        default="English",
        description="Localized language name",
    )

    raw_tags: list[str] = []
    tags: list[str] = []
    rhymes: list[str] = []
    homophones: list[str] = []
    # text: str = ""  # Use raw_tags instead

    # "Temporary" field used to sort out different sound data between POSes
    # when they are originally found in one combined pronunciation section.
    poses: list[str] = []

99 

100 

101# Sometimes we collect raw template arguments separately, like in the main 

102# line English extractor where we keep data from etymology templates. 

class TemplateData(SimpleEnglishBaseModel):
    # Raw data kept about one template: its name, the arguments it was
    # given, and the text it expanded to.
    name: str = Field(default="", description="Template's name.")
    args: dict[str, str] = Field(
        default={},
        description="Arguments given to the template, if any.",
    )
    expansion: str = Field(
        default="", description="The result of expanding the template."
    )

112 

113 

114# The highest level entry: This is returned from the program as a JSON object 

115# in the JSONL output. 

class WordEntry(SimpleEnglishBaseModel):
    # Top-level record for one entry. `word` is the only field without a
    # default value.
    model_config = ConfigDict(title="Simple English Wiktionary")

    word: str = Field(description="Word string")
    forms: list[Form] = Field(
        default=[],
        description="Inflection forms list",
    )

    # For Simple English, the language is always English. We still do not
    # use "en" as the default value here, because we also remove all default
    # values so that we don't have empty or meaningless fields in the output.
    lang_code: str = Field(
        default="",
        description="Wiktionary language code",
    )
    lang: str = Field(
        default="",
        description="Localized language name",
    )

    pos: str = Field(default="", description="Part of speech type")
    # `==Noun==`
    pos_title: str = ""
    # `==Noun 2==` Default -1 gets removed.
    pos_num: int = -1

    # Simple Wiktionary doesn't have numbered etymology sections.
    etymology_text: str = Field(
        default="",
        description="Etymology section as cleaned text.",
    )
    etymology_templates: list[TemplateData] = Field(
        default=[],
        description=(
            "Templates and their arguments and expansions from the "
            "etymology section."
        ),
    )

    senses: list[Sense] = []
    title: str = Field(
        default="",
        description="Redirect page source title",
    )
    redirect: str = Field(
        default="",
        description="Redirect page target title",
    )
    categories: list[str] = []
    sounds: list[Sound] = []
    tags: list[str] = []
    raw_tags: list[str] = []
    # Should be a list `hyphenations`.
    hyphenation: str = ""
    head_templates: list[TemplateData] = []