Coverage for src/wiktextract/extractor/nl/tags.py: 84%

23 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from .models import WordEntry 

2 

3# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen 

# Verb labels used on nl.wiktionary mapped to the English tag string that
# translate_raw_tags() stores in `tags`. The wiki template each label comes
# from is noted next to its entry. This table is merged into TAGS.
4VERB_TAGS = { 

5 "ergatief": "ergative", # Sjabloon:erga 

6 "inergatief": "unergative", # Sjabloon:inerg 

7 "hulpwerkwoord": "auxiliary", # Sjabloon:auxl 

8} 

9 

10# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels 

# Context labels mapped to English tag(s). A value is either a single tag
# string or a list of tag strings; translate_raw_tags() accepts both shapes
# (append for a string, extend for a list). Entries that are commented out
# have no mapping in this table, so they are not translated through it.
# This table is merged into TAGS.
11GLOSS_TAGS = { 

12 "figuurlijk": "figuratively", 

13 "afkorting": "abbreviation", 

14 "causatief": "causative", 

15 # "chattaal": "", 

16 "dichterlijk": "poetic", 

17 "eufemisme": "euphemistic", 

18 "familienaam": "surname", 

19 "formeel": "formal", 

20 "gezegde": "proverb", 

21 # "heteroniem": "heteronym", 

22 "historisch": "historical", 

23 "informeel": "informal", 

24 "initiaalwoord": "acronym", 

25 # "klemtoonhomogram": "", 

26 "krachtterm": "vulgar", 

27 # "leesteken": "punctuation", 

28 "letterwoord": "acronym", 

29 "middeleeuwen": "Middle-Ages", 

30 "vrouwelijke naam": ["feminine", "name"], 

31 "mannelijke naam": ["masculine", "name"], 

32 "mannelijke en vrouwelijke naam": ["masculine", "feminine", "name"], 

33 "neologisme": "neologism", 

34 "oudheid": "archaic", 

35 # "palindroom": "palindrome", 

36 "pejoratief": "pejorative", 

37 "persoon": "person", 

38 # "pregnant": "extra meaning", 

39 "samenkoppeling": "compound", 

40 # "sanitair": "", 

41 "scheldwoord": "pejorative", 

42 "schertsend": "humorous", 

43 "spottend": "ironic", 

44 "spreektaal": "vernacular", 

45 "spreekwoord": "proverb", 

46 # "stopwoord": "filled pause", 

47 "straattaal": "slang", 

48 "streektaal": "regiolectal", 

49 # "taal": "language", 

50 "toponiem": "toponymic", 

51 "verkorting": "clipping", 

52 "verouderd": "obsolete", 

53 "Vroegnieuwnederlands": "Early-Modern-Dutch", 

54 "vulgair": "vulgar", 

55 "zegswijze": "idiomatic", 

56 "zeldzaam": "rare", 

57 "Latijns-Amerika": "Latin-America", 

58} 

59 

# Grammatical labels mapped to English tag(s), grouped by the wiki template
# named in each section comment (noun, adjective comparison, verb).
# NOTE(review): going by the name, these are presumably header/cell labels of
# inflection tables -- confirm against the code that fills `raw_tags`.
# A value is either one tag string or a list of tag strings.
# This table is merged into TAGS.
60TABLE_TAGS = { 

61 # Sjabloon:-nlnoun- 

62 "enkelvoud": "singular", 

63 "meervoud": "plural", 

64 "verkleinwoord": "diminutive", 

65 # Sjabloon:adjcomp 

66 "stellend": "positive", 

67 "vergrotend": "comparative", 

68 "overtreffend": "superlative", 

69 "onverbogen": "uninflected", 

70 "verbogen": "inflected", 

71 "partitief": "partitive", 

72 # Sjabloon:-nlverb- 

73 "onbepaalde wijs": "infinitive", 

74 "kort": "short-form", 

75 "onvoltooid": "imperfect", 

76 "tegenwoordig": "present", 

77 "toekomend": "future", 

78 "voltooid": "perfect", 

79 "onvoltooid deelwoord": ["imperfect", "participle"], 

80 "voltooid deelwoord": ["past", "participle"], 

81 "gebiedende wijs": "imperative", 

82 "aanvoegende wijs": "subjunctive", 

83 "aantonende wijs": "indicative", 

84 "eerste": "first-person", 

85 "tweede": "second-person", 

86 "derde": "third-person", 

87 "verleden": "past", 

88 "voorwaardelijk": "conditional", 

89} 

90 

91 

# Single lookup table used by translate_raw_tags(): the union of the three
# tag tables. With `**` unpacking a key present in more than one table takes
# its value from the right-most table that defines it. TAGS is consulted
# before TOPICS, so a label present in both is treated as a tag.
92TAGS = {**VERB_TAGS, **GLOSS_TAGS, **TABLE_TAGS} 

93 

94# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels 

# Subject-area labels mapped to English topic(s). translate_raw_tags() only
# consults this table for labels that are not keys of TAGS, and stores the
# result in `topics`. A value is either one topic string or a list of topic
# strings. Trailing comments on some entries give a more literal English
# rendering of the label. Entries that are commented out have no mapping in
# this table.
95TOPICS = { 

96 "aardrijkskunde": "geography", 

97 "adel": "nobility", 

98 "anatomie": "anatomy", 

99 "antropologie": "anthropology", 

100 "archeologie": "archaeology", 

101 "astrologie": "astrology", 

102 "astronomie": "astronomy", 

103 # "bacteriën": "bacterium", 

104 # "badminton": "badminton", 

105 "basketbal": "basketball", 

106 "bedrijf": "business", 

107 "bedrijfskunde": "business", # "business administration", 

108 # "bedrijfstak": "industrial branch", 

109 "beeldhouwkunst": "arts", # "sculpting" 

110 # "beroep": "profession", 

111 "beschrijvende plantkunde": "botany", # "descriptive botany" 

112 # "bidsprinkhanen": "mantises", 

113 "biochemie": "biochemistry", 

114 "biologie": "biology", 

115 "bloemplanten": "botany", 

116 "boekbinderij": "bookbinding", 

117 "boekhouding": "accounting", 

118 "bosbouw": "forestry", 

119 "bouwkunde": "architecture", 

120 # "breukgetal": "", 

121 "bridge": "bridge", 

122 # "buideldieren": "marsupial", 

123 # "buikpotigen": "", 

124 # "buissnaveligen": "", 

125 # "buistandigen": "", 

126 # "cloacadieren": "monotreme", 

127 "communicatie": "communications", 

128 # "coniferen": "conifers", 

129 "cosmetica": "cosmetics", 

130 "cryptografie": "cryptography", 

131 # "cultuur": "culture", 

132 "dag": "weekday", 

133 "dans": "dance", 

134 "demografie": "demography", 

135 "demoniem": "demonym", 

136 "dichtkunst": "poetry", 

137 # "dierengeluid": "animal sound", 

138 "diergeneeskunde": ["veterinary", "medicine"], 

139 "dierkunde": "zoology", 

140 # "dierluizen": "", 

141 "diplomatie": "diplomacy", 

142 "drinken": "beverages", 

143 # "duifachtigen": "", 

144 # "duikers": "", 

145 # "dysfemisme": "dysphemism", 

146 "ecologie": "ecology", 

147 "economie": "economics", 

148 # "eendvogels": "anseriform", 

149 # "eenheid": "", 

150 "effectenhandel": "trading", 

151 "egyptologie": "Egyptology", 

152 # "toponiem: eiland": "", 

153 "elektronica": "electronics", 

154 "elektrotechniek": "electrical-engineering", 

155 # "element": "element", 

156 "emotie": "emotion", 

157 # "evenhoevigen": "", 

158 # "familie": "family", 

159 "farmacologie": "pharmacology", 

160 # "feest": "party", 

161 "fietsen": "cycling", 

162 "filatelie": "philately", 

163 "filmkunst": "cinematography", 

164 "filosofie": "philosophy", 

165 "financieel": "financial", 

166 # "flamingoachtigen": "", 

167 "folklore": "folklore", 

168 "fotografie": "photography", 

169 # "fruit": "fruit", 

170 # "futen": "grebe", 

171 "fysiologie": "physiology", 

172 "genetica": "genetics", 

173 # "gentachtigen": "", 

174 "geologie": "geology", 

175 "geopolitiek": "geopolitics", 

176 "gereedschap": "tools", 

177 "geschiedenis": "history", 

178 "glaciologie": "glaciology", 

179 # "godheid": "deity", 

180 # "graan": "grain", 

181 "grammatica": "grammar", 

182 "groente": "vegetable", 

183 # "grondmechanica": "", 

184 "haar": "hairstyle", 

185 "handel": "business", 

186 "heraldiek": "heraldry", 

187 "hobby": "hobbies", 

188 "hoofddeksel": "headgear", 

189 # "horeca": "", 

190 "houtbewerking": "woodworking", 

191 # "huishouden": "housekeeping", 

192 "imkerij": "beekeeping", 

193 # "industrie": "industry", 

194 "informatica": "computer sciences", 

195 "internet": "Internet", 

196 # "jaarwisseling": "", 

197 "jachttaal": "hunting", 

198 # "jongerentaal": "", 

199 "juridisch": "legal", 

200 "kaartspel": "card-games", 

201 # "kamperen": "camping", 

202 # "kerst": "Christmas", 

203 # "kindertaal": "child language", 

204 "kleding": "clothing", 

205 "kleur": "colour", 

206 # "knutselen": "", 

207 "kookkunst": "culinary", 

208 # "krachtsport": "", 

209 "kristallografie": "crystallography", 

210 # "kruid": "", 

211 # "kuiperij": "", 

212 "kunst": "arts", 

213 "landbouw": "agriculture", 

214 "landmeetkunde": "surveying", 

215 "leenstelsel": "feudalism", 

216 # "leerbewerking": "", 

217 # "leidekkerij": "", 

218 "letterkunde": "literature", 

219 "lhbt": "LGBT", 

220 "logica": "logic", 

221 "luchtvaart": "aviation", 

222 # "maatschappij": "company", 

223 # "magie": "magic", 

224 "makelaardij": "real-estate", 

225 # "materiaalkunde": "materials science", 

226 # "media": "", 

227 "medisch": "medicine", 

228 # "meer": "lake", 

229 "meetkunde": "geometry", 

230 "metaalbewerking": "metalworking", 

231 "metallurgie": "metallurgy", 

232 "klimatologie": "climatology", 

233 "meteorologie": "meteorology", 

234 # "metonymisch": "", 

235 "meubel": "furniture", 

236 "mijnbouw": "mining", 

237 "milieukunde": "ecology", 

238 "militair": "military", 

239 "mineraal": "mining", 

240 "mineralogie": "mineralogy", 

241 # "misdaad": "crime", 

242 "mode": "fashion", 

243 # "molenaarsambacht": "", 

244 "muziek": "music", 

245 "muziekinstrument": "music", 

246 "mycologie": "mycology", 

247 "mythologie": "mythology", 

248 "natuurkunde": "physics", 

249 "neurologie": "neurology", 

250 "numismatiek": "numismatics", 

251 "oenologie": "oenology", 

252 "onderwijs": "education", 

253 "oorlog": "war", 

254 "optica": "optics", 

255 "ordehandhaving": "law enforcement", 

256 # "paardrijden": "horseriding", 

257 # "planologie": "planology", 

258 "plantkunde": "botany", 

259 "politiek": "politics", 

260 "post": "mail", 

261 "psychologie": "psychology", 

262 "regering": "government", 

263 "religie": "religion", 

264 # "ruimtevaart": "space travel", 

265 "schaak": "chess", 

266 "scheepvaart": "shipping", 

267 "scheikunde": "chemistry", 

268 # "schilderkunst": "painting", 

269 # "schoeisel": "shoewear", 

270 "scouting": "scouting", 

271 "seismologie": "seismology", 

272 "seksualiteit": "sexuality", 

273 "sieraad": "jewellery", 

274 # "slapen": "sleep", 

275 # "snoepgoed": "candy", 

276 "sociologie": "sociology", 

277 # "specerij": "spice", 

278 "speelgoed": "toys", 

279 "spel": "games", 

280 # "spellingsalfabet": "spelling alphabet", 

281 "spoorwegen": "railways", 

282 "sport": "sports", 

283 "statistiek": "statistics", 

284 # "sterrenbeeld": "constellation", 

285 "valutanaam": "money", 

286 "taalkunde": "linguistics", 

287 "tandheelkunde": "dentistry", 

288 "techniek": "technology", 

289 # "teken- en schrijfmateriaal": "", 

290 "tekstkritiek": "textual criticism", 

291 "telecommunicatie": "telecommunications", 

292 "tennis": "tennis", 

293 "textiel": "textiles", 

294 "textielindustrie": "textiles", 

295 "thermodynamica": "thermodynamics", 

296 # "tijdrekening": "timekeeping", 

297 "toerisme": "tourism", 

298 "toneel": "theater", 

299 "transport": "transport", 

300 "tuinbouw": "horticulture", 

301 # "tuinieren": "gardening", 

302 "typografie": "typography", 

303 "valkerij": "falconry", 

304 # "veeteelt": "husbandry", 

305 "verkeer": "traffic", 

306 "visserij": "fishing", 

307 "voeding": "food", 

308 "voetbal": "football", 

309 "volleybal": "volleyball", 

310 # "waterbeheer": "water management", 

311 "wegenbouw": ["road", "construction"], 

312 "werelddeel": "continents", 

313 "werktuigbouwkunde": "mechanical-engineering", 

314 "wetenschap": "sciences", 

315 "wielrennen": "cycling", 

316 # "Wikimedia": "Wikimedia", 

317 # "wikitaal": "", 

318 # "windstreek": "", 

319 # "wintersport": "", 

320 "wiskunde": "mathematics", 

321 # "wonen": "", 

322 "zoötomie": "zootomy", 

323 "zwemmen": "swimming", 

324} 

325 

326 

def translate_raw_tags(data: WordEntry) -> None:
    """Sort the labels in ``data.raw_tags`` into tags, topics and leftovers.

    Labels are processed in order.  A label that is a key of ``TAGS``
    contributes its translation to ``data.tags``; otherwise a label that is
    a key of ``TOPICS`` contributes its translation to ``data.topics``.  A
    translation is either a single string (appended) or a list of strings
    (all of its items are added).  Labels found in neither table are kept,
    in their original order, as the new value of ``data.raw_tags``.

    ``data`` is modified in place; nothing is returned.
    """
    unmatched = []
    for label in data.raw_tags:
        # TAGS takes precedence: a label present in both tables is a tag.
        if label in TAGS:
            translation = TAGS[label]
            if isinstance(translation, list):
                data.tags.extend(translation)
            elif isinstance(translation, str):
                data.tags.append(translation)
            continue

        if label in TOPICS:
            translation = TOPICS[label]
            if isinstance(translation, list):
                data.topics.extend(translation)
            elif isinstance(translation, str):
                data.topics.append(translation)
            continue

        # Unknown label: keep it so it is not lost.
        unmatched.append(label)

    data.raw_tags = unmatched