Coverage for src/wiktextract/extractor/nl/tags.py: 85%

25 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1from .models import WordEntry 

2 

# Names of the templates treated as gloss tag templates.
# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
GLOSS_TAG_TEMPLATES = frozenset(
    [
        "absol",
        "accus",
        "auxl",
        "copl",
        "deponens",
        "ditr",
        "erga",
        "inerg",
        "intr",
        "modl",
        "onpr",
        "ov",
        "rcpq",
        "refl",
        "s-verb",
        "plurt",
        "singt",
        "versterkend voorvoegsel",
    ]
)

27 

28 

# Maps Dutch gloss labels to tags. A value is either a single tag string or a
# list of tag strings. Commented-out keys are labels that are deliberately
# left untranslated (presumably until a suitable tag exists).
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
GLOSS_TAGS = {
    "figuurlijk": "figuratively",
    "afkorting": "abbreviation",
    "causatief": "causative",
    # "chattaal": "",
    "dichterlijk": "poetic",
    "eufemisme": "euphemistic",
    "familienaam": "surname",
    "formeel": "formal",
    "gezegde": "proverb",
    # "heteroniem": "heteronym",
    "historisch": "historical",
    "informeel": "informal",
    "initiaalwoord": "acronym",
    # "klemtoonhomogram": "",
    "krachtterm": "vulgar",
    # "leesteken": "punctuation",
    "letterwoord": "acronym",
    "middeleeuwen": "Middle-Ages",
    "vrouwelijke naam": ["feminine", "name"],
    "mannelijke naam": ["masculine", "name"],
    "mannelijke en vrouwelijke naam": ["masculine", "feminine", "name"],
    "neologisme": "neologism",
    "oudheid": "archaic",
    # "palindroom": "palindrome",
    "pejoratief": "pejorative",
    "persoon": "person",
    # "pregnant": "extra meaning",
    "samenkoppeling": "compound",
    # "sanitair": "",
    "scheldwoord": "pejorative",
    "schertsend": "humorous",
    "spottend": "ironic",
    "spreektaal": "vernacular",
    "spreekwoord": "proverb",
    # "stopwoord": "filled pause",
    "straattaal": "slang",
    "streektaal": "regiolectal",
    "taal": "linguistics",
    "toponiem": "toponymic",
    "verkorting": "clipping",
    "verouderd": "obsolete",
    "Vroegnieuwnederlands": "Early-Modern-Dutch",
    "vulgair": "vulgar",
    "zegswijze": "idiomatic",
    "zeldzaam": "rare",
    "Latijns-Amerika": "Latin-America",
    "absoluut": "absolute",  # Sjabloon:absol
    "accusatief": "accusative",  # Sjabloon:accus
    "hulpwerkwoord": "auxiliary",  # Sjabloon:auxl
    "koppelwerkwoord": "copulative",  # Sjabloon:copl
    "deponens": "deponent",
    "ditransitief": "ditransitive",  # Sjabloon:ditr
    "ergatief": "ergative",  # Sjabloon:erga
    "inergatief": "unergative",  # Sjabloon:inerg
    "onovergankelijk": "intransitive",  # Sjabloon:intr
    "modaal werkwoord": ["modal", "verb"],  # Sjabloon:modl
    "onpersoonlijk": "impersonal",  # Sjabloon:onpr
    "overgankelijk": "transitive",  # Sjabloon:ov
    "wederkerig": "reciprocal",  # Sjabloon:rcpq
    "wederkerend": "reflexive",  # Sjabloon:refl
    "alleen meervoud": "plural-only",  # Sjabloon:plurt
    "geen meervoud": "no-plural",  # Sjabloon:singt
    "versterkend voorvoegsel": ["intensifier", "prefix"],
    "in een bijzin": "with-subordinate-clause",  # Sjabloon:ovt-mv-bijz
    "bij inversie": "inversion",  # Sjabloon:1ps
    "Noord-Nederland": "Northern-Netherland",
    "Vlaanderen": "Flanders",
    "Brabant": "Brabant",
    "Limburg": "Limburg",
}

102 

# Maps Dutch table labels to tags, grouped by the template each label comes
# from. A value is either a single tag string or a list of tag strings.
TABLE_TAGS = {
    # Sjabloon:-nlnoun-
    "enkelvoud": "singular",
    "meervoud": "plural",
    "verkleinwoord": "diminutive",
    "bezitsvorm": "possessive",
    # Sjabloon:adjcomp
    "stellend": "positive",
    "vergrotend": "comparative",
    "overtreffend": "superlative",
    "onverbogen": "uninflected",
    "verbogen": "inflected",
    "partitief": "partitive",
    # Sjabloon:-nlverb-
    "onbepaalde wijs": "infinitive",
    "kort": "short-form",
    "lang": "long-form",
    "onvoltooid": "imperfect",
    "tegenwoordig": "present",
    "toekomend": "future",
    "voltooid": "perfect",
    "onvoltooid deelwoord": ["imperfect", "participle"],
    "voltooid deelwoord": ["past", "participle"],
    "gebiedende wijs": "imperative",
    "aanvoegende wijs": "subjunctive",
    "aantonende wijs": "indicative",
    "eerste": "first-person",
    "tweede": "second-person",
    "derde": "third-person",
    "verleden": "past",
    "voorwaardelijk": "conditional",
    "hoofdzin": "main-clause",
    "bijzin": "subordinate-clause",
    # Sjabloon:-nlname-
    "nominatief": "nominative",
    "genitief": "genitive",
    # Sjabloon:-denoun-
    "datief": "dative",
    "accusatief": "accusative",
    # Sjabloon:-nlverb-reflex-
    "tegenwoordige tijd": "present",
    "verleden tijd": "past",
    "toekomende tijd": "future",
    "1": "first-person",
    "2": "second-person",
    "3": "third-person",
    "voltooide tijd": "past",
    # Sjabloon:-dumverb-
    "onv. deelwoord": ["imperfect", "participle"],
    "volt deelwoord": ["past", "participle"],
    "aantonend": "indicative",
    "aanvoegend": "subjunctive",
}

156 

157 

# Maps header-line labels to their list of tags.
HEADER_LINE_TAGS = {
    "dim. tant.": ["diminutive", "noun"],  # Sjabloon:dimt
}

161 

162 

# Single lookup table merged from the three tag tables. On a key collision
# the later table wins; "accusatief", for example, appears in both GLOSS_TAGS
# and TABLE_TAGS with the same value, so the order does not matter for it.
TAGS = {**GLOSS_TAGS, **TABLE_TAGS, **HEADER_LINE_TAGS}

164 

# Maps Dutch context labels to topics. A value is either a single topic
# string or a list of topic strings. Commented-out keys are labels that are
# deliberately left untranslated (presumably until a suitable topic exists).
# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
TOPICS = {
    "aardrijkskunde": "geography",
    "adel": "nobility",
    "anatomie": "anatomy",
    "antropologie": "anthropology",
    "archeologie": "archaeology",
    "astrologie": "astrology",
    "astronomie": "astronomy",
    # "bacteriën": "bacterium",
    # "badminton": "badminton",
    "basketbal": "basketball",
    "bedrijf": "business",
    "bedrijfskunde": "business",  # "business administration",
    # "bedrijfstak": "industrial branch",
    "beeldhouwkunst": "arts",  # "sculpting"
    # "beroep": "profession",
    "beschrijvende plantkunde": "botany",  # "descriptive botany"
    # "bidsprinkhanen": "mantises",
    "biochemie": "biochemistry",
    "biologie": "biology",
    "bloemplanten": "botany",
    "boekbinderij": "bookbinding",
    "boekhouding": "accounting",
    "bosbouw": "forestry",
    "bouwkunde": "architecture",
    # "breukgetal": "",
    "bridge": "bridge",
    # "buideldieren": "marsupial",
    # "buikpotigen": "",
    # "buissnaveligen": "",
    # "buistandigen": "",
    # "cloacadieren": "monotreme",
    "communicatie": "communications",
    # "coniferen": "conifers",
    "cosmetica": "cosmetics",
    "cryptografie": "cryptography",
    # "cultuur": "culture",
    "dag": "weekday",
    "dans": "dance",
    "demografie": "demography",
    "demoniem": "demonym",
    "dichtkunst": "poetry",
    # "dierengeluid": "animal sound",
    "diergeneeskunde": ["veterinary", "medicine"],
    "dierkunde": "zoology",
    # "dierluizen": "",
    "diplomatie": "diplomacy",
    "drinken": "beverages",
    # "duifachtigen": "",
    # "duikers": "",
    # "dysfemisme": "dysphemism",
    "ecologie": "ecology",
    "economie": "economics",
    # "eendvogels": "anseriform",
    "eenheid": "units-of-measure",
    "effectenhandel": "trading",
    "egyptologie": "Egyptology",
    # "toponiem: eiland": "",
    "elektronica": "electronics",
    "elektrotechniek": "electrical-engineering",
    # "element": "element",
    "emotie": "emotion",
    # "evenhoevigen": "",
    "familie": "familiar",
    "farmacologie": "pharmacology",
    # "feest": "party",
    "fietsen": "cycling",
    "filatelie": "philately",
    "filmkunst": "cinematography",
    "filosofie": "philosophy",
    "financieel": "financial",
    # "flamingoachtigen": "",
    "folklore": "folklore",
    "fotografie": "photography",
    # "fruit": "fruit",
    # "futen": "grebe",
    "fysiologie": "physiology",
    "genetica": "genetics",
    # "gentachtigen": "",
    "geologie": "geology",
    "geopolitiek": "geopolitics",
    "gereedschap": "tools",
    "geschiedenis": "history",
    "glaciologie": "glaciology",
    # "godheid": "deity",
    # "graan": "grain",
    "grammatica": "grammar",
    "groente": "vegetable",
    # "grondmechanica": "",
    "haar": "hairstyle",
    "handel": "business",
    "heraldiek": "heraldry",
    "hobby": "hobbies",
    "hoofddeksel": "headgear",
    # "horeca": "",
    "houtbewerking": "woodworking",
    # "huishouden": "housekeeping",
    "imkerij": "beekeeping",
    # "industrie": "industry",
    "informatica": "computer sciences",
    "internet": "Internet",
    # "jaarwisseling": "",
    "jachttaal": "hunting",
    # "jongerentaal": "",
    "juridisch": "legal",
    "kaartspel": "card-games",
    # "kamperen": "camping",
    # "kerst": "Christmas",
    # "kindertaal": "child language",
    "kleding": "clothing",
    "kleur": "colour",
    # "knutselen": "",
    "kookkunst": "culinary",
    # "krachtsport": "",
    "kristallografie": "crystallography",
    # "kruid": "",
    # "kuiperij": "",
    "kunst": "arts",
    "landbouw": "agriculture",
    "landmeetkunde": "surveying",
    "leenstelsel": "feudalism",
    # "leerbewerking": "",
    # "leidekkerij": "",
    "letterkunde": "literature",
    "lhbt": "LGBT",
    "logica": "logic",
    "luchtvaart": "aviation",
    # "maatschappij": "company",
    # "magie": "magic",
    "makelaardij": "real-estate",
    # "materiaalkunde": "materials science",
    # "media": "",
    "medisch": "medicine",
    # "meer": "lake",
    "meetkunde": "geometry",
    "metaalbewerking": "metalworking",
    "metallurgie": "metallurgy",
    "klimatologie": "climatology",
    "meteorologie": "meteorology",
    # "metonymisch": "",
    "meubel": "furniture",
    "mijnbouw": "mining",
    "milieukunde": "ecology",
    "militair": "military",
    "mineraal": "mining",
    "mineralogie": "mineralogy",
    # "misdaad": "crime",
    "mode": "fashion",
    # "molenaarsambacht": "",
    "muziek": "music",
    "muziekinstrument": "music",
    "mycologie": "mycology",
    "mythologie": "mythology",
    "natuurkunde": "physics",
    "neurologie": "neurology",
    "numismatiek": "numismatics",
    "oenologie": "oenology",
    "onderwijs": "education",
    "oorlog": "war",
    "optica": "optics",
    "ordehandhaving": "law enforcement",
    # "paardrijden": "horseriding",
    # "planologie": "planology",
    "plantkunde": "botany",
    "politiek": "politics",
    "post": "mail",
    "psychologie": "psychology",
    "regering": "government",
    "religie": "religion",
    # "ruimtevaart": "space travel",
    "schaak": "chess",
    "scheepvaart": "shipping",
    "scheikunde": "chemistry",
    # "schilderkunst": "painting",
    # "schoeisel": "shoewear",
    "scouting": "scouting",
    "seismologie": "seismology",
    "seksualiteit": "sexuality",
    "sieraad": "jewellery",
    # "slapen": "sleep",
    # "snoepgoed": "candy",
    "sociologie": "sociology",
    # "specerij": "spice",
    "speelgoed": "toys",
    "spel": "games",
    # "spellingsalfabet": "spelling alphabet",
    "spoorwegen": "railways",
    "sport": "sports",
    "statistiek": "statistics",
    # "sterrenbeeld": "constellation",
    "valutanaam": "money",
    "taalkunde": "linguistics",
    "tandheelkunde": "dentistry",
    "techniek": "technology",
    # "teken- en schrijfmateriaal": "",
    "tekstkritiek": "textual criticism",
    "telecommunicatie": "telecommunications",
    "tennis": "tennis",
    "textiel": "textiles",
    "textielindustrie": "textiles",
    "thermodynamica": "thermodynamics",
    # "tijdrekening": "timekeeping",
    "toerisme": "tourism",
    "toneel": "theater",
    "transport": "transport",
    "tuinbouw": "horticulture",
    # "tuinieren": "gardening",
    "typografie": "typography",
    "valkerij": "falconry",
    # "veeteelt": "husbandry",
    "verkeer": "traffic",
    "visserij": "fishing",
    "voeding": "food",
    "voetbal": "football",
    "volleybal": "volleyball",
    # "waterbeheer": "water management",
    "wegenbouw": ["road", "construction"],
    "werelddeel": "continents",
    "werktuigbouwkunde": "mechanical-engineering",
    "wetenschap": "sciences",
    "wielrennen": "cycling",
    # "Wikimedia": "Wikimedia",
    # "wikitaal": "",
    # "windstreek": "",
    # "wintersport": "",
    "wiskunde": "mathematics",
    # "wonen": "",
    "zoötomie": "zootomy",
    "zwemmen": "swimming",
    "toponiem: land": "country",  # Template:land
}

397 

398 

def translate_raw_tags(data: WordEntry) -> None:
    """Translate the labels in ``data.raw_tags`` in place.

    Each raw tag is looked up in ``TAGS`` first and, failing that, in
    ``TOPICS``. The ``TOPICS`` lookup only applies when ``data`` has a
    ``topics`` attribute. A match is added to ``data.tags`` or
    ``data.topics`` respectively: appended when the table value is a string,
    extended when it is a list. Matched raw tags are dropped; every other
    raw tag is kept, in its original order, in the list that replaces
    ``data.raw_tags``.

    Args:
        data: Entry whose ``raw_tags`` are translated. Its ``tags`` list
            (and ``topics`` list, if present) is mutated.
    """
    untranslated = []
    for label in data.raw_tags:
        if label in TAGS:
            target, mapped = data.tags, TAGS[label]
        elif label in TOPICS and hasattr(data, "topics"):
            target, mapped = data.topics, TOPICS[label]
        else:
            untranslated.append(label)
            continue
        # A table value is either one string or a list of strings.
        if isinstance(mapped, str):
            target.append(mapped)
        elif isinstance(mapped, list):
            target.extend(mapped)
    data.raw_tags = untranslated

417 

418 

# Maps short template names to tags.
# used in translation, linkage and gloss lists
LIST_ITEM_TAG_TEMPLATES = {
    "m": "masculine",
    "f": "feminine",
    "n": "neuter",
    "c": "common",
    "s": "singular",
    "p": "plural",
    "a": "animate",
    "i": "inanimate",
    "impf": "imperfective",
    "pf": "perfective",
}

427 "a": "animate", 

428 "i": "inanimate", 

429 "impf": "imperfective", 

430 "pf": "perfective", 

431}