Coverage for src/wiktextract/extractor/nl/tags.py: 85%

25 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from .models import WordEntry 

2 

# Template categories on nl.wiktionary these names were collected from:
# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
#
# Immutable set of template names; only membership tests are meaningful here.
GLOSS_TAG_TEMPLATES = frozenset(
    {
        "absol",
        "accus",
        "auxl",
        "copl",
        "deponens",
        "ditr",
        "erga",
        "inerg",
        "intr",
        "modl",
        "onpr",
        "ov",
        "rcpq",
        "refl",
        "s-verb",
        "plurt",
        "singt",
        "versterkend voorvoegsel",
    }
)

27 

28 

# Dutch gloss labels -> English tag(s).
# Label sources:
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
#
# A value is either one tag (str) or several tags (list of str).
# Commented-out keys are labels that have no mapping yet.
GLOSS_TAGS = {
    "figuurlijk": "figuratively",
    "afkorting": "abbreviation",
    "causatief": "causative",
    # "chattaal": "",
    "dichterlijk": "poetic",
    "eufemisme": "euphemistic",
    "familienaam": "surname",
    "formeel": "formal",
    "gezegde": "proverb",
    # "heteroniem": "heteronym",
    "historisch": "historical",
    "informeel": "informal",
    "initiaalwoord": "acronym",
    # "klemtoonhomogram": "",
    "krachtterm": "vulgar",
    # "leesteken": "punctuation",
    "letterwoord": "acronym",
    "middeleeuwen": "Middle-Ages",
    "vrouwelijke naam": ["feminine", "name"],
    "mannelijke naam": ["masculine", "name"],
    "mannelijke en vrouwelijke naam": ["masculine", "feminine", "name"],
    "neologisme": "neologism",
    "oudheid": "archaic",
    # "palindroom": "palindrome",
    "pejoratief": "pejorative",
    "persoon": "person",
    # "pregnant": "extra meaning",
    "samenkoppeling": "compound",
    # "sanitair": "",
    "scheldwoord": "pejorative",
    "schertsend": "humorous",
    "spottend": "ironic",
    "spreektaal": "vernacular",
    "spreekwoord": "proverb",
    # "stopwoord": "filled pause",
    "straattaal": "slang",
    "streektaal": "regiolectal",
    # NOTE(review): TOPICS maps "taalkunde" to the same value; because this
    # key lives in a tag table, "linguistics" ends up in tags, not topics
    # -- confirm that is intended.
    "taal": "linguistics",
    "toponiem": "toponymic",
    "verkorting": "clipping",
    "verouderd": "obsolete",
    "Vroegnieuwnederlands": "Early-Modern-Dutch",
    "vulgair": "vulgar",
    "zegswijze": "idiomatic",
    "zeldzaam": "rare",
    "Latijns-Amerika": "Latin-America",
    # Expanded text of the verb/noun templates (template name on the right).
    "absoluut": "absolute",  # Sjabloon:absol
    "accusatief": "accusative",  # Sjabloon:accus
    "hulpwerkwoord": "auxiliary",  # Sjabloon:auxl
    "koppelwerkwoord": "copulative",  # Sjabloon:copl
    "deponens": "deponent",
    "ditransitief": "ditransitive",  # Sjabloon:ditr
    "ergatief": "ergative",  # Sjabloon:erga
    "inergatief": "unergative",  # Sjabloon:inerg
    "onovergankelijk": "intransitive",  # Sjabloon:intr
    "modaal werkwoord": ["modal", "verb"],  # Sjabloon:modl
    "onpersoonlijk": "impersonal",  # Sjabloon:onpr
    "overgankelijk": "transitive",  # Sjabloon:ov
    "wederkerig": "reciprocal",  # Sjabloon:rcpq
    "wederkerend": "reflexive",  # Sjabloon:refl
    "alleen meervoud": "plural-only",  # Sjabloon:plurt
    "geen meervoud": "no-plural",  # Sjabloon:singt
    "versterkend voorvoegsel": ["intensifier", "prefix"],
    "in een bijzin": "with-subordinate-clause",  # Sjabloon:ovt-mv-bijz
    "bij inversie": "inversion",  # Sjabloon:1ps
}

98 

# Dutch inflection-table header text -> English tag(s), grouped by the
# template each header was first seen in. A value is one tag (str) or
# several tags (list of str).
TABLE_TAGS = {
    # --- Sjabloon:-nlnoun- ---
    "enkelvoud": "singular",
    "meervoud": "plural",
    "verkleinwoord": "diminutive",
    "bezitsvorm": "possessive",
    # --- Sjabloon:adjcomp ---
    "stellend": "positive",
    "vergrotend": "comparative",
    "overtreffend": "superlative",
    "onverbogen": "uninflected",
    "verbogen": "inflected",
    "partitief": "partitive",
    # --- Sjabloon:-nlverb- ---
    "onbepaalde wijs": "infinitive",
    "kort": "short-form",
    "onvoltooid": "imperfect",
    "tegenwoordig": "present",
    "toekomend": "future",
    "voltooid": "perfect",
    "onvoltooid deelwoord": ["imperfect", "participle"],
    "voltooid deelwoord": ["past", "participle"],
    "gebiedende wijs": "imperative",
    "aanvoegende wijs": "subjunctive",
    "aantonende wijs": "indicative",
    "eerste": "first-person",
    "tweede": "second-person",
    "derde": "third-person",
    "verleden": "past",
    "voorwaardelijk": "conditional",
    "hoofdzin": "main-clause",
    "bijzin": "subordinate-clause",
    # --- Sjabloon:-nlname- ---
    "nominatief": "nominative",
    "genitief": "genitive",
    # --- Sjabloon:-denoun- ---
    "datief": "dative",
    "accusatief": "accusative",
    # --- Sjabloon:-nlverb-reflex- ---
    "tegenwoordige tijd": "present",
    "verleden tijd": "past",
    "toekomende tijd": "future",
    "1": "first-person",
    "2": "second-person",
    "3": "third-person",
    # NOTE(review): "voltooid" above maps to "perfect" while this header
    # maps to "past" -- confirm the difference is deliberate.
    "voltooide tijd": "past",
    # --- Sjabloon:-dumverb- ---
    "onv. deelwoord": ["imperfect", "participle"],
    "volt deelwoord": ["past", "participle"],
    "aantonend": "indicative",
    "aanvoegend": "subjunctive",
}

151 

152 

# Dutch header-line label -> English tags. The single key is the text
# associated with Sjabloon:dimt.
HEADER_LINE_TAGS = {"dim. tant.": ["diminutive", "noun"]}

156 

157 

# Combined tag lookup table. Later tables win on duplicate keys while the
# key keeps the position of its first occurrence.
TAGS = dict(GLOSS_TAGS)
TAGS.update(TABLE_TAGS)
TAGS.update(HEADER_LINE_TAGS)

159 

# Dutch context labels -> English topic(s).
# Label source:
# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
#
# A value is one topic (str) or several topics (list of str).
# Commented-out keys are labels that have no mapping yet.
TOPICS = {
    "aardrijkskunde": "geography",
    "adel": "nobility",
    "anatomie": "anatomy",
    "antropologie": "anthropology",
    "archeologie": "archaeology",
    "astrologie": "astrology",
    "astronomie": "astronomy",
    # "bacteriën": "bacterium",
    # "badminton": "badminton",
    "basketbal": "basketball",
    "bedrijf": "business",
    "bedrijfskunde": "business",  # "business administration",
    # "bedrijfstak": "industrial branch",
    "beeldhouwkunst": "arts",  # "sculpting"
    # "beroep": "profession",
    "beschrijvende plantkunde": "botany",  # "descriptive botany"
    # "bidsprinkhanen": "mantises",
    "biochemie": "biochemistry",
    "biologie": "biology",
    "bloemplanten": "botany",
    "boekbinderij": "bookbinding",
    "boekhouding": "accounting",
    "bosbouw": "forestry",
    "bouwkunde": "architecture",
    # "breukgetal": "",
    "bridge": "bridge",
    # "buideldieren": "marsupial",
    # "buikpotigen": "",
    # "buissnaveligen": "",
    # "buistandigen": "",
    # "cloacadieren": "monotreme",
    "communicatie": "communications",
    # "coniferen": "conifers",
    "cosmetica": "cosmetics",
    "cryptografie": "cryptography",
    # "cultuur": "culture",
    "dag": "weekday",
    "dans": "dance",
    "demografie": "demography",
    "demoniem": "demonym",
    "dichtkunst": "poetry",
    # "dierengeluid": "animal sound",
    "diergeneeskunde": ["veterinary", "medicine"],
    "dierkunde": "zoology",
    # "dierluizen": "",
    "diplomatie": "diplomacy",
    "drinken": "beverages",
    # "duifachtigen": "",
    # "duikers": "",
    # "dysfemisme": "dysphemism",
    "ecologie": "ecology",
    "economie": "economics",
    # "eendvogels": "anseriform",
    "eenheid": "units-of-measure",
    "effectenhandel": "trading",
    "egyptologie": "Egyptology",
    # "toponiem: eiland": "",
    "elektronica": "electronics",
    "elektrotechniek": "electrical-engineering",
    # "element": "element",
    "emotie": "emotion",
    # "evenhoevigen": "",
    # NOTE(review): "familie" is Dutch for "family", while "familiar" reads
    # like a register tag rather than a topic -- confirm this mapping.
    "familie": "familiar",
    "farmacologie": "pharmacology",
    # "feest": "party",
    "fietsen": "cycling",
    "filatelie": "philately",
    "filmkunst": "cinematography",
    "filosofie": "philosophy",
    "financieel": "financial",
    # "flamingoachtigen": "",
    "folklore": "folklore",
    "fotografie": "photography",
    # "fruit": "fruit",
    # "futen": "grebe",
    "fysiologie": "physiology",
    "genetica": "genetics",
    # "gentachtigen": "",
    "geologie": "geology",
    "geopolitiek": "geopolitics",
    "gereedschap": "tools",
    "geschiedenis": "history",
    "glaciologie": "glaciology",
    # "godheid": "deity",
    # "graan": "grain",
    "grammatica": "grammar",
    "groente": "vegetable",
    # "grondmechanica": "",
    "haar": "hairstyle",
    "handel": "business",
    "heraldiek": "heraldry",
    "hobby": "hobbies",
    "hoofddeksel": "headgear",
    # "horeca": "",
    "houtbewerking": "woodworking",
    # "huishouden": "housekeeping",
    "imkerij": "beekeeping",
    # "industrie": "industry",
    "informatica": "computer sciences",
    "internet": "Internet",
    # "jaarwisseling": "",
    "jachttaal": "hunting",
    # "jongerentaal": "",
    "juridisch": "legal",
    "kaartspel": "card-games",
    # "kamperen": "camping",
    # "kerst": "Christmas",
    # "kindertaal": "child language",
    "kleding": "clothing",
    "kleur": "colour",
    # "knutselen": "",
    "kookkunst": "culinary",
    # "krachtsport": "",
    "kristallografie": "crystallography",
    # "kruid": "",
    # "kuiperij": "",
    "kunst": "arts",
    "landbouw": "agriculture",
    "landmeetkunde": "surveying",
    "leenstelsel": "feudalism",
    # "leerbewerking": "",
    # "leidekkerij": "",
    "letterkunde": "literature",
    "lhbt": "LGBT",
    "logica": "logic",
    "luchtvaart": "aviation",
    # "maatschappij": "company",
    # "magie": "magic",
    "makelaardij": "real-estate",
    # "materiaalkunde": "materials science",
    # "media": "",
    "medisch": "medicine",
    # "meer": "lake",
    "meetkunde": "geometry",
    "metaalbewerking": "metalworking",
    "metallurgie": "metallurgy",
    "klimatologie": "climatology",
    "meteorologie": "meteorology",
    # "metonymisch": "",
    "meubel": "furniture",
    "mijnbouw": "mining",
    "milieukunde": "ecology",
    "militair": "military",
    "mineraal": "mining",
    "mineralogie": "mineralogy",
    # "misdaad": "crime",
    "mode": "fashion",
    # "molenaarsambacht": "",
    "muziek": "music",
    "muziekinstrument": "music",
    "mycologie": "mycology",
    "mythologie": "mythology",
    "natuurkunde": "physics",
    "neurologie": "neurology",
    "numismatiek": "numismatics",
    "oenologie": "oenology",
    "onderwijs": "education",
    "oorlog": "war",
    "optica": "optics",
    "ordehandhaving": "law enforcement",
    # "paardrijden": "horseriding",
    # "planologie": "planology",
    "plantkunde": "botany",
    "politiek": "politics",
    "post": "mail",
    "psychologie": "psychology",
    "regering": "government",
    "religie": "religion",
    # "ruimtevaart": "space travel",
    "schaak": "chess",
    "scheepvaart": "shipping",
    "scheikunde": "chemistry",
    # "schilderkunst": "painting",
    # "schoeisel": "shoewear",
    "scouting": "scouting",
    "seismologie": "seismology",
    "seksualiteit": "sexuality",
    "sieraad": "jewellery",
    # "slapen": "sleep",
    # "snoepgoed": "candy",
    "sociologie": "sociology",
    # "specerij": "spice",
    "speelgoed": "toys",
    "spel": "games",
    # "spellingsalfabet": "spelling alphabet",
    "spoorwegen": "railways",
    "sport": "sports",
    "statistiek": "statistics",
    # "sterrenbeeld": "constellation",
    "valutanaam": "money",
    "taalkunde": "linguistics",
    "tandheelkunde": "dentistry",
    "techniek": "technology",
    # "teken- en schrijfmateriaal": "",
    "tekstkritiek": "textual criticism",
    "telecommunicatie": "telecommunications",
    "tennis": "tennis",
    "textiel": "textiles",
    "textielindustrie": "textiles",
    "thermodynamica": "thermodynamics",
    # "tijdrekening": "timekeeping",
    "toerisme": "tourism",
    "toneel": "theater",
    "transport": "transport",
    "tuinbouw": "horticulture",
    # "tuinieren": "gardening",
    "typografie": "typography",
    "valkerij": "falconry",
    # "veeteelt": "husbandry",
    "verkeer": "traffic",
    "visserij": "fishing",
    "voeding": "food",
    "voetbal": "football",
    "volleybal": "volleyball",
    # "waterbeheer": "water management",
    "wegenbouw": ["road", "construction"],
    "werelddeel": "continents",
    "werktuigbouwkunde": "mechanical-engineering",
    "wetenschap": "sciences",
    "wielrennen": "cycling",
    # "Wikimedia": "Wikimedia",
    # "wikitaal": "",
    # "windstreek": "",
    # "wintersport": "",
    "wiskunde": "mathematics",
    # "wonen": "",
    "zoötomie": "zootomy",
    "zwemmen": "swimming",
}

391 

392 

def translate_raw_tags(data: WordEntry) -> None:
    """Translate the recognised labels of ``data.raw_tags`` in place.

    A raw tag found in ``TAGS`` has its translation added to ``data.tags``.
    Otherwise, a raw tag found in ``TOPICS`` has its translation added to
    ``data.topics``, provided ``data`` has a ``topics`` attribute.
    Translations may be a single string or a list of strings. Raw tags
    that match neither table are the only ones left in ``data.raw_tags``.
    """
    unmatched = []
    for label in data.raw_tags:
        # Pick the destination list and the translation; TAGS has priority.
        if label in TAGS:
            translated = TAGS[label]
            target = data.tags
        elif label in TOPICS and hasattr(data, "topics"):
            translated = TOPICS[label]
            target = data.topics
        else:
            unmatched.append(label)
            continue

        # One label may expand to several English values.
        if isinstance(translated, str):
            target.append(translated)
        elif isinstance(translated, list):
            target.extend(translated)
    data.raw_tags = unmatched

411 

412 

# Short template names -> English tag; used in translation, linkage and
# gloss lists.
LIST_ITEM_TAG_TEMPLATES = dict(
    m="masculine",
    f="feminine",
    n="neuter",
    c="common",
    s="singular",
    p="plural",
    a="animate",
    i="inanimate",
    impf="imperfective",
    pf="perfective",
)

425}