Coverage for src/wiktextract/extractor/nl/tags.py: 85%
25 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from .models import WordEntry
# Template names, collected from these category pages:
# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
# Stored as a frozenset: immutable, with constant-time membership tests.
# NOTE(review): the consumer of this set is not visible in this module;
# presumably these templates expand to the labels translated by GLOSS_TAGS
# (e.g. "absol" -> "absoluut") - confirm against the caller.
GLOSS_TAG_TEMPLATES = frozenset(
    [
        "absol",
        "accus",
        "auxl",
        "copl",
        "deponens",
        "ditr",
        "erga",
        "inerg",
        "intr",
        "modl",
        "onpr",
        "ov",
        "rcpq",
        "refl",
        "s-verb",
        "plurt",
        "singt",
        "versterkend voorvoegsel",
    ]
)
# Dutch label text -> English tag(s).
# A value is either one tag (str) or several tags (list of str).
# Label sources:
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
# A trailing "Sjabloon:x" comment names the related template.
# Commented-out entries are labels that currently have no translation here.
GLOSS_TAGS = {
    "figuurlijk": "figuratively",
    "afkorting": "abbreviation",
    "causatief": "causative",
    # "chattaal": "",
    "dichterlijk": "poetic",
    "eufemisme": "euphemistic",
    "familienaam": "surname",
    "formeel": "formal",
    "gezegde": "proverb",
    # "heteroniem": "heteronym",
    "historisch": "historical",
    "informeel": "informal",
    "initiaalwoord": "acronym",
    # "klemtoonhomogram": "",
    "krachtterm": "vulgar",
    # "leesteken": "punctuation",
    "letterwoord": "acronym",
    "middeleeuwen": "Middle-Ages",
    "vrouwelijke naam": ["feminine", "name"],
    "mannelijke naam": ["masculine", "name"],
    "mannelijke en vrouwelijke naam": ["masculine", "feminine", "name"],
    "neologisme": "neologism",
    "oudheid": "archaic",
    # "palindroom": "palindrome",
    "pejoratief": "pejorative",
    "persoon": "person",
    # "pregnant": "extra meaning",
    "samenkoppeling": "compound",
    # "sanitair": "",
    "scheldwoord": "pejorative",
    "schertsend": "humorous",
    "spottend": "ironic",
    "spreektaal": "vernacular",
    "spreekwoord": "proverb",
    # "stopwoord": "filled pause",
    "straattaal": "slang",
    "streektaal": "regiolectal",
    "taal": "linguistics",
    "toponiem": "toponymic",
    "verkorting": "clipping",
    "verouderd": "obsolete",
    "Vroegnieuwnederlands": "Early-Modern-Dutch",
    "vulgair": "vulgar",
    "zegswijze": "idiomatic",
    "zeldzaam": "rare",
    "Latijns-Amerika": "Latin-America",
    "absoluut": "absolute",  # Sjabloon:absol
    "accusatief": "accusative",  # Sjabloon:accus
    "hulpwerkwoord": "auxiliary",  # Sjabloon:auxl
    "koppelwerkwoord": "copulative",  # Sjabloon:copl
    "deponens": "deponent",
    "ditransitief": "ditransitive",  # Sjabloon:ditr
    "ergatief": "ergative",  # Sjabloon:erga
    "inergatief": "unergative",  # Sjabloon:inerg
    "onovergankelijk": "intransitive",  # Sjabloon:intr
    "modaal werkwoord": ["modal", "verb"],  # Sjabloon:modl
    "onpersoonlijk": "impersonal",  # Sjabloon:onpr
    "overgankelijk": "transitive",  # Sjabloon:ov
    "wederkerig": "reciprocal",  # Sjabloon:rcpq
    "wederkerend": "reflexive",  # Sjabloon:refl
    "alleen meervoud": "plural-only",  # Sjabloon:plurt
    "geen meervoud": "no-plural",  # Sjabloon:singt
    "versterkend voorvoegsel": ["intensifier", "prefix"],
    "in een bijzin": "with-subordinate-clause",  # Sjabloon:ovt-mv-bijz
    "bij inversie": "inversion",  # Sjabloon:1ps
    "Noord-Nederland": "Northern-Netherland",
    "Vlaanderen": "Flanders",
    "Brabant": "Brabant",
    "Limburg": "Limburg",
}
# Dutch label text -> English tag(s), grouped by the template
# ("Sjabloon:...") named in the comment above each group.
# A value is either one tag (str) or several tags (list of str).
TABLE_TAGS = {
    # Sjabloon:-nlnoun-
    "enkelvoud": "singular",
    "meervoud": "plural",
    "verkleinwoord": "diminutive",
    "bezitsvorm": "possessive",
    # Sjabloon:adjcomp
    "stellend": "positive",
    "vergrotend": "comparative",
    "overtreffend": "superlative",
    "onverbogen": "uninflected",
    "verbogen": "inflected",
    "partitief": "partitive",
    # Sjabloon:-nlverb-
    "onbepaalde wijs": "infinitive",
    "kort": "short-form",
    "lang": "long-form",
    "onvoltooid": "imperfect",
    "tegenwoordig": "present",
    "toekomend": "future",
    "voltooid": "perfect",
    "onvoltooid deelwoord": ["imperfect", "participle"],
    "voltooid deelwoord": ["past", "participle"],
    "gebiedende wijs": "imperative",
    "aanvoegende wijs": "subjunctive",
    "aantonende wijs": "indicative",
    "eerste": "first-person",
    "tweede": "second-person",
    "derde": "third-person",
    "verleden": "past",
    "voorwaardelijk": "conditional",
    "hoofdzin": "main-clause",
    "bijzin": "subordinate-clause",
    # Sjabloon:-nlname-
    "nominatief": "nominative",
    "genitief": "genitive",
    # Sjabloon:-denoun-
    "datief": "dative",
    "accusatief": "accusative",
    # Sjabloon:-nlverb-reflex-
    "tegenwoordige tijd": "present",
    "verleden tijd": "past",
    "toekomende tijd": "future",
    "1": "first-person",
    "2": "second-person",
    "3": "third-person",
    "voltooide tijd": "past",
    # Sjabloon:-dumverb-
    "onv. deelwoord": ["imperfect", "participle"],
    "volt deelwoord": ["past", "participle"],
    "aantonend": "indicative",
    "aanvoegend": "subjunctive",
}
# Dutch label text -> English tags; currently a single entry.
HEADER_LINE_TAGS = {
    "dim. tant.": ["diminutive", "noun"],  # Sjabloon:dimt
}
# Combined lookup table used by translate_raw_tags().  With dict unpacking a
# later table wins when a key occurs more than once; "accusatief" is in both
# GLOSS_TAGS and TABLE_TAGS, with the same value in each.
TAGS = {**GLOSS_TAGS, **TABLE_TAGS, **HEADER_LINE_TAGS}
# Dutch context label -> English topic(s).
# A value is either one topic (str) or several topics (list of str).
# translate_raw_tags() consults this table only for labels not found in TAGS.
# Label source:
# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
# Commented-out entries are labels that currently have no topic mapping here.
TOPICS = {
    "aardrijkskunde": "geography",
    "adel": "nobility",
    "anatomie": "anatomy",
    "antropologie": "anthropology",
    "archeologie": "archaeology",
    "astrologie": "astrology",
    "astronomie": "astronomy",
    # "bacteriën": "bacterium",
    # "badminton": "badminton",
    "basketbal": "basketball",
    "bedrijf": "business",
    "bedrijfskunde": "business",  # "business administration",
    # "bedrijfstak": "industrial branch",
    "beeldhouwkunst": "arts",  # "sculpting"
    # "beroep": "profession",
    "beschrijvende plantkunde": "botany",  # "descriptive botany"
    # "bidsprinkhanen": "mantises",
    "biochemie": "biochemistry",
    "biologie": "biology",
    "bloemplanten": "botany",
    "boekbinderij": "bookbinding",
    "boekhouding": "accounting",
    "bosbouw": "forestry",
    "bouwkunde": "architecture",
    # "breukgetal": "",
    "bridge": "bridge",
    # "buideldieren": "marsupial",
    # "buikpotigen": "",
    # "buissnaveligen": "",
    # "buistandigen": "",
    # "cloacadieren": "monotreme",
    "communicatie": "communications",
    # "coniferen": "conifers",
    "cosmetica": "cosmetics",
    "cryptografie": "cryptography",
    # "cultuur": "culture",
    "dag": "weekday",
    "dans": "dance",
    "demografie": "demography",
    "demoniem": "demonym",
    "dichtkunst": "poetry",
    # "dierengeluid": "animal sound",
    "diergeneeskunde": ["veterinary", "medicine"],
    "dierkunde": "zoology",
    # "dierluizen": "",
    "diplomatie": "diplomacy",
    "drinken": "beverages",
    # "duifachtigen": "",
    # "duikers": "",
    # "dysfemisme": "dysphemism",
    "ecologie": "ecology",
    "economie": "economics",
    # "eendvogels": "anseriform",
    "eenheid": "units-of-measure",
    "effectenhandel": "trading",
    "egyptologie": "Egyptology",
    # "toponiem: eiland": "",
    "elektronica": "electronics",
    "elektrotechniek": "electrical-engineering",
    # "element": "element",
    "emotie": "emotion",
    # "evenhoevigen": "",
    "familie": "familiar",
    "farmacologie": "pharmacology",
    # "feest": "party",
    "fietsen": "cycling",
    "filatelie": "philately",
    "filmkunst": "cinematography",
    "filosofie": "philosophy",
    "financieel": "financial",
    # "flamingoachtigen": "",
    "folklore": "folklore",
    "fotografie": "photography",
    # "fruit": "fruit",
    # "futen": "grebe",
    "fysiologie": "physiology",
    "genetica": "genetics",
    # "gentachtigen": "",
    "geologie": "geology",
    "geopolitiek": "geopolitics",
    "gereedschap": "tools",
    "geschiedenis": "history",
    "glaciologie": "glaciology",
    # "godheid": "deity",
    # "graan": "grain",
    "grammatica": "grammar",
    "groente": "vegetable",
    # "grondmechanica": "",
    "haar": "hairstyle",
    "handel": "business",
    "heraldiek": "heraldry",
    "hobby": "hobbies",
    "hoofddeksel": "headgear",
    # "horeca": "",
    "houtbewerking": "woodworking",
    # "huishouden": "housekeeping",
    "imkerij": "beekeeping",
    # "industrie": "industry",
    "informatica": "computer sciences",
    "internet": "Internet",
    # "jaarwisseling": "",
    "jachttaal": "hunting",
    # "jongerentaal": "",
    "juridisch": "legal",
    "kaartspel": "card-games",
    # "kamperen": "camping",
    # "kerst": "Christmas",
    # "kindertaal": "child language",
    "kleding": "clothing",
    "kleur": "colour",
    # "knutselen": "",
    "kookkunst": "culinary",
    # "krachtsport": "",
    "kristallografie": "crystallography",
    # "kruid": "",
    # "kuiperij": "",
    "kunst": "arts",
    "landbouw": "agriculture",
    "landmeetkunde": "surveying",
    "leenstelsel": "feudalism",
    # "leerbewerking": "",
    # "leidekkerij": "",
    "letterkunde": "literature",
    "lhbt": "LGBT",
    "logica": "logic",
    "luchtvaart": "aviation",
    # "maatschappij": "company",
    # "magie": "magic",
    "makelaardij": "real-estate",
    # "materiaalkunde": "materials science",
    # "media": "",
    "medisch": "medicine",
    # "meer": "lake",
    "meetkunde": "geometry",
    "metaalbewerking": "metalworking",
    "metallurgie": "metallurgy",
    "klimatologie": "climatology",
    "meteorologie": "meteorology",
    # "metonymisch": "",
    "meubel": "furniture",
    "mijnbouw": "mining",
    "milieukunde": "ecology",
    "militair": "military",
    "mineraal": "mining",
    "mineralogie": "mineralogy",
    # "misdaad": "crime",
    "mode": "fashion",
    # "molenaarsambacht": "",
    "muziek": "music",
    "muziekinstrument": "music",
    "mycologie": "mycology",
    "mythologie": "mythology",
    "natuurkunde": "physics",
    "neurologie": "neurology",
    "numismatiek": "numismatics",
    "oenologie": "oenology",
    "onderwijs": "education",
    "oorlog": "war",
    "optica": "optics",
    "ordehandhaving": "law enforcement",
    # "paardrijden": "horseriding",
    # "planologie": "planology",
    "plantkunde": "botany",
    "politiek": "politics",
    "post": "mail",
    "psychologie": "psychology",
    "regering": "government",
    "religie": "religion",
    # "ruimtevaart": "space travel",
    "schaak": "chess",
    "scheepvaart": "shipping",
    "scheikunde": "chemistry",
    # "schilderkunst": "painting",
    # "schoeisel": "shoewear",
    "scouting": "scouting",
    "seismologie": "seismology",
    "seksualiteit": "sexuality",
    "sieraad": "jewellery",
    # "slapen": "sleep",
    # "snoepgoed": "candy",
    "sociologie": "sociology",
    # "specerij": "spice",
    "speelgoed": "toys",
    "spel": "games",
    # "spellingsalfabet": "spelling alphabet",
    "spoorwegen": "railways",
    "sport": "sports",
    "statistiek": "statistics",
    # "sterrenbeeld": "constellation",
    "valutanaam": "money",
    "taalkunde": "linguistics",
    "tandheelkunde": "dentistry",
    "techniek": "technology",
    # "teken- en schrijfmateriaal": "",
    "tekstkritiek": "textual criticism",
    "telecommunicatie": "telecommunications",
    "tennis": "tennis",
    "textiel": "textiles",
    "textielindustrie": "textiles",
    "thermodynamica": "thermodynamics",
    # "tijdrekening": "timekeeping",
    "toerisme": "tourism",
    "toneel": "theater",
    "transport": "transport",
    "tuinbouw": "horticulture",
    # "tuinieren": "gardening",
    "typografie": "typography",
    "valkerij": "falconry",
    # "veeteelt": "husbandry",
    "verkeer": "traffic",
    "visserij": "fishing",
    "voeding": "food",
    "voetbal": "football",
    "volleybal": "volleyball",
    # "waterbeheer": "water management",
    "wegenbouw": ["road", "construction"],
    "werelddeel": "continents",
    "werktuigbouwkunde": "mechanical-engineering",
    "wetenschap": "sciences",
    "wielrennen": "cycling",
    # "Wikimedia": "Wikimedia",
    # "wikitaal": "",
    # "windstreek": "",
    # "wintersport": "",
    "wiskunde": "mathematics",
    # "wonen": "",
    "zoötomie": "zootomy",
    "zwemmen": "swimming",
    "toponiem: land": "country",  # Template:land
}
def translate_raw_tags(data: WordEntry) -> None:
    """Translate the Dutch labels in ``data.raw_tags`` in place.

    Each raw tag is resolved in this order:

    1. Found in ``TAGS``: its English tag(s) are added to ``data.tags``.
    2. Otherwise found in ``TOPICS`` while ``data`` has a ``topics``
       attribute: its English topic(s) are added to ``data.topics``.
    3. Otherwise the raw tag is kept.

    Afterwards ``data.raw_tags`` is rebound to a new list holding only the
    kept raw tags, in their original order.  Translations are appended
    as-is; nothing is de-duplicated.

    Args:
        data: Object providing ``raw_tags`` and ``tags`` lists.  ``topics``
            is optional; without it, topic labels stay in ``raw_tags``.
    """
    untranslated = []
    for label in data.raw_tags:
        if label in TAGS:
            mapped_tag = TAGS[label]
            if isinstance(mapped_tag, str):
                data.tags.append(mapped_tag)
            elif isinstance(mapped_tag, list):
                data.tags.extend(mapped_tag)
            # A value of any other type adds nothing, but the label is
            # still treated as translated and not kept.
            continue

        # TAGS is checked first, so TOPICS only sees labels it did not match.
        if label in TOPICS and hasattr(data, "topics"):
            mapped_topic = TOPICS[label]
            if isinstance(mapped_topic, str):
                data.topics.append(mapped_topic)
            elif isinstance(mapped_topic, list):
                data.topics.extend(mapped_topic)
            continue

        untranslated.append(label)
    data.raw_tags = untranslated
# Short template name -> English tag.
# used in translation, linkage and gloss lists
LIST_ITEM_TAG_TEMPLATES = {
    "m": "masculine",
    "f": "feminine",
    "n": "neuter",
    "c": "common",
    "s": "singular",
    "p": "plural",
    "a": "animate",
    "i": "inanimate",
    "impf": "imperfective",
    "pf": "perfective",
}