Coverage for src/wiktextract/extractor/nl/tags.py: 85%
25 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from .models import WordEntry
# https://nl.wiktionary.org/wiki/Categorie:Lemmasjablonen
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
# Immutable set of template names; a frozenset gives O(1) membership tests
# and prevents accidental mutation at runtime.
GLOSS_TAG_TEMPLATES = frozenset(
    [
        "absol",
        "accus",
        "auxl",
        "copl",
        "deponens",
        "ditr",
        "erga",
        "inerg",
        "intr",
        "modl",
        "onpr",
        "ov",
        "rcpq",
        "refl",
        "s-verb",
        "plurt",
        "singt",
        "versterkend voorvoegsel",
    ]
)
# https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
# Maps Dutch label text to an English tag: either a single tag (str) or
# several tags (list of str).  Commented-out keys are not part of the mapping.
GLOSS_TAGS = {
    "figuurlijk": "figuratively",
    "afkorting": "abbreviation",
    "causatief": "causative",
    # "chattaal": "",
    "dichterlijk": "poetic",
    "eufemisme": "euphemistic",
    "familienaam": "surname",
    "formeel": "formal",
    "gezegde": "proverb",
    # "heteroniem": "heteronym",
    "historisch": "historical",
    "informeel": "informal",
    "initiaalwoord": "acronym",
    # "klemtoonhomogram": "",
    "krachtterm": "vulgar",
    # "leesteken": "punctuation",
    "letterwoord": "acronym",
    "middeleeuwen": "Middle-Ages",
    "vrouwelijke naam": ["feminine", "name"],
    "mannelijke naam": ["masculine", "name"],
    "mannelijke en vrouwelijke naam": ["masculine", "feminine", "name"],
    "neologisme": "neologism",
    "oudheid": "archaic",
    # "palindroom": "palindrome",
    "pejoratief": "pejorative",
    "persoon": "person",
    # "pregnant": "extra meaning",
    "samenkoppeling": "compound",
    # "sanitair": "",
    "scheldwoord": "pejorative",
    "schertsend": "humorous",
    "spottend": "ironic",
    "spreektaal": "vernacular",
    "spreekwoord": "proverb",
    # "stopwoord": "filled pause",
    "straattaal": "slang",
    "streektaal": "regiolectal",
    "taal": "linguistics",
    "toponiem": "toponymic",
    "verkorting": "clipping",
    "verouderd": "obsolete",
    "Vroegnieuwnederlands": "Early-Modern-Dutch",
    "vulgair": "vulgar",
    "zegswijze": "idiomatic",
    "zeldzaam": "rare",
    "Latijns-Amerika": "Latin-America",
    "absoluut": "absolute",  # Sjabloon:absol
    "accusatief": "accusative",  # Sjabloon:accus
    "hulpwerkwoord": "auxiliary",  # Sjabloon:auxl
    "koppelwerkwoord": "copulative",  # Sjabloon:copl
    "deponens": "deponent",
    "ditransitief": "ditransitive",  # Sjabloon:ditr
    "ergatief": "ergative",  # Sjabloon:erga
    "inergatief": "unergative",  # Sjabloon:inerg
    "onovergankelijk": "intransitive",  # Sjabloon:intr
    "modaal werkwoord": ["modal", "verb"],  # Sjabloon:modl
    "onpersoonlijk": "impersonal",  # Sjabloon:onpr
    "overgankelijk": "transitive",  # Sjabloon:ov
    "wederkerig": "reciprocal",  # Sjabloon:rcpq
    "wederkerend": "reflexive",  # Sjabloon:refl
    "alleen meervoud": "plural-only",  # Sjabloon:plurt
    "geen meervoud": "no-plural",  # Sjabloon:singt
    "versterkend voorvoegsel": ["intensifier", "prefix"],
    "in een bijzin": "with-subordinate-clause",  # Sjabloon:ovt-mv-bijz
    "bij inversie": "inversion",  # Sjabloon:1ps
}
99TABLE_TAGS = {
100 # Sjabloon:-nlnoun-
101 "enkelvoud": "singular",
102 "meervoud": "plural",
103 "verkleinwoord": "diminutive",
104 "bezitsvorm": "possessive",
105 # Sjabloon:adjcomp
106 "stellend": "positive",
107 "vergrotend": "comparative",
108 "overtreffend": "superlative",
109 "onverbogen": "uninflected",
110 "verbogen": "inflected",
111 "partitief": "partitive",
112 # Sjabloon:-nlverb-
113 "onbepaalde wijs": "infinitive",
114 "kort": "short-form",
115 "onvoltooid": "imperfect",
116 "tegenwoordig": "present",
117 "toekomend": "future",
118 "voltooid": "perfect",
119 "onvoltooid deelwoord": ["imperfect", "participle"],
120 "voltooid deelwoord": ["past", "participle"],
121 "gebiedende wijs": "imperative",
122 "aanvoegende wijs": "subjunctive",
123 "aantonende wijs": "indicative",
124 "eerste": "first-person",
125 "tweede": "second-person",
126 "derde": "third-person",
127 "verleden": "past",
128 "voorwaardelijk": "conditional",
129 "hoofdzin": "main-clause",
130 "bijzin": "subordinate-clause",
131 # Sjabloon:-nlname-
132 "nominatief": "nominative",
133 "genitief": "genitive",
134 # Sjabloon:-denoun-
135 "datief": "dative",
136 "accusatief": "accusative",
137 # Sjabloon:-nlverb-reflex-
138 "tegenwoordige tijd": "present",
139 "verleden tijd": "past",
140 "toekomende tijd": "future",
141 "1": "first-person",
142 "2": "second-person",
143 "3": "third-person",
144 "voltooide tijd": "past",
145 # Sjabloon:-dumverb-
146 "onv. deelwoord": ["imperfect", "participle"],
147 "volt deelwoord": ["past", "participle"],
148 "aantonend": "indicative",
149 "aanvoegend": "subjunctive",
150}
# Maps Dutch abbreviation text to a list of English tags.
HEADER_LINE_TAGS = {
    "dim. tant.": ["diminutive", "noun"],  # Sjabloon:dimt
}
# Combined lookup table.  On duplicate keys the later mapping wins
# (HEADER_LINE_TAGS over TABLE_TAGS over GLOSS_TAGS).
TAGS = {**GLOSS_TAGS, **TABLE_TAGS, **HEADER_LINE_TAGS}
# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
# Maps Dutch label text to an English topic: either a single topic (str) or
# several topics (list of str).  Commented-out keys are not part of the
# mapping.
TOPICS = {
    "aardrijkskunde": "geography",
    "adel": "nobility",
    "anatomie": "anatomy",
    "antropologie": "anthropology",
    "archeologie": "archaeology",
    "astrologie": "astrology",
    "astronomie": "astronomy",
    # "bacteriën": "bacterium",
    # "badminton": "badminton",
    "basketbal": "basketball",
    "bedrijf": "business",
    "bedrijfskunde": "business",  # "business administration",
    # "bedrijfstak": "industrial branch",
    "beeldhouwkunst": "arts",  # "sculpting"
    # "beroep": "profession",
    "beschrijvende plantkunde": "botany",  # "descriptive botany"
    # "bidsprinkhanen": "mantises",
    "biochemie": "biochemistry",
    "biologie": "biology",
    "bloemplanten": "botany",
    "boekbinderij": "bookbinding",
    "boekhouding": "accounting",
    "bosbouw": "forestry",
    "bouwkunde": "architecture",
    # "breukgetal": "",
    "bridge": "bridge",
    # "buideldieren": "marsupial",
    # "buikpotigen": "",
    # "buissnaveligen": "",
    # "buistandigen": "",
    # "cloacadieren": "monotreme",
    "communicatie": "communications",
    # "coniferen": "conifers",
    "cosmetica": "cosmetics",
    "cryptografie": "cryptography",
    # "cultuur": "culture",
    "dag": "weekday",
    "dans": "dance",
    "demografie": "demography",
    "demoniem": "demonym",
    "dichtkunst": "poetry",
    # "dierengeluid": "animal sound",
    "diergeneeskunde": ["veterinary", "medicine"],
    "dierkunde": "zoology",
    # "dierluizen": "",
    "diplomatie": "diplomacy",
    "drinken": "beverages",
    # "duifachtigen": "",
    # "duikers": "",
    # "dysfemisme": "dysphemism",
    "ecologie": "ecology",
    "economie": "economics",
    # "eendvogels": "anseriform",
    "eenheid": "units-of-measure",
    "effectenhandel": "trading",
    "egyptologie": "Egyptology",
    # "toponiem: eiland": "",
    "elektronica": "electronics",
    "elektrotechniek": "electrical-engineering",
    # "element": "element",
    "emotie": "emotion",
    # "evenhoevigen": "",
    "familie": "familiar",
    "farmacologie": "pharmacology",
    # "feest": "party",
    "fietsen": "cycling",
    "filatelie": "philately",
    "filmkunst": "cinematography",
    "filosofie": "philosophy",
    "financieel": "financial",
    # "flamingoachtigen": "",
    "folklore": "folklore",
    "fotografie": "photography",
    # "fruit": "fruit",
    # "futen": "grebe",
    "fysiologie": "physiology",
    "genetica": "genetics",
    # "gentachtigen": "",
    "geologie": "geology",
    "geopolitiek": "geopolitics",
    "gereedschap": "tools",
    "geschiedenis": "history",
    "glaciologie": "glaciology",
    # "godheid": "deity",
    # "graan": "grain",
    "grammatica": "grammar",
    "groente": "vegetable",
    # "grondmechanica": "",
    "haar": "hairstyle",
    "handel": "business",
    "heraldiek": "heraldry",
    "hobby": "hobbies",
    "hoofddeksel": "headgear",
    # "horeca": "",
    "houtbewerking": "woodworking",
    # "huishouden": "housekeeping",
    "imkerij": "beekeeping",
    # "industrie": "industry",
    "informatica": "computer sciences",
    "internet": "Internet",
    # "jaarwisseling": "",
    "jachttaal": "hunting",
    # "jongerentaal": "",
    "juridisch": "legal",
    "kaartspel": "card-games",
    # "kamperen": "camping",
    # "kerst": "Christmas",
    # "kindertaal": "child language",
    "kleding": "clothing",
    "kleur": "colour",
    # "knutselen": "",
    "kookkunst": "culinary",
    # "krachtsport": "",
    "kristallografie": "crystallography",
    # "kruid": "",
    # "kuiperij": "",
    "kunst": "arts",
    "landbouw": "agriculture",
    "landmeetkunde": "surveying",
    "leenstelsel": "feudalism",
    # "leerbewerking": "",
    # "leidekkerij": "",
    "letterkunde": "literature",
    "lhbt": "LGBT",
    "logica": "logic",
    "luchtvaart": "aviation",
    # "maatschappij": "company",
    # "magie": "magic",
    "makelaardij": "real-estate",
    # "materiaalkunde": "materials science",
    # "media": "",
    "medisch": "medicine",
    # "meer": "lake",
    "meetkunde": "geometry",
    "metaalbewerking": "metalworking",
    "metallurgie": "metallurgy",
    "klimatologie": "climatology",
    "meteorologie": "meteorology",
    # "metonymisch": "",
    "meubel": "furniture",
    "mijnbouw": "mining",
    "milieukunde": "ecology",
    "militair": "military",
    "mineraal": "mining",
    "mineralogie": "mineralogy",
    # "misdaad": "crime",
    "mode": "fashion",
    # "molenaarsambacht": "",
    "muziek": "music",
    "muziekinstrument": "music",
    "mycologie": "mycology",
    "mythologie": "mythology",
    "natuurkunde": "physics",
    "neurologie": "neurology",
    "numismatiek": "numismatics",
    "oenologie": "oenology",
    "onderwijs": "education",
    "oorlog": "war",
    "optica": "optics",
    "ordehandhaving": "law enforcement",
    # "paardrijden": "horseriding",
    # "planologie": "planology",
    "plantkunde": "botany",
    "politiek": "politics",
    "post": "mail",
    "psychologie": "psychology",
    "regering": "government",
    "religie": "religion",
    # "ruimtevaart": "space travel",
    "schaak": "chess",
    "scheepvaart": "shipping",
    "scheikunde": "chemistry",
    # "schilderkunst": "painting",
    # "schoeisel": "shoewear",
    "scouting": "scouting",
    "seismologie": "seismology",
    "seksualiteit": "sexuality",
    "sieraad": "jewellery",
    # "slapen": "sleep",
    # "snoepgoed": "candy",
    "sociologie": "sociology",
    # "specerij": "spice",
    "speelgoed": "toys",
    "spel": "games",
    # "spellingsalfabet": "spelling alphabet",
    "spoorwegen": "railways",
    "sport": "sports",
    "statistiek": "statistics",
    # "sterrenbeeld": "constellation",
    "valutanaam": "money",
    "taalkunde": "linguistics",
    "tandheelkunde": "dentistry",
    "techniek": "technology",
    # "teken- en schrijfmateriaal": "",
    "tekstkritiek": "textual criticism",
    "telecommunicatie": "telecommunications",
    "tennis": "tennis",
    "textiel": "textiles",
    "textielindustrie": "textiles",
    "thermodynamica": "thermodynamics",
    # "tijdrekening": "timekeeping",
    "toerisme": "tourism",
    "toneel": "theater",
    "transport": "transport",
    "tuinbouw": "horticulture",
    # "tuinieren": "gardening",
    "typografie": "typography",
    "valkerij": "falconry",
    # "veeteelt": "husbandry",
    "verkeer": "traffic",
    "visserij": "fishing",
    "voeding": "food",
    "voetbal": "football",
    "volleybal": "volleyball",
    # "waterbeheer": "water management",
    "wegenbouw": ["road", "construction"],
    "werelddeel": "continents",
    "werktuigbouwkunde": "mechanical-engineering",
    "wetenschap": "sciences",
    "wielrennen": "cycling",
    # "Wikimedia": "Wikimedia",
    # "wikitaal": "",
    # "windstreek": "",
    # "wintersport": "",
    "wiskunde": "mathematics",
    # "wonen": "",
    "zoötomie": "zootomy",
    "zwemmen": "swimming",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Translate the raw tags on *data* into English tags and topics.

    Each entry of ``data.raw_tags`` is looked up in ``TAGS`` first; a match
    is added to ``data.tags``.  Otherwise, if *data* has a ``topics``
    attribute and the entry is a key of ``TOPICS``, the match is added to
    ``data.topics``.  A string value is appended as one item; a list value
    contributes all of its items.

    ``data.raw_tags`` is finally replaced by a new list holding only the
    entries that matched neither table, in their original order.  The
    function mutates *data* in place and returns ``None``.
    """
    unmatched = []
    for raw_tag in data.raw_tags:
        if raw_tag in TAGS:
            tr_tag = TAGS[raw_tag]
            if isinstance(tr_tag, str):
                data.tags.append(tr_tag)
            elif isinstance(tr_tag, list):
                data.tags.extend(tr_tag)
        # Objects without a `topics` attribute keep topic-only labels in
        # raw_tags rather than losing them.
        elif raw_tag in TOPICS and hasattr(data, "topics"):
            tr_topic = TOPICS[raw_tag]
            if isinstance(tr_topic, str):
                data.topics.append(tr_topic)
            elif isinstance(tr_topic, list):
                data.topics.extend(tr_topic)
        else:
            unmatched.append(raw_tag)
    data.raw_tags = unmatched
# used in translation, linkage and gloss lists
# Maps short template names to a single English tag.
LIST_ITEM_TAG_TEMPLATES = {
    "m": "masculine",
    "f": "feminine",
    "n": "neuter",
    "c": "common",
    "s": "singular",
    "p": "plural",
    "a": "animate",
    "i": "inanimate",
    "impf": "imperfective",
    "pf": "perfective",
}