Coverage for src/wiktextract/extractor/nl/tags.py: 84%
23 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1from .models import WordEntry
# Dutch verb labels -> English tags.
# Label source: https://nl.wiktionary.org/wiki/Categorie:Werkwoordsjablonen
# The trailing comment on each entry names the Dutch Wiktionary template
# page ("Sjabloon") the label is associated with.
VERB_TAGS = {
    "ergatief": "ergative",  # Sjabloon:erga
    "inergatief": "unergative",  # Sjabloon:inerg
    "hulpwerkwoord": "auxiliary",  # Sjabloon:auxl
}
# Dutch context labels -> English tag (a string) or tags (a list of strings).
# Label source:
# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
# Commented-out entries are labels that currently have no mapping; they are
# kept as placeholders and are not part of the dictionary.
GLOSS_TAGS = {
    "figuurlijk": "figuratively",
    "afkorting": "abbreviation",
    "causatief": "causative",
    # "chattaal": "",
    "dichterlijk": "poetic",
    "eufemisme": "euphemistic",
    "familienaam": "surname",
    "formeel": "formal",
    "gezegde": "proverb",
    # "heteroniem": "heteronym",
    "historisch": "historical",
    "informeel": "informal",
    "initiaalwoord": "acronym",
    # "klemtoonhomogram": "",
    "krachtterm": "vulgar",
    # "leesteken": "punctuation",
    "letterwoord": "acronym",
    "middeleeuwen": "Middle-Ages",
    # Name labels expand to several tags at once.
    "vrouwelijke naam": ["feminine", "name"],
    "mannelijke naam": ["masculine", "name"],
    "mannelijke en vrouwelijke naam": ["masculine", "feminine", "name"],
    "neologisme": "neologism",
    "oudheid": "archaic",
    # "palindroom": "palindrome",
    "pejoratief": "pejorative",
    "persoon": "person",
    # "pregnant": "extra meaning",
    "samenkoppeling": "compound",
    # "sanitair": "",
    "scheldwoord": "pejorative",
    "schertsend": "humorous",
    "spottend": "ironic",
    "spreektaal": "vernacular",
    "spreekwoord": "proverb",
    # "stopwoord": "filled pause",
    "straattaal": "slang",
    "streektaal": "regiolectal",
    # "taal": "language",
    "toponiem": "toponymic",
    "verkorting": "clipping",
    "verouderd": "obsolete",
    "Vroegnieuwnederlands": "Early-Modern-Dutch",
    "vulgair": "vulgar",
    "zegswijze": "idiomatic",
    "zeldzaam": "rare",
    "Latijns-Amerika": "Latin-America",
}
# Dutch inflection-table labels -> English tag (a string) or tags (a list of
# strings). Entries are grouped by the Dutch Wiktionary template named in the
# section comment above each group.
TABLE_TAGS = {
    # Sjabloon:-nlnoun-
    "enkelvoud": "singular",
    "meervoud": "plural",
    "verkleinwoord": "diminutive",
    # Sjabloon:adjcomp
    "stellend": "positive",
    "vergrotend": "comparative",
    "overtreffend": "superlative",
    "onverbogen": "uninflected",
    "verbogen": "inflected",
    "partitief": "partitive",
    # Sjabloon:-nlverb-
    "onbepaalde wijs": "infinitive",
    "kort": "short-form",
    "onvoltooid": "imperfect",
    "tegenwoordig": "present",
    "toekomend": "future",
    "voltooid": "perfect",
    "onvoltooid deelwoord": ["imperfect", "participle"],
    "voltooid deelwoord": ["past", "participle"],
    "gebiedende wijs": "imperative",
    "aanvoegende wijs": "subjunctive",
    "aantonende wijs": "indicative",
    "eerste": "first-person",
    "tweede": "second-person",
    "derde": "third-person",
    "verleden": "past",
    "voorwaardelijk": "conditional",
}
# Single lookup table combining every tag mapping. The tables are merged in
# the order listed, so if a label ever appears in more than one of them the
# value from the later table is the one that is kept.
TAGS = {
    label: translation
    for table in (VERB_TAGS, GLOSS_TAGS, TABLE_TAGS)
    for label, translation in table.items()
}
# Dutch context labels -> English topic (a string) or topics (a list of
# strings).
# Label source:
# https://nl.wiktionary.org/wiki/Categorie:WikiWoordenboek:Contextlabels
# Commented-out entries are labels that currently have no mapping; they are
# kept as placeholders and are not part of the dictionary. A trailing comment
# after an entry records a more literal English rendering of the Dutch label.
TOPICS = {
    "aardrijkskunde": "geography",
    "adel": "nobility",
    "anatomie": "anatomy",
    "antropologie": "anthropology",
    "archeologie": "archaeology",
    "astrologie": "astrology",
    "astronomie": "astronomy",
    # "bacteriën": "bacterium",
    # "badminton": "badminton",
    "basketbal": "basketball",
    "bedrijf": "business",
    "bedrijfskunde": "business",  # "business administration",
    # "bedrijfstak": "industrial branch",
    "beeldhouwkunst": "arts",  # "sculpting"
    # "beroep": "profession",
    "beschrijvende plantkunde": "botany",  # "descriptive botany"
    # "bidsprinkhanen": "mantises",
    "biochemie": "biochemistry",
    "biologie": "biology",
    "bloemplanten": "botany",
    "boekbinderij": "bookbinding",
    "boekhouding": "accounting",
    "bosbouw": "forestry",
    "bouwkunde": "architecture",
    # "breukgetal": "",
    "bridge": "bridge",
    # "buideldieren": "marsupial",
    # "buikpotigen": "",
    # "buissnaveligen": "",
    # "buistandigen": "",
    # "cloacadieren": "monotreme",
    "communicatie": "communications",
    # "coniferen": "conifers",
    "cosmetica": "cosmetics",
    "cryptografie": "cryptography",
    # "cultuur": "culture",
    "dag": "weekday",
    "dans": "dance",
    "demografie": "demography",
    "demoniem": "demonym",
    "dichtkunst": "poetry",
    # "dierengeluid": "animal sound",
    "diergeneeskunde": ["veterinary", "medicine"],
    "dierkunde": "zoology",
    # "dierluizen": "",
    "diplomatie": "diplomacy",
    "drinken": "beverages",
    # "duifachtigen": "",
    # "duikers": "",
    # "dysfemisme": "dysphemism",
    "ecologie": "ecology",
    "economie": "economics",
    # "eendvogels": "anseriform",
    # "eenheid": "",
    "effectenhandel": "trading",
    "egyptologie": "Egyptology",
    # "toponiem: eiland": "",
    "elektronica": "electronics",
    "elektrotechniek": "electrical-engineering",
    # "element": "element",
    "emotie": "emotion",
    # "evenhoevigen": "",
    # "familie": "family",
    "farmacologie": "pharmacology",
    # "feest": "party",
    "fietsen": "cycling",
    "filatelie": "philately",
    "filmkunst": "cinematography",
    "filosofie": "philosophy",
    "financieel": "financial",
    # "flamingoachtigen": "",
    "folklore": "folklore",
    "fotografie": "photography",
    # "fruit": "fruit",
    # "futen": "grebe",
    "fysiologie": "physiology",
    "genetica": "genetics",
    # "gentachtigen": "",
    "geologie": "geology",
    "geopolitiek": "geopolitics",
    "gereedschap": "tools",
    "geschiedenis": "history",
    "glaciologie": "glaciology",
    # "godheid": "deity",
    # "graan": "grain",
    "grammatica": "grammar",
    "groente": "vegetable",
    # "grondmechanica": "",
    "haar": "hairstyle",
    "handel": "business",
    "heraldiek": "heraldry",
    "hobby": "hobbies",
    "hoofddeksel": "headgear",
    # "horeca": "",
    "houtbewerking": "woodworking",
    # "huishouden": "housekeeping",
    "imkerij": "beekeeping",
    # "industrie": "industry",
    "informatica": "computer sciences",
    "internet": "Internet",
    # "jaarwisseling": "",
    "jachttaal": "hunting",
    # "jongerentaal": "",
    "juridisch": "legal",
    "kaartspel": "card-games",
    # "kamperen": "camping",
    # "kerst": "Christmas",
    # "kindertaal": "child language",
    "kleding": "clothing",
    "kleur": "colour",
    # "knutselen": "",
    "kookkunst": "culinary",
    # "krachtsport": "",
    "kristallografie": "crystallography",
    # "kruid": "",
    # "kuiperij": "",
    "kunst": "arts",
    "landbouw": "agriculture",
    "landmeetkunde": "surveying",
    "leenstelsel": "feudalism",
    # "leerbewerking": "",
    # "leidekkerij": "",
    "letterkunde": "literature",
    "lhbt": "LGBT",
    "logica": "logic",
    "luchtvaart": "aviation",
    # "maatschappij": "company",
    # "magie": "magic",
    "makelaardij": "real-estate",
    # "materiaalkunde": "materials science",
    # "media": "",
    "medisch": "medicine",
    # "meer": "lake",
    "meetkunde": "geometry",
    "metaalbewerking": "metalworking",
    "metallurgie": "metallurgy",
    "klimatologie": "climatology",
    "meteorologie": "meteorology",
    # "metonymisch": "",
    "meubel": "furniture",
    "mijnbouw": "mining",
    "milieukunde": "ecology",
    "militair": "military",
    "mineraal": "mining",
    "mineralogie": "mineralogy",
    # "misdaad": "crime",
    "mode": "fashion",
    # "molenaarsambacht": "",
    "muziek": "music",
    "muziekinstrument": "music",
    "mycologie": "mycology",
    "mythologie": "mythology",
    "natuurkunde": "physics",
    "neurologie": "neurology",
    "numismatiek": "numismatics",
    "oenologie": "oenology",
    "onderwijs": "education",
    "oorlog": "war",
    "optica": "optics",
    "ordehandhaving": "law enforcement",
    # "paardrijden": "horseriding",
    # "planologie": "planology",
    "plantkunde": "botany",
    "politiek": "politics",
    "post": "mail",
    "psychologie": "psychology",
    "regering": "government",
    "religie": "religion",
    # "ruimtevaart": "space travel",
    "schaak": "chess",
    "scheepvaart": "shipping",
    "scheikunde": "chemistry",
    # "schilderkunst": "painting",
    # "schoeisel": "shoewear",
    "scouting": "scouting",
    "seismologie": "seismology",
    "seksualiteit": "sexuality",
    "sieraad": "jewellery",
    # "slapen": "sleep",
    # "snoepgoed": "candy",
    "sociologie": "sociology",
    # "specerij": "spice",
    "speelgoed": "toys",
    "spel": "games",
    # "spellingsalfabet": "spelling alphabet",
    "spoorwegen": "railways",
    "sport": "sports",
    "statistiek": "statistics",
    # "sterrenbeeld": "constellation",
    "valutanaam": "money",
    "taalkunde": "linguistics",
    "tandheelkunde": "dentistry",
    "techniek": "technology",
    # "teken- en schrijfmateriaal": "",
    "tekstkritiek": "textual criticism",
    "telecommunicatie": "telecommunications",
    "tennis": "tennis",
    "textiel": "textiles",
    "textielindustrie": "textiles",
    "thermodynamica": "thermodynamics",
    # "tijdrekening": "timekeeping",
    "toerisme": "tourism",
    "toneel": "theater",
    "transport": "transport",
    "tuinbouw": "horticulture",
    # "tuinieren": "gardening",
    "typografie": "typography",
    "valkerij": "falconry",
    # "veeteelt": "husbandry",
    "verkeer": "traffic",
    "visserij": "fishing",
    "voeding": "food",
    "voetbal": "football",
    "volleybal": "volleyball",
    # "waterbeheer": "water management",
    "wegenbouw": ["road", "construction"],
    "werelddeel": "continents",
    "werktuigbouwkunde": "mechanical-engineering",
    "wetenschap": "sciences",
    "wielrennen": "cycling",
    # "Wikimedia": "Wikimedia",
    # "wikitaal": "",
    # "windstreek": "",
    # "wintersport": "",
    "wiskunde": "mathematics",
    # "wonen": "",
    "zoötomie": "zootomy",
    "zwemmen": "swimming",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Sort the Dutch labels in ``data.raw_tags`` into tags and topics.

    Each label is looked up in ``TAGS`` first and in ``TOPICS`` second.
    On a hit the translation is added to ``data.tags`` or ``data.topics``
    respectively: a string value is appended, a list value is added
    element by element. A matched label is removed from ``raw_tags`` even
    if its table value is neither a string nor a list.

    Labels found in neither table are kept, in their original order, and
    ``data.raw_tags`` is replaced by that list of leftovers.

    The object is modified in place; nothing is returned.
    """
    # Order matters: a label present in both tables counts as a tag.
    lookup_order = ((TAGS, "tags"), (TOPICS, "topics"))
    leftovers = []
    for label in data.raw_tags:
        for table, field_name in lookup_order:
            if label not in table:
                continue
            translated = table[label]
            # The destination attribute is only read when something is
            # actually stored, so objects lacking one of the attributes
            # are unaffected unless a label routes to it.
            if isinstance(translated, str):
                getattr(data, field_name).append(translated)
            elif isinstance(translated, list):
                getattr(data, field_name).extend(translated)
            break
        else:
            # No table knew this label: keep it untranslated.
            leftovers.append(label)
    data.raw_tags = leftovers