Coverage for src / wiktextract / extractor / de / tags.py: 81%
37 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-19 11:25 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-19 11:25 +0000
1from .models import WordEntry
# Sense tags
# https://de.wiktionary.org/wiki/Vorlage:K
# https://de.wiktionary.org/wiki/Vorlage:K/Abk
#
# Maps a label or abbreviation to one English tag (str) or several (list).
# Commented-out entries appear to be labels that are intentionally left
# unmapped for now (connective words, labels without an agreed tag).
# Every value is an English tag; do not put the German label on the right.
K_TEMPLATE_TAGS = {
    "Abl.": "ablative",
    "Ablativ": "ablative",
    "abw.": "derogatory",
    "abwertend": "derogatory",
    "AE": "US",
    "AmE": "US",
    "adv.": "adverbial",
    "Akkusativ": "accusative",
    "alemann.": "Alemannic",
    "alemannisch": "Alemannic",
    "allg.": "general",
    "allgemein": "general",
    "alltagsspr.": "colloquial",
    "amtsspr.": "officialese",
    # "ansonsten": "otherwise",  # combined with other text
    "attr.": "attributive",
    # "auch": "also",
    "bair.": "Bavarian",
    "bairisch": "Bavarian",
    "bar.": "Bavarian",
    "direktional": "directional",
    "BE": "British",
    "BrE": "British",
    "Bedva.": "outdated",
    "Bedvatd.": "outdated",
    "besonders": "especially",
    "veraltende Bedeutung": "outdated",
    # "bei": "",
    # "bes.": "especially",
    # "besonders": "especially",
    # "beziehungsweise": "",
    # "bzw.": "",
    # "bildungsspr.": "",
    # "bis": "",
    # "bisweilen": "",
    # "das": "",
    "Dativ": "dative",
    # "DDR": "",
    "Deutschland": "Germany",
    # "der": "",
    "dichter.": "poetic",
    "dichterisch": "poetic",
    # "die": "",
    "Dim.": "diminutive",
    "Dimin.": "diminutive",
    "Diminutiv": "diminutive",
    # "eher": "",
    "erzg.": "Erzgebirgisch",
    "erzgeb.": "Erzgebirgisch",
    "erzgebirgisch": "Erzgebirgisch",
    "euph.": "euphemistic",
    "fachspr.": "jargon",
    "fachsprachlich": "jargon",
    # "familiär": emitted as the English tag, not the German label.
    "fam.": "familiar",
    "fig": "figurative",
    "fig.": "figurative",
    # "früher": "",
    # "gegenwartslateinisch": "",
    # "gehoben": same tag as the full-word entry in OTHER_TAGS.
    "geh.": "literary",
    "Genitiv": "genitive",
    "gsm": "Swiss German",
    "häufig": "often",
    "haben": "auxiliary",
    "hebben": "auxiliary",
    "hauptsächlich": "primarily",
    "hist.": "historical",
    "ieS": "narrowly",
    "i.e.S.": "narrowly",
    "i. e. S.": "narrowly",
    # "im": "",
    # "in": "",
    # "in Bezug auf": "relational",
    "indekl.": "indeclinable",
    # "insbes.": "",
    "Instrumental": "instrumental",
    "intrans.": "intransitive",
    "intransitiv": "intransitive",
    # "iPl": "in plural",
    "iron.": "ironic",
    # "iwS": "",
    # "jugendspr.": "",
    "kinderspr.": "childish",
    "kirchenlateinisch": "Church Latin",
    "klasslat.": "Classical Latin",
    "klassischlateinisch": "Classical Latin",
    "kPl.": "no-plural",
    "kein Plural": "no-plural",
    "kSg.": "no-singulative",
    "kSt.": "no-comparative",
    "kurz für": "short-form",
    "landsch.": "regional",
    "landschaftlich": "regional",
    "lautm.": "onomatopoeic",
    "lokal": "regional",
    "Ling.": "linguistics",
    "mA": "accusative",
    "md.": "Central German",
    "mdal.": "dialectal",
    "Med.": "medicine",  # topic
    # "meist": "mostly",
    # "meistens": "mostly",
    "metaphor.": "metaphoric",
    "meton.": "metonymically",
    "mG": "genitive",
    "mitteld.": "Central German",
    "mit Dativ": "with-dative",
    "mit Akkusativ": "with-accusative",
    # "mitunter": "",
    "mlat.": "Medieval Latin",
    "mittellateinisch": "Medieval Latin",
    "mundartl.": "dialectal",
    "nDu.": "only-dual",
    "nigr.": "Niger",
    "nigrisch": "Niger",
    "nkLat.": "post-Classical Latin",
    "nachklassischlateinisch": "post-Classical Latin",
    "nlat.": "New Latin",
    "neulateinisch": "New Latin",
    "nordd.": "North German",
    "norddeutsch": "North German",
    "nordwestd.": "Northwestern Germany",
    "nPl.": "plural-only",
    "Österreich": "Austrian German",
    "örtlich": "regional",
    "österr.": "Austrian German",
    "österreichisch": "Austrian German",
    "ostfränkisch": "East Franconian German",
    "pej.": "pejorative",
    "personifizierend": "person",
    "poet.": "poetic",
    "PräpmG": "genitive prepositional",
    "PmG": "genitive prepositional",
    "reg.": "regional",
    "refl.": "reflexive",
    "reflexiv": "reflexive",
    # "respektive": "",
    "sal.": "casual",
    "salopp": "casual",
    "scherzh.": "jocular",
    "schriftspr.": "literary",
    # "schülerspr.": "",
    "schwäb.": "Swabian",
    "schwäbisch": "Swabian",
    "Schweiz": "Swiss Standard German",
    "schweiz.": "Swiss Standard German",
    "schweizerisch": "Swiss Standard German",
    "Schweizerdeutsch": "Swiss German",
    "schweizerdeutsch": "Swiss German",
    # "seemannsspr.": "",
    "sein": "auxiliary verb",
    # "sehr": "",  # very
    "selten": "rare",
    "seltener": "rare",
    "seltener auch": "rare",
    "soldatenspr.": ["military", "slang"],
    # "sonderspr.": "",
    # "sonst": "",
    # "sowie": "",
    "spätlat.": "Late Latin",
    "spätlateinisch": "Late Latin",
    # "später": "",
    "speziell": "special",
    "südd.": "South German",
    "süddt.": "South German",
    # "techn.": "",
    # "teils": "",
    # "teilweise": "",
    "temporal": "temporal",
    "tlwva.": "outdated",
    "tlwvatd.": "outdated",
    "trans.": "transitive",
    "transitiv": "transitive",
    # "über": "",
    # "überwiegend": "mostly",
    "übertr.": "figurative",
    "übertragen": "figurative",
    "ugs.": "colloquial",
    "umgangssprachlich": "colloquial",
    # "und": "",
    "ungebr.": "uncommon",
    "unpers.": "impersonal",
    "unpersönlich": "impersonal",
    # "ursprünglich": "",
    "va.": "outdated",
    "vatd.": "outdated",
    "veraltend": "outdated",
    # "verh.": "",
    "volkst.": "popular",
    # "von": "",
    # "vor allem": "",
    # "vor allem in": "",
    "vul.": "vulgar",
    "vulg.": "vulgar",
    "vlat.": ["vulgar", "Latin"],
    "vulgärlat": ["vulgar", "Latin"],
    "vulgärlateinisch": ["vulgar", "Latin"],
    "wien.": "Vienna",
    "wienerisch": "Vienna",
    "Wpräp": "prepositional",
    # "z. B.": "",
    # "z. T.": "",
    # "zijn": "",
    # "zum Beispiel": "",
    # "zum Teil": "",
    # "zumeist": "",
    "Kardinalzahl": "cardinal",
    "Sammelbegriff": "collective",
    "Fachsprache": "jargon",
    "formale Sprachen": "formal",
    "Programmiersprachen": "programming",
    "Rechnerarchitektur": "programming",
    "Geografie": "geography",
    "Geometrie": "geometry",
    "Finanzwesen": "finance",
    "juristisch": "law",
    "Physik": "physics",
    "abstrakt": "abstract",
    "gegenständlich": "objective",
    "personifiziert": "personal",
    "kirchlich": "Ecclesiastical",
}
# Gender markers -> gender tag (str) or gender/number tags (list).
GENDER_TAGS = {
    # single-letter markers
    "n": "neuter",
    "m": "masculine",
    "f": "feminine",
    "u": "common",
    # combined markers
    "m, f": ["masculine", "feminine"],  # Vorlage:mf
    "m, f, n": ["masculine", "feminine", "neuter"],  # Vorlage:mfn
    "f, n": ["feminine", "neuter"],  # Vorlage:fn
    # full names, from Vorlage:Deklinationsseite Adjektiv
    "Maskulinum": "masculine",
    "Femininum": "feminine",
    "Neutrum": "neuter",
    # gender + plural markers
    "f Pl.": ["feminine", "plural"],  # Template:fPl.
    "m Pl.": ["masculine", "plural"],  # Template:mPl.
    "n Pl.": ["neuter", "plural"],  # Template:nPl.
    "u Pl.": ["common", "plural"],  # Template:uPl.
}
# Grammatical-number labels -> number tag.
NUMBER_TAGS = {
    # from Vorlage:Deutsch Substantiv Übersicht
    "Singular": "singular",
    "Plural": "plural",
    "Pl.": "plural",
    "pl": "plural",
    "Dual": "dual",
    "ohne Plural": "no-plural",
    "meist im Plural": "plural-normally",
    "meist Plural": "plural-normally",
    "nur Plural": "plural-only",
}
# Grammatical-case labels -> case tag (str) or case + number tags (list).
CASE_TAGS = {
    # from Vorlage:Deutsch Substantiv Übersicht
    "Nominativ": "nominative",
    "Genitiv": "genitive",
    "Dativ": "dative",
    "Akkusativ": "accusative",
    # from Template:Polnisch Substantiv Übersicht
    "Lokativ": "locative",
    "Vokativ": "vocative",
    "Dativ Singular": ["dative", "singular"],
    "Genitiv Singular": ["genitive", "singular"],
    # from Template:Finnisch Substantiv Übersicht
    "Inessiv": "inessive",
    "Elativ": "elative",
    "Illativ": "illative",
    "Adessiv": "adessive",
    "Allativ": "allative",
    "Essiv": "essive",
    "Translativ": "translative",
    "Abessiv": "abessive",
    "Instruktiv": "instructive",
    "Komitativ": "comitative",
    "Partitiv": "partitive",
}
# Degree-of-comparison labels -> tag.
# Sources: Vorlage:Deutsch Adjektiv Übersicht,
#          Vorlage:Deklinationsseite Adjektiv
COMPARISON_TAGS = {
    "Positiv": "positive",
    "Komparativ": "comparative",
    "Superlativ": "superlative",
}
# Adjective declension-class headers -> tag.
# Sources: https://en.wikipedia.org/wiki/German_declension,
#          Vorlage:Deklinationsseite Adjektiv
DECLENSION_TAGS = {
    "Starke Deklination": "strong",
    "Schwache Deklination": "weak",
    "Gemischte Deklination": "mixed",
}
# Miscellaneous labels -> one English tag (str) or several (list).
OTHER_TAGS = {
    # Vorlage:Deklinationsseite Adjektiv
    "Prädikativ": "predicative",
    "erweiterte": "extended",
    "Höflichkeitsform": "honorific",
    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "nichterweitert": "not-extended",
    "erweitert": "extended",
    "zeitlich": "temporal",
    "indeklinabel": "indeclinable",
    "östlich": "Eastern",
    "westlich": "Western",
    "britisch": "British",
    "Substantive": "noun",
    "Substantiv": "noun",
    "historisch": "historical",
    "wörtlich": "literally",
    "Adjektiv": "adjective",
    "gehoben": "literary",
    "Nebenform von": "variant",
    "Verben": "verb",
    "regional": "regional",
    # Vorlage:CH&LI
    "Schweiz und Liechtenstein": ["Switzerland", "Liechtenstein"],
    "Switzerland and Liechtenstein": ["Switzerland", "Liechtenstein"],
    "traditionell": "traditional",
    "vereinfachte Schreibweise": "simplified",
    "US-amerikanisch": "US",
    "Adjektive": "adjective",
    "australisch": "Australian",
    "scherzhaft": "humorous",
    "Minuskel": "lowercase",
    "Majuskel": "uppercase",
    "bildungssprachlich": "formal",
    "Imperativ Singular": ["imperative", "singular"],
    "meist": "usually",
    "deutsch": "German",
    "pariserisch": "Parisian",
    "derb": "impolite",
    "poetisch": "poetic",
    "Adverb": "adverb",
    # Spelled like the "südd." / "süddt." abbreviations in K_TEMPLATE_TAGS
    # so the same dialect always yields the same tag.
    "süddeutsch": "South German",
    "Verb": "verb",
    "kanadisch": "Canadian",
    "Supinum": "supine",
    "Kanada": "Canada",
    "vulgär": "vulgar",
    "metonymisch": "metonymically",
    "veraltet": "dated",
    "kolumbianisch": "Colombian",
    "Medial": "medial",
    "Pinyin": "Pinyin",
    "Wade-Giles": "Wade-Giles",
    "umgangssprachlich": "colloquial",
    "literarisch": "literary",
    "franz.": "French",
    "engl.": "English",
}
# Tense and aspect labels -> tag.
TENSE_TAGS = {
    # from Vorlage:Deutsch Verb Übersicht
    "Präsens": "present",
    "Präteritum": "past",
    "Perfekt": "perfect",
    "Futur I": "future-i",
    "Futur II": "future-ii",
    "Plusquamperfekt": "pluperfect",
    # from Template:Kroatisch Verb Übersicht
    "perfektiv": "perfective",
    "imperfektiv": "imperfective",
    "Imperfekt": "imperfect",
    "Futur": "future",
}
# Verb-mood labels (plus the English verb-table headers) -> tag(s).
MOOD_TAGS = {
    # from Vorlage:Deutsch Verb Übersicht and Vorlage:Deutsch Verb regelmäßig
    "Konjunktiv I": "subjunctive-i",
    "Konjunktiv II": "subjunctive-ii",
    "Imperativ": "imperative",
    "Imperative": "imperative",
    "Indikativ": "indicative",
    # from Template:Schwedisch Verb Übersicht
    "Konjunktiv": "subjunctive",
    "Konditional": "conditional",
    # from Template:Englisch Verb Übersicht
    "simple present": "present",
    "simple past": "past",
    "present participle": ["present", "participle"],
    "past participle": ["past", "participle"],
}
# Verb-form labels -> tag (str) or tags (list).
VERB_FORM_TAGS = {
    # from Vorlage:Deutsch Verb Übersicht
    "Partizip II": "participle-2",
    "Hilfsverb": "auxiliary",
    "Infinitive": "infinitive",
    "Infinitiv": "infinitive",
    "Partizipien": "participle",
    "unregelmäßig": "irregular",
    "Aorist": "aorist",
    # from Template:Dänisch Verb Übersicht
    "Partizip Perfekt": ["participle", "perfect"],
}
# Voice labels found in verb conjugation tables -> tag.
VOICE_TAGS = {
    # from Vorlage:Deutsch Verb unregelmäßig
    "Aktiv": "active",
    "Vorgangspassiv": "processual-passive",
    "Zustandspassiv": "statal-passive",
    "Passiv": "passive",
    "Gerundivum": "gerundive",
    # from Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "Zustandsreflexiv": "statal-reflexive",
}
# Person/number headers -> [person tag, number tag].
PERSON_TAGS = {
    # long form, from Vorlage:Deutsch Verb unregelmäßig
    "1. Person Singular": ["first-person", "singular"],
    "1. Person Plural": ["first-person", "plural"],
    "2. Person Singular": ["second-person", "singular"],
    "2. Person Plural": ["second-person", "plural"],
    "3. Person Singular": ["third-person", "singular"],
    "3. Person Plural": ["third-person", "plural"],
    # abbreviated form, from Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "Sg. 1. Pers.": ["first-person", "singular"],
    "Pl. 1. Pers.": ["first-person", "plural"],
    "Sg. 2. Pers.": ["second-person", "singular"],
    "Pl. 2. Pers.": ["second-person", "plural"],
    "Sg. 3. Pers.": ["third-person", "singular"],
    "Pl. 3. Pers.": ["third-person", "plural"],
}
# Inflection-table headers -> one English tag (str) or several (list).
INFLECTION_TABLE_TAGS = {
    # Vorlage:Deutsch Verb regelmäßig
    "ungebräuchlich": "uncommon",
    "veraltet": "archaic",
    # Vorlage:Deutsch Verb schwach trennbar reflexiv
    "Nebensatzkonjugation": "subordinate-clause",
    "Hauptsatzkonjugation": "main-clause",
    "regelmäßig": "regular",
    "untrennbar": "inseparable",
    "trennbar": "separable",
    # Vorlage:Deutsch Nachname Übersicht
    "Singular m": ["singular", "masculine"],
    "Singular f": ["singular", "feminine"],
    # Vorlage:Deklinationsseite Numerale
    "bestimmt": "definite",
    "unbestimmt": "indefinite",
    "Unbestimmt": "indefinite",
    "mit Possessivpronomen": ["possessive", "pronoun"],
    # Template:Kroatisch Verb Übersicht
    "Partizip Präteritum Aktiv": ["past", "participle", "active"],
    # Vorlage:Bulgarisch Substantiv Übersicht f1
    "Singular bestimmt": ["singular", "definite"],
    "Plural bestimmt": ["plural", "definite"],
    # Vorlage:Schwedisch Verb Übersicht
    "Partizip Präsens": ["present", "participle"],
    # Template:Mazedonisch Substantiv Übersicht
    # NOTE(review): "Distalg" looks like a misspelling of "Distal"; it is
    # kept so existing behavior is unchanged. Confirm against the template
    # and drop it if it never occurs.
    "Distalg": "distal",
    "Distal": "distal",
    "Proximal": "proximal",
    "Zählform": "count-form",
}
# Single lookup table combining every tag table above.
# Tables are applied in the order listed, so when a label occurs in more
# than one table the later table's value wins (e.g. "veraltet" is defined in
# both OTHER_TAGS and INFLECTION_TABLE_TAGS).
GRAMMATICAL_TAGS = {
    label: translation
    for table in (
        K_TEMPLATE_TAGS,
        GENDER_TAGS,
        NUMBER_TAGS,
        CASE_TAGS,
        COMPARISON_TAGS,
        DECLENSION_TAGS,
        OTHER_TAGS,
        TENSE_TAGS,
        MOOD_TAGS,
        VERB_FORM_TAGS,
        VOICE_TAGS,
        PERSON_TAGS,
        INFLECTION_TABLE_TAGS,
    )
    for label, translation in table.items()
}
# Subject-area labels -> English topic.
# A value is either a topic name (str) or a dict {"topic": ..., "tag": ...}
# for labels that carry both a topic and an additional tag.
K_TEMPLATE_TOPICS = {
    "Biologie": "biology",
    "Linguistik": "linguistics",
    "Wortbildung": "morphology",
    "Behörde": "government",
    "Astronomie": "astronomy",
    "Immobilienbranche": "real-estate",
    "Kunst": "arts",
    "Informatik": "computing",
    "Nautik": "nautical",
    "Sport": "sports",
    "Schuhwerk": "footwear",
    "Textilien": "textiles",
    "Zahlungsmittel": "payment-method",
    "Ökologie": "ecology",
    "Internet": "Internet",
    "Religion": "religion",
    "Militärsprache": "military",
    "Systematik": "systematics",
    "Zoologie": "zoology",
    "Seefahrt": "seafaring",
    "Soldatensprache": {"topic": "military", "tag": "slang"},
    "Botanik": "botany",
    "Marine": "navy",
    "Informationstechnologie": "computing",
    "Betriebswirtschaftslehre": "business",
    "Recht": "law",
    "Elektronik": "electronics",
    "Emotion": "emotion",
    "Mathematik": "mathematics",
    # lowercase, like every other hyphenated topic in this table
    "Bürgerliches Recht": "civil-law",
    "Militär": "military",
    "Politik": "politics",
    "Werkzeug": "tools",
    "Medizin": "medicine",
    "Ornithologie": "ornithology",
    "Technik": "technology",
    "Waffentechnik": "weaponry",
    "Anatomie": "anatomy",
    "Fußball": "football",
    "Kartenspiel": "card-games",
    "Theoretische Informatik": "computing",
    "militärisch": "military",
    "Taxonomie": "taxonomy",
    "Chemie": "chemistry",
    "Alchimie": "alchemy",
    "Pharmazie": "medicine",
    "Wirtschaft": "economics",
    "wissenschaftlich": "scientific",
    "Gastronomie": "food",
    "Architektur": "architecture",
    "Geologie": "geology",
    "Philosophie": "philosophy",
    "Psychologie": "psychology",
    "Landwirtschaft": "agriculture",
    "Literatur": "literature",
    "Weinbau": "viticulture",
    "Meteorologie": "meteorology",
    "Kleidung": "clothing",
    "Bauwesen": "construction",
    "Geschichte": "history",
    "Christentum": "Christianity",
    "Mythologie": "mythology",
    "Grammatik": "grammar",
    "Elektrotechnik": "electrical-engineering",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Translate the German labels in ``data.raw_tags`` in place.

    Each raw tag is looked up first in ``GRAMMATICAL_TAGS`` and then, if
    ``data`` has a ``topics`` attribute, in ``K_TEMPLATE_TOPICS``:

    * a grammatical match appends its tag(s) to ``data.tags``;
    * a topic match appends the topic to ``data.topics`` and, for
      dict-valued entries, the accompanying tag to ``data.tags``;
    * anything else is kept in ``data.raw_tags``.

    Values already present in ``data.tags`` / ``data.topics`` are not added
    again. Matched raw tags are removed from ``data.raw_tags``.

    Args:
        data: Object exposing ``raw_tags`` and ``tags`` lists and,
            optionally, a ``topics`` list.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in GRAMMATICAL_TAGS:
            tag = GRAMMATICAL_TAGS[raw_tag]
            if isinstance(tag, str):
                if tag not in data.tags:
                    data.tags.append(tag)
            elif isinstance(tag, list):
                for t in tag:
                    if t not in data.tags:
                        data.tags.append(t)
        elif raw_tag in K_TEMPLATE_TOPICS and hasattr(data, "topics"):
            topic = K_TEMPLATE_TOPICS[raw_tag]
            if isinstance(topic, str):
                if topic not in data.topics:
                    data.topics.append(topic)
            elif isinstance(topic, dict):
                # Topic and tag are checked independently: the tag must not
                # be lost merely because an earlier raw tag already added
                # the same topic.
                if topic["topic"] not in data.topics:
                    data.topics.append(topic["topic"])
                if topic["tag"] not in data.tags:
                    data.tags.append(topic["tag"])
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated