Coverage for src/wiktextract/extractor/de/tags.py: 81%
37 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1from .models import WordEntry
3# Sense tags
4# https://de.wiktionary.org/wiki/Vorlage:K
5# https://de.wiktionary.org/wiki/Vorlage:K/Abk
# Maps labels and abbreviations emitted by the German Wiktionary "K" template
# to English tag names.  A value is either a single tag (str) or several tags
# (list of str).  Commented-out entries are known labels that have no
# translation yet; they are kept as a to-do list.
K_TEMPLATE_TAGS = {
    "Abl.": "ablative",
    "Ablativ": "ablative",
    "abw.": "derogatory",
    "abwertend": "derogatory",
    "AE": "US",
    "AmE": "US",
    "adv.": "adverbial",
    "Akkusativ": "accusative",
    "alemann.": "Alemannic",
    "alemannisch": "Alemannic",
    "allg.": "general",
    "allgemein": "general",
    "alltagsspr.": "colloquial",
    "amtsspr.": "officialese",
    # "ansonsten": "otherwise",  # combined with other text
    "attr.": "attributive",
    # "auch": "also",
    "bair.": "Bavarian",
    "bairisch": "Bavarian",
    "bar.": "Bavarian",
    "direktional": "directional",
    "BE": "British",
    "BrE": "British",
    "Bedva.": "outdated",
    "Bedvatd.": "outdated",
    "besonders": "especially",
    "veraltende Bedeutung": "outdated",
    # "bei": "",
    # "bes.": "especially",
    # "besonders": "especially",
    # "beziehungsweise": "",
    # "bzw.": "",
    # "bildungsspr.": "",
    # "bis": "",
    # "bisweilen": "",
    # "das": "",
    "Dativ": "dative",
    # "DDR": "",
    "Deutschland": "Germany",
    # "der": "",
    "dichter.": "poetic",
    "dichterisch": "poetic",
    # "die": "",
    "Dim.": "diminutive",
    "Dimin.": "diminutive",
    "Diminutiv": "diminutive",
    # "eher": "",
    "erzg.": "Erzgebirgisch",
    "erzgeb.": "Erzgebirgisch",
    "erzgebirgisch": "Erzgebirgisch",
    "euph.": "euphemistic",
    "fachspr.": "jargon",
    "fachsprachlich": "jargon",
    # English tag name; every other value in this table is English as well.
    "fam.": "familiar",
    "fig": "figurative",
    "fig.": "figurative",
    # "früher": "",
    # "gegenwartslateinisch": "",
    # Same tag as the spelled-out label "gehoben" in OTHER_TAGS.
    "geh.": "literary",
    "Genitiv": "genitive",
    "gsm": "Swiss German",
    "häufig": "often",
    "haben": "auxiliary",
    "hebben": "auxiliary",
    "hauptsächlich": "primarily",
    "hist.": "historical",
    "ieS": "narrowly",
    "i.e.S.": "narrowly",
    "i. e. S.": "narrowly",
    # "im": "",
    # "in": "",
    # "in Bezug auf": "relational",
    "indekl.": "indeclinable",
    # "insbes.": "",
    "Instrumental": "instrumental",
    "intrans.": "intransitive",
    "intransitiv": "intransitive",
    # "iPl": "in plural",
    "iron.": "ironic",
    # "iwS": "",
    # "jugendspr.": "",
    "kinderspr.": "childish",
    "kirchenlateinisch": "Church Latin",
    "klasslat.": "Classical Latin",
    "klassischlateinisch": "Classical Latin",
    "kPl.": "no-plural",
    "kein Plural": "no-plural",
    "kSg.": "no-singulative",
    "kSt.": "no-comparative",
    "kurz für": "short-form",
    "landsch.": "regional",
    "landschaftlich": "regional",
    "lautm.": "onomatopoeic",
    "lokal": "regional",
    "Ling.": "linguistics",
    "mA": "accusative",
    "md.": "Central German",
    "mdal.": "dialectal",
    "Med.": "medicine",  # topic
    # "meist": "mostly",
    # "meistens": "mostly",
    "metaphor.": "metaphoric",
    "meton.": "metonymically",
    "mG": "genitive",
    "mitteld.": "Central German",
    "mit Dativ": "with-dative",
    "mit Akkusativ": "with-accusative",
    # "mitunter": "",
    "mlat.": "Medieval Latin",
    "mittellateinisch": "Medieval Latin",
    "mundartl.": "dialectal",
    "nDu.": "only-dual",
    "nigr.": "Niger",
    "nigrisch": "Niger",
    "nkLat.": "post-Classical Latin",
    "nachklassischlateinisch": "post-Classical Latin",
    "nlat.": "New Latin",
    "neulateinisch": "New Latin",
    "nordd.": "North German",
    "norddeutsch": "North German",
    "nordwestd.": "Northwestern Germany",
    "nPl.": "plural-only",
    "Österreich": "Austrian German",
    "örtlich": "regional",
    "österr.": "Austrian German",
    "österreichisch": "Austrian German",
    "ostfränkisch": "East Franconian German",
    "pej.": "pejorative",
    "personifizierend": "person",
    "poet.": "poetic",
    "PräpmG": "genitive prepositional",
    "PmG": "genitive prepositional",
    "reg.": "regional",
    "refl.": "reflexive",
    "reflexiv": "reflexive",
    # "respektive": "",
    "sal.": "casual",
    "salopp": "casual",
    "scherzh.": "jocular",
    "schriftspr.": "literary",
    # "schülerspr.": "",
    "schwäb.": "Swabian",
    "schwäbisch": "Swabian",
    "Schweiz": "Swiss Standard German",
    "schweiz.": "Swiss Standard German",
    "schweizerisch": "Swiss Standard German",
    "Schweizerdeutsch": "Swiss German",
    "schweizerdeutsch": "Swiss German",
    # "seemannsspr.": "",
    # Same tag as "haben"/"hebben" above, so both auxiliaries agree.
    "sein": "auxiliary",
    # "sehr": "",  # very
    "selten": "rare",
    "seltener": "rare",
    "seltener auch": "rare",
    "soldatenspr.": ["military", "slang"],
    # "sonderspr.": "",
    # "sonst": "",
    # "sowie": "",
    "spätlat.": "Late Latin",
    "spätlateinisch": "Late Latin",
    # "später": "",
    "speziell": "special",
    "südd.": "South German",
    "süddt.": "South German",
    # "techn.": "",
    # "teils": "",
    # "teilweise": "",
    "temporal": "temporal",
    "tlwva.": "outdated",
    "tlwvatd.": "outdated",
    "trans.": "transitive",
    "transitiv": "transitive",
    # "über": "",
    # "überwiegend": "mostly",
    "übertr.": "figurative",
    "übertragen": "figurative",
    "ugs.": "colloquial",
    "umgangssprachlich": "colloquial",
    # "und": "",
    "ungebr.": "uncommon",
    "unpers.": "impersonal",
    "unpersönlich": "impersonal",
    # "ursprünglich": "",
    "va.": "outdated",
    "vatd.": "outdated",
    "veraltend": "outdated",
    # "verh.": "",
    "volkst.": "popular",
    # "von": "",
    # "vor allem": "",
    # "vor allem in": "",
    "vul.": "vulgar",
    "vulg.": "vulgar",
    "vlat.": ["vulgar", "Latin"],
    "vulgärlat": ["vulgar", "Latin"],
    "vulgärlateinisch": ["vulgar", "Latin"],
    "wien.": "Vienna",
    "wienerisch": "Vienna",
    "Wpräp": "prepositional",
    # "z. B.": "",
    # "z. T.": "",
    # "zijn": "",
    # "zum Beispiel": "",
    # "zum Teil": "",
    # "zumeist": "",
    "Kardinalzahl": "cardinal",
    "Sammelbegriff": "collective",
    "Fachsprache": "jargon",
    "formale Sprachen": "formal",
    "Programmiersprachen": "programming",
    "Rechnerarchitektur": "programming",
    "Geografie": "geography",
    "Geometrie": "geometry",
    "Finanzwesen": "finance",
    "juristisch": "law",
    "Physik": "physics",
    "abstrakt": "abstract",
    "gegenständlich": "objective",
    "personifiziert": "personal",
    "kirchlich": "Ecclesiastical",
}
# Grammatical gender labels.  Values are a single tag or a list of tags.
GENDER_TAGS = {
    # Single-letter gender markers
    "n": "neuter",
    "m": "masculine",
    "f": "feminine",
    "u": "common",
    # Combined gender markers
    "m, f": ["masculine", "feminine"],  # Template:mf
    "m, f, n": ["masculine", "feminine", "neuter"],  # Template:mfn
    "f, n": ["feminine", "neuter"],  # Template:fn
    # Column headers of Template:Deklinationsseite Adjektiv
    "Maskulinum": "masculine",
    "Femininum": "feminine",
    "Neutrum": "neuter",
    # Gender combined with plural
    "f Pl.": ["feminine", "plural"],  # Template:fPl.
    "m Pl.": ["masculine", "plural"],  # Template:mPl.
    "n Pl.": ["neuter", "plural"],  # Template:nPl.
    "u Pl.": ["common", "plural"],  # Template:uPl.
}
# Grammatical number labels (Template:Deutsch Substantiv Übersicht).
NUMBER_TAGS = {
    "Singular": "singular",
    "Plural": "plural",
    "Pl.": "plural",
    "pl": "plural",
    "Dual": "dual",
    # Restrictions on which number forms exist or are usual
    "ohne Plural": "no-plural",
    "meist im Plural": "plural-normally",
    "meist Plural": "plural-normally",
    "nur Plural": "plural-only",
}
# Grammatical case labels found in noun overview templates.
CASE_TAGS = {
    # Template:Deutsch Substantiv Übersicht
    "Nominativ": "nominative",
    "Genitiv": "genitive",
    "Dativ": "dative",
    "Akkusativ": "accusative",
    # Template:Polnisch Substantiv Übersicht
    "Lokativ": "locative",
    "Vokativ": "vocative",
    "Dativ Singular": ["dative", "singular"],
    "Genitiv Singular": ["genitive", "singular"],
    # Template:Finnisch Substantiv Übersicht
    "Inessiv": "inessive",
    "Elativ": "elative",
    "Illativ": "illative",
    "Adessiv": "adessive",
    "Allativ": "allative",
    "Essiv": "essive",
    "Translativ": "translative",
    "Abessiv": "abessive",
    "Instruktiv": "instructive",
    "Komitativ": "comitative",
    "Partitiv": "partitive",
}
# Degrees of comparison, as used by Template:Deutsch Adjektiv Übersicht and
# Template:Deklinationsseite Adjektiv.
COMPARISON_TAGS = {
    "Positiv": "positive",
    "Komparativ": "comparative",
    "Superlativ": "superlative",
}
# Adjective declension classes (Template:Deklinationsseite Adjektiv).
# Background: https://en.wikipedia.org/wiki/German_declension
DECLENSION_TAGS = {
    "Starke Deklination": "strong",
    "Schwache Deklination": "weak",
    "Gemischte Deklination": "mixed",
}
301OTHER_TAGS = {
302 # Vorlage:Deklinationsseite Adjektiv
303 "Prädikativ": "predicative",
304 "erweiterte": "extended",
305 "Höflichkeitsform": "honorific",
306 # Vorlage:Deutsch Verb schwach untrennbar reflexiv
307 "nichterweitert": "not-extended",
308 "erweitert": "extended",
309 "zeitlich": "temporal",
310 "indeklinabel": "indeclinable",
311 "östlich": "Eastern",
312 "westlich": "Western",
313 "britisch": "British",
314 "Substantive": "noun",
315 "Substantiv": "noun",
316 "historisch": "historical",
317 "wörtlich": "literally",
318 "Adjektiv": "adjective",
319 "gehoben": "literary",
320 "Nebenform von": "variant",
321 "Verben": "verb",
322 "regional": "regional",
323 # Vorlage:CH&LI
324 "Schweiz und Liechtenstein": ["Switzerland", "Liechtenstein"],
325 "Switzerland and Liechtenstein": ["Switzerland", "Liechtenstein"],
326 "traditionell": "traditional",
327 "vereinfachte Schreibweise": "simplified",
328 "US-amerikanisch": "US",
329 "Adjektive": "adjective",
330 "australisch": "Australian",
331 "scherzhaft": "humorous",
332 "Minuskel": "lowercase",
333 "Majuskel": "uppercase",
334 "bildungssprachlich": "formal",
335 "Imperativ Singular": ["imperative", "singular"],
336 "meist": "usually",
337 "deutsch": "German",
338 "pariserisch": "Parisian",
339 "derb": "impolite",
340 "poetisch": "poetic",
341 "Adverb": "adverb",
342 "süddeutsch": "South-German",
343 "Verb": "verb",
344 "kanadisch": "Canadian",
345 "Supinum": "supine",
346 "Kanada": "Canada",
347 "vulgär": "vulgar",
348 "metonymisch": "metonymically",
349 "veraltet": "dated",
350 "kolumbianisch": "Colombian",
351 "Medial": "medial",
352 "Pinyin": "Pinyin",
353 "Wade-Giles": "Wade-Giles",
354 "umgangssprachlich": "colloquial",
355 "literarisch": "literary",
356}
# Tense and aspect labels found in verb overview templates.
TENSE_TAGS = {
    # Template:Deutsch Verb Übersicht
    "Präsens": "present",
    "Präteritum": "past",
    "Perfekt": "perfect",
    "Futur I": "future-i",
    "Futur II": "future-ii",
    "Plusquamperfekt": "pluperfect",
    # Template:Kroatisch Verb Übersicht
    "perfektiv": "perfective",
    "imperfektiv": "imperfective",
    "Imperfekt": "imperfect",
    "Futur": "future",
}
# Verb mood labels.
MOOD_TAGS = {
    # Template:Deutsch Verb Übersicht / Template:Deutsch Verb regelmäßig
    "Konjunktiv I": "subjunctive-i",
    "Konjunktiv II": "subjunctive-ii",
    "Imperativ": "imperative",
    "Imperative": "imperative",
    "Indikativ": "indicative",
    # Template:Schwedisch Verb Übersicht
    "Konjunktiv": "subjunctive",
    "Konditional": "conditional",
}
# Labels for non-finite verb forms and verb classification.
VERB_FORM_TAGS = {
    # Template:Deutsch Verb Übersicht
    "Partizip II": "participle-2",
    "Hilfsverb": "auxiliary",
    "Infinitive": "infinitive",
    "Infinitiv": "infinitive",
    "Partizipien": "participle",
    "unregelmäßig": "irregular",
    "Aorist": "aorist",
    # Template:Dänisch Verb Übersicht
    "Partizip Perfekt": ["participle", "perfect"],
}
# Voice-related labels from the verb conjugation templates.
VOICE_TAGS = {
    # Template:Deutsch Verb unregelmäßig
    "Aktiv": "active",
    "Vorgangspassiv": "processual-passive",
    "Zustandspassiv": "statal-passive",
    "Passiv": "passive",
    "Gerundivum": "gerundive",
    # Template:Deutsch Verb schwach untrennbar reflexiv
    "Zustandsreflexiv": "statal-reflexive",
}
# Person/number labels; each one expands to a [person, number] tag pair.
PERSON_TAGS = {
    # Long form (Template:Deutsch Verb unregelmäßig)
    "1. Person Singular": ["first-person", "singular"],
    "1. Person Plural": ["first-person", "plural"],
    "2. Person Singular": ["second-person", "singular"],
    "2. Person Plural": ["second-person", "plural"],
    "3. Person Singular": ["third-person", "singular"],
    "3. Person Plural": ["third-person", "plural"],
    # Abbreviated form (Template:Deutsch Verb schwach untrennbar reflexiv)
    "Sg. 1. Pers.": ["first-person", "singular"],
    "Pl. 1. Pers.": ["first-person", "plural"],
    "Sg. 2. Pers.": ["second-person", "singular"],
    "Pl. 2. Pers.": ["second-person", "plural"],
    "Sg. 3. Pers.": ["third-person", "singular"],
    "Pl. 3. Pers.": ["third-person", "plural"],
}
# Labels that occur in inflection table headers and cells.
# Values are a single tag (str) or several tags (list of str).
INFLECTION_TABLE_TAGS = {
    # Template:Deutsch Verb regelmäßig
    "ungebräuchlich": "uncommon",
    "veraltet": "archaic",
    # Template:Deutsch Verb schwach trennbar reflexiv
    "Nebensatzkonjugation": "subordinate-clause",
    "Hauptsatzkonjugation": "main-clause",
    "regelmäßig": "regular",
    "untrennbar": "inseparable",
    "trennbar": "separable",
    # Template:Deutsch Nachname Übersicht
    "Singular m": ["singular", "masculine"],
    "Singular f": ["singular", "feminine"],
    # Template:Deklinationsseite Numerale
    "bestimmt": "definite",
    "unbestimmt": "indefinite",
    "Unbestimmt": "indefinite",
    "mit Possessivpronomen": ["possessive", "pronoun"],
    # Template:Kroatisch Verb Übersicht
    "Partizip Präteritum Aktiv": ["past", "participle", "active"],
    # Template:Bulgarisch Substantiv Übersicht f1
    "Singular bestimmt": ["singular", "definite"],
    "Plural bestimmt": ["plural", "definite"],
    # Template:Schwedisch Verb Übersicht
    "Partizip Präsens": ["present", "participle"],
    # Template:Mazedonisch Substantiv Übersicht
    # NOTE(review): "Distalg" looks like a misspelling of "Distal"; it is kept
    # in case the template really emits it, and the expected spelling is
    # accepted as well.  TODO confirm against the template and drop one.
    "Distalg": "distal",
    "Distal": "distal",
    "Proximal": "proximal",
    "Zählform": "count-form",
}
# Single lookup table combining every tag table above.  The tables are merged
# in the order listed, so when a label occurs in more than one table the value
# from the table listed last is the one that ends up here (for example
# "veraltet" is defined in both OTHER_TAGS and INFLECTION_TABLE_TAGS).
GRAMMATICAL_TAGS = {
    label: translation
    for table in (
        K_TEMPLATE_TAGS,
        GENDER_TAGS,
        NUMBER_TAGS,
        CASE_TAGS,
        COMPARISON_TAGS,
        DECLENSION_TAGS,
        OTHER_TAGS,
        TENSE_TAGS,
        MOOD_TAGS,
        VERB_FORM_TAGS,
        VOICE_TAGS,
        PERSON_TAGS,
        INFLECTION_TABLE_TAGS,
    )
    for label, translation in table.items()
}
# Maps subject-area labels of the "K" template to English topic names.
# A value is either a topic name (str) or a dict with the keys "topic" and
# "tag" when the label implies both a topic and an additional tag.
K_TEMPLATE_TOPICS = {
    "Biologie": "biology",
    "Linguistik": "linguistics",
    "Wortbildung": "morphology",
    "Behörde": "government",
    "Astronomie": "astronomy",
    "Immobilienbranche": "real-estate",
    "Kunst": "arts",
    "Informatik": "computing",
    "Nautik": "nautical",
    "Sport": "sports",
    "Schuhwerk": "footwear",
    "Textilien": "textiles",
    "Zahlungsmittel": "payment-method",
    "Ökologie": "ecology",
    "Internet": "Internet",
    "Religion": "religion",
    "Militärsprache": "military",
    "Systematik": "systematics",
    "Zoologie": "zoology",
    "Seefahrt": "seafaring",
    "Soldatensprache": {"topic": "military", "tag": "slang"},
    "Botanik": "botany",
    "Marine": "navy",
    "Informationstechnologie": "computing",
    "Betriebswirtschaftslehre": "business",
    "Recht": "law",
    "Elektronik": "electronics",
    "Emotion": "emotion",
    "Mathematik": "mathematics",
    # Lower-case like the other hyphenated common-noun topics in this table.
    "Bürgerliches Recht": "civil-law",
    "Militär": "military",
    "Politik": "politics",
    "Werkzeug": "tools",
    "Medizin": "medicine",
    "Ornithologie": "ornithology",
    "Technik": "technology",
    "Waffentechnik": "weaponry",
    "Anatomie": "anatomy",
    "Fußball": "football",
    "Kartenspiel": "card-games",
    "Theoretische Informatik": "computing",
    "militärisch": "military",
    "Taxonomie": "taxonomy",
    "Chemie": "chemistry",
    "Alchimie": "alchemy",
    "Pharmazie": "medicine",
    "Wirtschaft": "economics",
    "wissenschaftlich": "scientific",
    "Gastronomie": "food",
    "Architektur": "architecture",
    "Geologie": "geology",
    "Philosophie": "philosophy",
    "Psychologie": "psychology",
    "Landwirtschaft": "agriculture",
    "Literatur": "literature",
    "Weinbau": "viticulture",
    "Meteorologie": "meteorology",
    "Kleidung": "clothing",
    "Bauwesen": "construction",
    "Geschichte": "history",
    "Christentum": "Christianity",
    "Mythologie": "mythology",
    "Grammatik": "grammar",
    "Elektrotechnik": "electrical-engineering",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Translate the known entries of ``data.raw_tags`` in place.

    Each raw tag is handled as follows:

    * found in ``GRAMMATICAL_TAGS``: its translation (one tag or a list of
      tags) is appended to ``data.tags``;
    * otherwise found in ``K_TEMPLATE_TOPICS`` and ``data`` has a ``topics``
      attribute: the topic is appended to ``data.topics``; a dict-valued
      entry additionally contributes its ``"tag"`` to ``data.tags``;
    * anything else is kept in ``data.raw_tags``.

    Values already present in ``data.tags`` / ``data.topics`` are not added a
    second time.  ``data.raw_tags`` is replaced by a new list that holds only
    the untranslated raw tags, in their original order.
    """
    untranslated = []
    # Not every object passed in has a "topics" field; without one, topic
    # labels stay in raw_tags.
    has_topics = hasattr(data, "topics")
    for raw_tag in data.raw_tags:
        if raw_tag in GRAMMATICAL_TAGS:
            translated = GRAMMATICAL_TAGS[raw_tag]
            if isinstance(translated, str):
                translated = [translated]
            for tag in translated:
                if tag not in data.tags:
                    data.tags.append(tag)
        elif has_topics and raw_tag in K_TEMPLATE_TOPICS:
            topic = K_TEMPLATE_TOPICS[raw_tag]
            if isinstance(topic, dict):
                # The extra tag is added independently of the topic, so it is
                # not lost when the topic is already present on the entry.
                if topic["tag"] not in data.tags:
                    data.tags.append(topic["tag"])
                topic = topic["topic"]
            if topic not in data.topics:
                data.topics.append(topic)
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated