Coverage for src/wiktextract/extractor/de/tags.py: 81%
37 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from .models import WordEntry
# Sense tags
# Maps labels of the German Wiktionary "K" template (full words and their
# abbreviations) to English tags.  A value is either a single tag or a list
# of tags.  Commented-out entries are known labels without a translation yet.
# https://de.wiktionary.org/wiki/Vorlage:K
# https://de.wiktionary.org/wiki/Vorlage:K/Abk
K_TEMPLATE_TAGS = {
    "Abl.": "ablative",
    "Ablativ": "ablative",
    "abw.": "derogatory",
    "abwertend": "derogatory",
    "AE": "US",
    "AmE": "US",
    "adv.": "adverbial",
    "Akkusativ": "accusative",
    "alemann.": "Alemannic",
    "alemannisch": "Alemannic",
    "allg.": "general",
    "allgemein": "general",
    "alltagsspr.": "colloquial",
    "amtsspr.": "officialese",
    # "ansonsten": "otherwise",  # combined with other text
    "attr.": "attributive",
    # "auch": "also",
    "bair.": "Bavarian",
    "bairisch": "Bavarian",
    "bar.": "Bavarian",
    "direktional": "directional",
    "BE": "British",
    "BrE": "British",
    "Bedva.": "outdated",
    "Bedvatd.": "outdated",
    "besonders": "especially",
    "veraltende Bedeutung": "outdated",
    # "bei": "",
    # "bes.": "especially",
    # "beziehungsweise": "",
    # "bzw.": "",
    # "bildungsspr.": "",
    # "bis": "",
    # "bisweilen": "",
    # "das": "",
    "Dativ": "dative",
    # "DDR": "",
    "Deutschland": "Germany",
    # "der": "",
    "dichter.": "poetic",
    # "die": "",
    "Dim.": "diminutive",
    "Dimin.": "diminutive",
    "Diminutiv": "diminutive",
    # "eher": "",
    "erzg.": "Erzgebirgisch",
    "erzgeb.": "Erzgebirgisch",
    "erzgebirgisch": "Erzgebirgisch",
    "euph.": "euphemistic",
    "fachspr.": "jargon",
    "fachsprachlich": "jargon",
    # Was the untranslated German word "familiär".
    "fam.": "familiar",
    "fig": "figurative",
    "fig.": "figurative",
    # "früher": "",
    # "gegenwartslateinisch": "",
    # Was the untranslated German word "gehoben"; the spelled-out label
    # "gehoben" is mapped to "literary" in OTHER_TAGS.
    "geh.": "literary",
    "Genitiv": "genitive",
    "gsm": "Swiss German",
    "häufig": "often",
    "haben": "auxiliary",
    "hebben": "auxiliary",
    "hauptsächlich": "primarily",
    "hist.": "historical",
    "ieS": "narrowly",
    "i.e.S.": "narrowly",
    "i. e. S.": "narrowly",
    # "im": "",
    # "in": "",
    # "in Bezug auf": "relational",
    "indekl.": "indeclinable",
    # "insbes.": "",
    "Instrumental": "instrumental",
    "intrans.": "intransitive",
    "intransitiv": "intransitive",
    # "iPl": "in plural",
    "iron.": "ironic",
    # "iwS": "",
    # "jugendspr.": "",
    "kinderspr.": "childish",
    "kirchenlateinisch": "Church Latin",
    "klasslat.": "Classical Latin",
    "klassischlateinisch": "Classical Latin",
    "kPl.": "no-plural",
    "kein Plural": "no-plural",
    "kSg.": "no-singulative",
    "kSt.": "no-comparative",
    "kurz für": "short-form",
    "landsch.": "regional",
    "landschaftlich": "regional",
    "lautm.": "onomatopoeic",
    "lokal": "regional",
    "Ling.": "linguistics",
    "mA": "accusative",
    "md.": "Central German",
    "mdal.": "dialectal",
    "Med.": "medicine",  # topic
    # "meist": "mostly",
    # "meistens": "mostly",
    "metaphor.": "metaphoric",
    "meton.": "metonymically",
    "mG": "genitive",
    "mitteld.": "Central German",
    "mit Dativ": "with-dative",
    "mit Akkusativ": "with-accusative",
    # "mitunter": "",
    "mlat.": "Medieval Latin",
    "mittellateinisch": "Medieval Latin",
    "mundartl.": "dialectal",
    "nDu.": "only-dual",
    "nigr.": "Niger",
    "nigrisch": "Niger",
    "nkLat.": "post-Classical Latin",
    "nachklassischlateinisch": "post-Classical Latin",
    "nlat.": "New Latin",
    "neulateinisch": "New Latin",
    "nordd.": "North German",
    "norddeutsch": "North German",
    "nordwestd.": "Northwestern Germany",
    "nPl.": "plural-only",
    "Österreich": "Austrian German",
    "örtlich": "regional",
    "österr.": "Austrian German",
    "österreichisch": "Austrian German",
    "ostfränkisch": "East Franconian German",
    "pej.": "pejorative",
    "personifizierend": "person",
    "poet.": "poetic",
    "PräpmG": "genitive prepositional",
    "PmG": "genitive prepositional",
    "reg.": "regional",
    "refl.": "reflexive",
    "reflexiv": "reflexive",
    # "respektive": "",
    "sal.": "casual",
    "salopp": "casual",
    "scherzh.": "jocular",
    "schriftspr.": "literary",
    # "schülerspr.": "",
    "schwäb.": "Swabian",
    "schwäbisch": "Swabian",
    "Schweiz": "Swiss Standard German",
    "schweiz.": "Swiss Standard German",
    "schweizerisch": "Swiss Standard German",
    "Schweizerdeutsch": "Swiss German",
    "schweizerdeutsch": "Swiss German",
    # "seemannsspr.": "",
    # Same tag as "haben" / "hebben" (was "auxiliary verb").
    "sein": "auxiliary",
    # "sehr": "",  # very
    "selten": "rare",
    "seltener": "rare",
    "seltener auch": "rare",
    "soldatenspr.": ["military", "slang"],
    # "sonderspr.": "",
    # "sonst": "",
    # "sowie": "",
    "spätlat.": "Late Latin",
    "spätlateinisch": "Late Latin",
    # "später": "",
    "speziell": "special",
    "südd.": "South German",
    "süddt.": "South German",
    # "techn.": "",
    # "teils": "",
    # "teilweise": "",
    "temporal": "temporal",
    "tlwva.": "outdated",
    "tlwvatd.": "outdated",
    "trans.": "transitive",
    "transitiv": "transitive",
    # "über": "",
    # "überwiegend": "mostly",
    "übertr.": "figurative",
    "übertragen": "figurative",
    "ugs.": "colloquial",
    "umgangssprachlich": "colloquial",
    # "und": "",
    "ungebr.": "uncommon",
    "unpers.": "impersonal",
    "unpersönlich": "impersonal",
    # "ursprünglich": "",
    "va.": "outdated",
    "vatd.": "outdated",
    "veraltend": "outdated",
    # "verh.": "",
    "volkst.": "popular",
    # "von": "",
    # "vor allem": "",
    # "vor allem in": "",
    "vul.": "vulgar",
    "vulg.": "vulgar",
    "vlat.": ["vulgar", "Latin"],
    "vulgärlat": ["vulgar", "Latin"],
    "vulgärlateinisch": ["vulgar", "Latin"],
    "wien.": "Vienna",
    "wienerisch": "Vienna",
    "Wpräp": "prepositional",
    # "z. B.": "",
    # "z. T.": "",
    # "zijn": "",
    # "zum Beispiel": "",
    # "zum Teil": "",
    # "zumeist": "",
    "Kardinalzahl": "cardinal",
    "Sammelbegriff": "collective",
    "Fachsprache": "jargon",
    "formale Sprachen": "formal",
    "Programmiersprachen": "programming",
    "Rechnerarchitektur": "programming",
    "Geografie": "geography",
    "Geometrie": "geometry",
    "Finanzwesen": "finance",
    "juristisch": "law",
    "Physik": "physics",
    "abstrakt": "abstract",
    "gegenständlich": "objective",
    "personifiziert": "personal",
    "kirchlich": "Ecclesiastical",
}
# German grammatical-gender labels -> English tags.
GENDER_TAGS = {
    # one-letter abbreviations
    "n": "neuter",
    "m": "masculine",
    "f": "feminine",
    # spelled-out forms, cf. Vorlage:Deklinationsseite Adjektiv
    "Maskulinum": "masculine",
    "Femininum": "feminine",
    "Neutrum": "neuter",
}
# German grammatical-number labels -> English tags.
# cf. Vorlage:Deutsch Substantiv Übersicht
NUMBER_TAGS = {
    "Singular": "singular",
    "Plural": "plural",
    "Pl.": "plural",
    "Dual": "dual",
}
# German grammatical-case labels -> English tags.  A label that also names
# a number maps to a list of tags.
CASE_TAGS = {
    # cf. Vorlage:Deutsch Substantiv Übersicht
    "Nominativ": "nominative",
    "Genitiv": "genitive",
    "Dativ": "dative",
    "Akkusativ": "accusative",
    # cf. Template:Polnisch Substantiv Übersicht
    "Lokativ": "locative",
    "Vokativ": "vocative",
    "Dativ Singular": ["dative", "singular"],
    "Genitiv Singular": ["genitive", "singular"],
    # cf. Template:Finnisch Substantiv Übersicht
    "Inessiv": "inessive",
    "Elativ": "elative",
    "Illativ": "illative",
    "Adessiv": "adessive",
    "Allativ": "allative",
    "Essiv": "essive",
    "Translativ": "translative",
    "Abessiv": "abessive",
    "Instruktiv": "instructive",
    "Komitativ": "comitative",
}
# German degree-of-comparison labels -> English tags.
# cf. Vorlage:Deutsch Adjektiv Übersicht, Vorlage:Deklinationsseite Adjektiv
COMPARISON_TAGS = {
    "Positiv": "positive",
    "Komparativ": "comparative",
    "Superlativ": "superlative",
}
# German adjective-declension class labels -> English tags.
# https://en.wikipedia.org/wiki/German_declension
# cf. Vorlage:Deklinationsseite Adjektiv
DECLENSION_TAGS = {
    "Starke Deklination": "strong",
    "Schwache Deklination": "weak",
    "Gemischte Deklination": "mixed",
}
# Miscellaneous German labels -> English tags.  A value is a single tag or
# a list of tags.
OTHER_TAGS = {
    # cf. Vorlage:Deklinationsseite Adjektiv
    "Prädikativ": "predicative",
    "erweiterte": "extended",
    "Höflichkeitsform": "honorific",
    # cf. Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "nichterweitert": "not-extended",
    "erweitert": "extended",
    "zeitlich": "temporal",
    "indeklinabel": "indeclinable",
    "östlich": "Eastern",
    "westlich": "Western",
    "britisch": "British",
    "Substantive": "noun",
    "Substantiv": "noun",
    "historisch": "historical",
    "wörtlich": "literally",
    "Adjektiv": "adjective",
    "gehoben": "literary",
    "Nebenform von": "variant",
    "Verben": "verb",
    "regional": "regional",
    # cf. Vorlage:CH&LI
    "Schweiz und Liechtenstein": ["Switzerland", "Liechtenstein"],
    "Switzerland and Liechtenstein": ["Switzerland", "Liechtenstein"],
    "traditionell": "traditional",
    "vereinfachte Schreibweise": "simplified",
}
# German tense and aspect labels -> English tags.
TENSE_TAGS = {
    # cf. Vorlage:Deutsch Verb Übersicht
    "Präsens": "present",
    "Präteritum": "past",
    "Perfekt": "perfect",
    "Futur I": "future-i",
    "Futur II": "future-ii",
    "Plusquamperfekt": "pluperfect",
    # cf. Template:Kroatisch Verb Übersicht
    "perfektiv": "perfective",
    "imperfektiv": "imperfective",
    "Imperfekt": "imperfect",
}
# German verb-mood labels -> English tags.
# cf. Vorlage:Deutsch Verb Übersicht, Vorlage:Deutsch Verb regelmäßig
MOOD_TAGS = {
    "Konjunktiv I": "subjunctive-i",
    "Konjunktiv II": "subjunctive-ii",
    "Imperativ": "imperative",
    "Imperative": "imperative",
    "Indikativ": "indicative",
}
# German verb-form labels -> English tags.  A value is a single tag or a
# list of tags.
VERB_FORM_TAGS = {
    # cf. Vorlage:Deutsch Verb Übersicht
    "Partizip II": "participle-2",
    "Hilfsverb": "auxiliary",
    "Infinitive": "infinitive",
    "Infinitiv": "infinitive",
    "Partizipien": "participle",
    "unregelmäßig": "irregular",
    "Aorist": "aorist",
    # cf. Template:Dänisch Verb Übersicht
    "Partizip Perfekt": ["participle", "perfect"],
}
# German voice labels -> English tags.
VOICE_TAGS = {
    # cf. Vorlage:Deutsch Verb unregelmäßig
    "Aktiv": "active",
    "Vorgangspassiv": "processual-passive",
    "Zustandspassiv": "statal-passive",
    "Passiv": "passive",
    "Gerundivum": "gerundive",
    # cf. Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "Zustandsreflexiv": "statal-reflexive",
}
# German person/number labels -> [person tag, number tag].
# Two spellings are covered, both generated person by person with singular
# before plural:
#   "<n>. Person <Singular|Plural>"  cf. Vorlage:Deutsch Verb unregelmäßig
#   "<Sg.|Pl.> <n>. Pers."           cf. Vorlage:Deutsch Verb schwach
#                                        untrennbar reflexiv
PERSON_TAGS = {
    **{
        f"{ordinal} Person {number}": [person, number.lower()]
        for ordinal, person in (
            ("1.", "first-person"),
            ("2.", "second-person"),
            ("3.", "third-person"),
        )
        for number in ("Singular", "Plural")
    },
    **{
        f"{abbr} {ordinal} Pers.": [person, number]
        for ordinal, person in (
            ("1.", "first-person"),
            ("2.", "second-person"),
            ("3.", "third-person"),
        )
        for abbr, number in (("Sg.", "singular"), ("Pl.", "plural"))
    },
}
# German inflection-table labels -> English tags.  A value is a single tag
# or a list of tags.
INFLECTION_TABLE_TAGS = {
    # cf. Vorlage:Deutsch Verb regelmäßig
    "ungebräuchlich": "uncommon",
    "veraltet": "archaic",
    # cf. Vorlage:Deutsch Verb schwach trennbar reflexiv
    "Nebensatzkonjugation": "subordinate-clause",
    "Hauptsatzkonjugation": "main-clause",
    "regelmäßig": "regular",
    "untrennbar": "inseparable",
    "trennbar": "separable",
    # cf. Vorlage:Deutsch Nachname Übersicht
    "Singular m": ["singular", "masculine"],
    "Singular f": ["singular", "feminine"],
    # cf. Vorlage:Deklinationsseite Numerale
    "bestimmt": "definite",
    "unbestimmt": "indefinite",
    "mit Possessivpronomen": ["possessive", "pronoun"],
    # cf. Template:Kroatisch Verb Übersicht
    "Partizip Präteritum Aktiv": ["past", "participle", "active"],
}
# Single lookup table combining every tag table of this module.  The tables
# are merged in the order listed; when a label occurs in more than one table
# the value from the later table is the one kept.
GRAMMATICAL_TAGS = {
    label: tag
    for table in (
        K_TEMPLATE_TAGS,
        GENDER_TAGS,
        NUMBER_TAGS,
        CASE_TAGS,
        COMPARISON_TAGS,
        DECLENSION_TAGS,
        OTHER_TAGS,
        TENSE_TAGS,
        MOOD_TAGS,
        VERB_FORM_TAGS,
        VOICE_TAGS,
        PERSON_TAGS,
        INFLECTION_TABLE_TAGS,
    )
    for label, tag in table.items()
}
# Maps subject-area labels to English topic names.  A value is either a
# topic name or a dict {"topic": ..., "tag": ...} for a label that yields
# both a topic and a tag.
K_TEMPLATE_TOPICS = {
    "Biologie": "biology",
    "Linguistik": "linguistics",
    "Wortbildung": "morphology",
    "Behörde": "government",
    "Astronomie": "astronomy",
    "Immobilienbranche": "real-estate",
    "Kunst": "arts",
    "Informatik": "computing",
    "Nautik": "nautical",
    "Sport": "sports",
    "Schuhwerk": "footwear",
    "Textilien": "textiles",
    "Zahlungsmittel": "payment-method",
    "Ökologie": "ecology",
    "Internet": "Internet",
    "Religion": "religion",
    "Militärsprache": "military",
    "Systematik": "systematics",
    "Zoologie": "zoology",
    "Seefahrt": "seafaring",
    "Soldatensprache": {"topic": "military", "tag": "slang"},
    "Botanik": "botany",
    "Marine": "navy",
    "Informationstechnologie": "computing",
    "Betriebswirtschaftslehre": "business",
    "Recht": "law",
    "Elektronik": "electronics",
    "Emotion": "emotion",
    "Mathematik": "mathematics",
    # Lower-cased to match the other hyphenated topics (was "civil-Law").
    "Bürgerliches Recht": "civil-law",
    "Militär": "military",
    "Politik": "politics",
    "Werkzeug": "tools",
    "Medizin": "medicine",
    "Ornithologie": "ornithology",
    "Technik": "technology",
    "Waffentechnik": "weaponry",
    "Anatomie": "anatomy",
    "Fußball": "football",
    "Kartenspiel": "card-games",
    "Theoretische Informatik": "computing",
    "militärisch": "military",
    "Taxonomie": "taxonomy",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Move recognised labels from ``data.raw_tags`` into tags and topics.

    Each raw tag is looked up in ``GRAMMATICAL_TAGS`` first and, when ``data``
    has a ``topics`` attribute, in ``K_TEMPLATE_TOPICS`` second.  The mapped
    values are appended to ``data.tags`` / ``data.topics`` unless already
    present, and the raw tag is consumed.  Labels found in neither table are
    kept, in their original order, as the new ``data.raw_tags``.

    ``data`` is modified in place; nothing is returned.
    """
    unknown_raw_tags = []
    for raw_tag in data.raw_tags:
        if raw_tag in GRAMMATICAL_TAGS:
            mapped = GRAMMATICAL_TAGS[raw_tag]
            # A table value is either one tag or a list of tags.
            new_tags = [mapped] if isinstance(mapped, str) else mapped
            new_topics = []
        elif raw_tag in K_TEMPLATE_TOPICS and hasattr(data, "topics"):
            mapped = K_TEMPLATE_TOPICS[raw_tag]
            if isinstance(mapped, dict):
                # Combined entry carrying both a topic and a tag.  The two
                # are added independently: previously the tag was skipped
                # whenever the topic was already present, silently dropping
                # it even though the raw tag was consumed.
                new_topics = [mapped["topic"]]
                new_tags = [mapped["tag"]]
            else:
                new_topics = [mapped]
                new_tags = []
        else:
            unknown_raw_tags.append(raw_tag)
            continue

        for tag in new_tags:
            if tag not in data.tags:
                data.tags.append(tag)
        for topic in new_topics:
            if topic not in data.topics:
                data.topics.append(topic)
    data.raw_tags = unknown_raw_tags