Coverage for src/wiktextract/extractor/de/tags.py: 81%
37 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from .models import WordEntry
# Sense tags
# https://de.wiktionary.org/wiki/Vorlage:K
# https://de.wiktionary.org/wiki/Vorlage:K/Abk
#
# Maps a German context label (or its abbreviation) to the English tag, or
# list of tags, that replaces it. Commented-out keys are labels that are
# deliberately left untranslated.
K_TEMPLATE_TAGS = {
    "Abl.": "ablative",
    "Ablativ": "ablative",
    "abw.": "derogatory",
    "abwertend": "derogatory",
    "AE": "US",
    "AmE": "US",
    "adv.": "adverbial",
    "Akkusativ": "accusative",
    "alemann.": "Alemannic",
    "alemannisch": "Alemannic",
    "allg.": "general",
    "allgemein": "general",
    "alltagsspr.": "colloquial",
    "amtsspr.": "officialese",
    # "ansonsten": "otherwise",  # combined with other text
    "attr.": "attributive",
    # "auch": "also",
    "bair.": "Bavarian",
    "bairisch": "Bavarian",
    "bar.": "Bavarian",
    "direktional": "directional",
    "BE": "British",
    "BrE": "British",
    "Bedva.": "outdated",
    "Bedvatd.": "outdated",
    "besonders": "especially",
    "veraltende Bedeutung": "outdated",
    # "bei": "",
    # "bes.": "especially",
    # "besonders": "especially",
    # "beziehungsweise": "",
    # "bzw.": "",
    # "bildungsspr.": "",
    # "bis": "",
    # "bisweilen": "",
    # "das": "",
    "Dativ": "dative",
    # "DDR": "",
    "Deutschland": "Germany",
    # "der": "",
    "dichter.": "poetic",
    "dichterisch": "poetic",
    # "die": "",
    "Dim.": "diminutive",
    "Dimin.": "diminutive",
    "Diminutiv": "diminutive",
    # "eher": "",
    "erzg.": "Erzgebirgisch",
    "erzgeb.": "Erzgebirgisch",
    "erzgebirgisch": "Erzgebirgisch",
    "euph.": "euphemistic",
    "fachspr.": "jargon",
    "fachsprachlich": "jargon",
    # Output tags are English; this value used to be the German "familiär".
    "fam.": "familiar",
    "fig": "figurative",
    "fig.": "figurative",
    # "früher": "",
    # "gegenwartslateinisch": "",
    # Same tag as the spelled-out label "gehoben" in OTHER_TAGS; this value
    # used to be the untranslated "gehoben".
    "geh.": "literary",
    "Genitiv": "genitive",
    "gsm": "Swiss German",
    "häufig": "often",
    "haben": "auxiliary",
    "hebben": "auxiliary",
    "hauptsächlich": "primarily",
    "hist.": "historical",
    "ieS": "narrowly",
    "i.e.S.": "narrowly",
    "i. e. S.": "narrowly",
    # "im": "",
    # "in": "",
    # "in Bezug auf": "relational",
    "indekl.": "indeclinable",
    # "insbes.": "",
    "Instrumental": "instrumental",
    "intrans.": "intransitive",
    "intransitiv": "intransitive",
    # "iPl": "in plural",
    "iron.": "ironic",
    # "iwS": "",
    # "jugendspr.": "",
    "kinderspr.": "childish",
    "kirchenlateinisch": "Church Latin",
    "klasslat.": "Classical Latin",
    "klassischlateinisch": "Classical Latin",
    "kPl.": "no-plural",
    "kein Plural": "no-plural",
    "kSg.": "no-singulative",
    "kSt.": "no-comparative",
    "kurz für": "short-form",
    "landsch.": "regional",
    "landschaftlich": "regional",
    "lautm.": "onomatopoeic",
    "lokal": "regional",
    "Ling.": "linguistics",
    "mA": "accusative",
    "md.": "Central German",
    "mdal.": "dialectal",
    "Med.": "medicine",  # topic
    # "meist": "mostly",
    # "meistens": "mostly",
    "metaphor.": "metaphoric",
    "meton.": "metonymically",
    "mG": "genitive",
    "mitteld.": "Central German",
    "mit Dativ": "with-dative",
    "mit Akkusativ": "with-accusative",
    # "mitunter": "",
    "mlat.": "Medieval Latin",
    "mittellateinisch": "Medieval Latin",
    "mundartl.": "dialectal",
    "nDu.": "only-dual",
    "nigr.": "Niger",
    "nigrisch": "Niger",
    "nkLat.": "post-Classical Latin",
    "nachklassischlateinisch": "post-Classical Latin",
    "nlat.": "New Latin",
    "neulateinisch": "New Latin",
    "nordd.": "North German",
    "norddeutsch": "North German",
    "nordwestd.": "Northwestern Germany",
    "nPl.": "plural-only",
    "Österreich": "Austrian German",
    "örtlich": "regional",
    "österr.": "Austrian German",
    "österreichisch": "Austrian German",
    "ostfränkisch": "East Franconian German",
    "pej.": "pejorative",
    "personifizierend": "person",
    "poet.": "poetic",
    "PräpmG": "genitive prepositional",
    "PmG": "genitive prepositional",
    "reg.": "regional",
    "refl.": "reflexive",
    "reflexiv": "reflexive",
    # "respektive": "",
    "sal.": "casual",
    "salopp": "casual",
    "scherzh.": "jocular",
    "schriftspr.": "literary",
    # "schülerspr.": "",
    "schwäb.": "Swabian",
    "schwäbisch": "Swabian",
    "Schweiz": "Swiss Standard German",
    "schweiz.": "Swiss Standard German",
    "schweizerisch": "Swiss Standard German",
    "Schweizerdeutsch": "Swiss German",
    "schweizerdeutsch": "Swiss German",
    # "seemannsspr.": "",
    # NOTE(review): "haben"/"hebben" map to "auxiliary" but "sein" maps to
    # "auxiliary verb" -- confirm whether the difference is intentional.
    "sein": "auxiliary verb",
    # "sehr": "",  # very
    "selten": "rare",
    "seltener": "rare",
    "seltener auch": "rare",
    "soldatenspr.": ["military", "slang"],
    # "sonderspr.": "",
    # "sonst": "",
    # "sowie": "",
    "spätlat.": "Late Latin",
    "spätlateinisch": "Late Latin",
    # "später": "",
    "speziell": "special",
    "südd.": "South German",
    "süddt.": "South German",
    # "techn.": "",
    # "teils": "",
    # "teilweise": "",
    "temporal": "temporal",
    "tlwva.": "outdated",
    "tlwvatd.": "outdated",
    "trans.": "transitive",
    "transitiv": "transitive",
    # "über": "",
    # "überwiegend": "mostly",
    "übertr.": "figurative",
    "übertragen": "figurative",
    "ugs.": "colloquial",
    "umgangssprachlich": "colloquial",
    # "und": "",
    "ungebr.": "uncommon",
    "unpers.": "impersonal",
    "unpersönlich": "impersonal",
    # "ursprünglich": "",
    "va.": "outdated",
    "vatd.": "outdated",
    "veraltend": "outdated",
    # "verh.": "",
    "volkst.": "popular",
    # "von": "",
    # "vor allem": "",
    # "vor allem in": "",
    "vul.": "vulgar",
    "vulg.": "vulgar",
    "vlat.": ["vulgar", "Latin"],
    "vulgärlat": ["vulgar", "Latin"],
    "vulgärlateinisch": ["vulgar", "Latin"],
    "wien.": "Vienna",
    "wienerisch": "Vienna",
    "Wpräp": "prepositional",
    # "z. B.": "",
    # "z. T.": "",
    # "zijn": "",
    # "zum Beispiel": "",
    # "zum Teil": "",
    # "zumeist": "",
    "Kardinalzahl": "cardinal",
    "Sammelbegriff": "collective",
    "Fachsprache": "jargon",
    "formale Sprachen": "formal",
    "Programmiersprachen": "programming",
    "Rechnerarchitektur": "programming",
    "Geografie": "geography",
    "Geometrie": "geometry",
    "Finanzwesen": "finance",
    "juristisch": "law",
    "Physik": "physics",
    "abstrakt": "abstract",
    "gegenständlich": "objective",
    "personifiziert": "personal",
    "kirchlich": "Ecclesiastical",
}
# Grammatical gender labels: single-letter markers and the spelled-out
# column headers used by adjective declension pages.
GENDER_TAGS = {
    "n": "neuter",
    "m": "masculine",
    "f": "feminine",
    # Vorlage:Deklinationsseite Adjektiv
    "Maskulinum": "masculine",
    "Femininum": "feminine",
    "Neutrum": "neuter",
}
# Grammatical number labels.
NUMBER_TAGS = {
    # Vorlage:Deutsch Substantiv Übersicht
    "Singular": "singular",
    "Plural": "plural",
    "Pl.": "plural",
    "Dual": "dual",
}
# Grammatical case labels; combined "case + number" headers map to a list
# of tags.
CASE_TAGS = {
    # Vorlage:Deutsch Substantiv Übersicht
    "Nominativ": "nominative",
    "Genitiv": "genitive",
    "Dativ": "dative",
    "Akkusativ": "accusative",
    # Template:Polnisch Substantiv Übersicht
    "Lokativ": "locative",
    "Vokativ": "vocative",
    "Dativ Singular": ["dative", "singular"],
    "Genitiv Singular": ["genitive", "singular"],
    # Template:Finnisch Substantiv Übersicht
    "Inessiv": "inessive",
    "Elativ": "elative",
    "Illativ": "illative",
    "Adessiv": "adessive",
    "Allativ": "allative",
    "Essiv": "essive",
    "Translativ": "translative",
    "Abessiv": "abessive",
    "Instruktiv": "instructive",
    "Komitativ": "comitative",
}
# Degrees of comparison for adjectives.
COMPARISON_TAGS = {
    # Vorlage:Deutsch Adjektiv Übersicht
    # Vorlage:Deklinationsseite Adjektiv
    "Positiv": "positive",
    "Komparativ": "comparative",
    "Superlativ": "superlative",
}
# Adjective declension classes.
DECLENSION_TAGS = {
    # https://en.wikipedia.org/wiki/German_declension
    # Vorlage:Deklinationsseite Adjektiv
    "Starke Deklination": "strong",
    "Schwache Deklination": "weak",
    "Gemischte Deklination": "mixed",
}
# Miscellaneous labels that do not belong to one grammatical category.
OTHER_TAGS = {
    # Vorlage:Deklinationsseite Adjektiv
    "Prädikativ": "predicative",
    "erweiterte": "extended",
    "Höflichkeitsform": "honorific",
    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "nichterweitert": "not-extended",
    "erweitert": "extended",
    "zeitlich": "temporal",
    "indeklinabel": "indeclinable",
    "östlich": "Eastern",
    "westlich": "Western",
    "britisch": "British",
    "Substantive": "noun",
    "Substantiv": "noun",
    "historisch": "historical",
    "wörtlich": "literally",
    "Adjektiv": "adjective",
    "gehoben": "literary",
    "Nebenform von": "variant",
    "Verben": "verb",
    "regional": "regional",
    # Vorlage:CH&LI
    "Schweiz und Liechtenstein": ["Switzerland", "Liechtenstein"],
    "Switzerland and Liechtenstein": ["Switzerland", "Liechtenstein"],
    "traditionell": "traditional",
    "vereinfachte Schreibweise": "simplified",
}
# Verb tense and aspect labels.
TENSE_TAGS = {
    # Vorlage:Deutsch Verb Übersicht
    "Präsens": "present",
    "Präteritum": "past",
    "Perfekt": "perfect",
    "Futur I": "future-i",
    "Futur II": "future-ii",
    "Plusquamperfekt": "pluperfect",
    # Template:Kroatisch Verb Übersicht
    "perfektiv": "perfective",
    "imperfektiv": "imperfective",
    "Imperfekt": "imperfect",
}
# Verb mood labels.
MOOD_TAGS = {
    # Vorlage:Deutsch Verb Übersicht
    # Vorlage:Deutsch Verb regelmäßig
    "Konjunktiv I": "subjunctive-i",
    "Konjunktiv II": "subjunctive-ii",
    "Imperativ": "imperative",
    "Imperative": "imperative",
    "Indikativ": "indicative",
}
# Non-finite verb forms and related verb labels.
VERB_FORM_TAGS = {
    # Vorlage:Deutsch Verb Übersicht
    "Partizip II": "participle-2",
    "Hilfsverb": "auxiliary",
    "Infinitive": "infinitive",
    "Infinitiv": "infinitive",
    "Partizipien": "participle",
    "unregelmäßig": "irregular",
    "Aorist": "aorist",
    # Template:Dänisch Verb Übersicht
    "Partizip Perfekt": ["participle", "perfect"],
}
# Grammatical voice labels.
VOICE_TAGS = {
    # Vorlage:Deutsch Verb unregelmäßig
    "Aktiv": "active",
    "Vorgangspassiv": "processual-passive",
    "Zustandspassiv": "statal-passive",
    "Passiv": "passive",
    "Gerundivum": "gerundive",
    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "Zustandsreflexiv": "statal-reflexive",
}
# Person/number headers; each maps to a [person, number] pair of tags.
PERSON_TAGS = {
    # Vorlage:Deutsch Verb unregelmäßig
    "1. Person Singular": ["first-person", "singular"],
    "1. Person Plural": ["first-person", "plural"],
    "2. Person Singular": ["second-person", "singular"],
    "2. Person Plural": ["second-person", "plural"],
    "3. Person Singular": ["third-person", "singular"],
    "3. Person Plural": ["third-person", "plural"],
    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "Sg. 1. Pers.": ["first-person", "singular"],
    "Pl. 1. Pers.": ["first-person", "plural"],
    "Sg. 2. Pers.": ["second-person", "singular"],
    "Pl. 2. Pers.": ["second-person", "plural"],
    "Sg. 3. Pers.": ["third-person", "singular"],
    "Pl. 3. Pers.": ["third-person", "plural"],
}
# Labels that appear in inflection-table headers and notes.
INFLECTION_TABLE_TAGS = {
    # Vorlage:Deutsch Verb regelmäßig
    "ungebräuchlich": "uncommon",
    "veraltet": "archaic",
    # Vorlage:Deutsch Verb schwach trennbar reflexiv
    "Nebensatzkonjugation": "subordinate-clause",
    "Hauptsatzkonjugation": "main-clause",
    "regelmäßig": "regular",
    "untrennbar": "inseparable",
    "trennbar": "separable",
    # Vorlage:Deutsch Nachname Übersicht
    "Singular m": ["singular", "masculine"],
    "Singular f": ["singular", "feminine"],
    # Vorlage:Deklinationsseite Numerale
    "bestimmt": "definite",
    "unbestimmt": "indefinite",
    "mit Possessivpronomen": ["possessive", "pronoun"],
    # Template:Kroatisch Verb Übersicht
    "Partizip Präteritum Aktiv": ["past", "participle", "active"],
}
# Single lookup table used when translating raw tags: the union of all tag
# tables in this module. Tables are unpacked in order, so when a key occurs
# in more than one table the value from the later table is the one kept.
GRAMMATICAL_TAGS = {
    **K_TEMPLATE_TAGS,
    **GENDER_TAGS,
    **NUMBER_TAGS,
    **CASE_TAGS,
    **COMPARISON_TAGS,
    **DECLENSION_TAGS,
    **OTHER_TAGS,
    **TENSE_TAGS,
    **MOOD_TAGS,
    **VERB_FORM_TAGS,
    **VOICE_TAGS,
    **PERSON_TAGS,
    **INFLECTION_TABLE_TAGS,
}
# Maps a German subject-area label to an English topic. A value is either
# the topic string, or a dict {"topic": ..., "tag": ...} when the label also
# implies a tag.
K_TEMPLATE_TOPICS = {
    "Biologie": "biology",
    "Linguistik": "linguistics",
    "Wortbildung": "morphology",
    "Behörde": "government",
    "Astronomie": "astronomy",
    "Immobilienbranche": "real-estate",
    "Kunst": "arts",
    "Informatik": "computing",
    "Nautik": "nautical",
    "Sport": "sports",
    "Schuhwerk": "footwear",
    "Textilien": "textiles",
    "Zahlungsmittel": "payment-method",
    "Ökologie": "ecology",
    "Internet": "Internet",
    "Religion": "religion",
    "Militärsprache": "military",
    "Systematik": "systematics",
    "Zoologie": "zoology",
    "Seefahrt": "seafaring",
    "Soldatensprache": {"topic": "military", "tag": "slang"},
    "Botanik": "botany",
    "Marine": "navy",
    "Informationstechnologie": "computing",
    "Betriebswirtschaftslehre": "business",
    "Recht": "law",
    "Elektronik": "electronics",
    "Emotion": "emotion",
    "Mathematik": "mathematics",
    # NOTE(review): the capital "L" differs from the other hyphenated topics
    # ("real-estate", "card-games") -- confirm before normalising.
    "Bürgerliches Recht": "civil-Law",
    "Militär": "military",
    "Politik": "politics",
    "Werkzeug": "tools",
    "Medizin": "medicine",
    "Ornithologie": "ornithology",
    "Technik": "technology",
    "Waffentechnik": "weaponry",
    "Anatomie": "anatomy",
    "Fußball": "football",
    "Kartenspiel": "card-games",
    "Theoretische Informatik": "computing",
    "militärisch": "military",
    "Taxonomie": "taxonomy",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Translate the German raw tags on *data* into English tags and topics.

    Each entry of ``data.raw_tags`` is looked up in ``GRAMMATICAL_TAGS``
    first. If it is not found there and ``data`` has a ``topics`` attribute,
    it is looked up in ``K_TEMPLATE_TOPICS``. Translations are appended to
    ``data.tags`` / ``data.topics`` unless already present. Raw tags that
    match neither table are kept, in their original order, and written back
    to ``data.raw_tags``.

    The object is modified in place; nothing is returned.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in GRAMMATICAL_TAGS:
            tag = GRAMMATICAL_TAGS[raw_tag]
            # A table value is either a single tag or a list of tags.
            for t in [tag] if isinstance(tag, str) else tag:
                if t not in data.tags:
                    data.tags.append(t)
        elif raw_tag in K_TEMPLATE_TOPICS and hasattr(data, "topics"):
            topic = K_TEMPLATE_TOPICS[raw_tag]
            if isinstance(topic, dict):
                # The label carries a topic plus a companion tag. The tag is
                # added independently of the topic: previously it was skipped
                # whenever the topic was already present, so e.g. "slang" was
                # lost if "military" had been added by an earlier raw tag.
                if topic["tag"] not in data.tags:
                    data.tags.append(topic["tag"])
                topic = topic["topic"]
            if topic not in data.topics:
                data.topics.append(topic)
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated