Coverage for src/wiktextract/extractor/de/tags.py: 77%
34 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1from .models import WordEntry
# Sense tags
# https://de.wiktionary.org/wiki/Vorlage:K
# https://de.wiktionary.org/wiki/Vorlage:K/Abk
#
# Maps a raw German label or abbreviation to its tag. A value is either a
# single tag string or a list of tag strings. Entries that are commented out
# are labels that deliberately have no mapping; they are kept so the list of
# known labels stays visible.
K_TEMPLATE_TAGS = {
    "Abl.": "ablative",
    "Ablativ": "ablative",
    "abw.": "derogatory",
    "abwertend": "derogatory",
    "AE": "US",
    "AmE": "US",
    "adv.": "adverbial",
    "Akkusativ": "accusative",
    "alemann.": "Alemannic",
    "alemannisch": "Alemannic",
    "allg.": "general",
    "allgemein": "general",
    "alltagsspr.": "colloquial",
    "amtsspr.": "officialese",
    # "ansonsten": "otherwise", # combined with other text
    "attr.": "attributive",
    # "auch": "also",
    "bair.": "Bavarian",
    "bairisch": "Bavarian",
    "bar.": "Bavarian",
    "BE": "British",
    "BrE": "British",
    "Bedva.": "outdated",
    "Bedvatd.": "outdated",
    "veraltende Bedeutung": "outdated",
    # "bei": "",
    # "bes.": "especially",
    # "besonders": "especially",
    # "beziehungsweise": "",
    # "bzw.": "",
    # "bildungsspr.": "",
    # "bis": "",
    # "bisweilen": "",
    # "das": "",
    "Dativ": "dative",
    # "DDR": "",
    # "der": "",
    "dichter.": "poetic",
    # "die": "",
    "Dim.": "diminutive",
    "Dimin.": "diminutive",
    "Diminutiv": "diminutive",
    # "eher": "",
    "erzg.": "Erzgebirgisch",
    "erzgeb.": "Erzgebirgisch",
    "erzgebirgisch": "Erzgebirgisch",
    "euph.": "euphemistic",
    "fachspr.": "jargon",
    "fachsprachlich": "jargon",
    # NOTE(review): value is still German while the other tags are English;
    # confirm whether "familiar" was intended.
    "fam.": "familiär",
    "fig": "figurative",
    "fig.": "figurative",
    # "früher": "",
    # "gegenwartslateinisch": "",
    # NOTE(review): value is still German while the other tags are English;
    # confirm the intended English tag.
    "geh.": "gehoben",
    "Genitiv": "genitive",
    "gsm": "Swiss German",
    "häufig": "often",
    "haben": "auxiliary",
    "hebben": "auxiliary",
    "hauptsächlich": "primarily",
    "hist.": "historical",
    "ieS": "narrowly",
    "i.e.S.": "narrowly",
    "i. e. S.": "narrowly",
    # "im": "",
    # "in": "",
    # "in Bezug auf": "relational",
    "indekl.": "indeclinable",
    # "insbes.": "",
    "Instrumental": "instrumental",
    "intrans.": "intransitive",
    "intransitiv": "intransitive",
    # "iPl": "in plural",
    "iron.": "ironic",
    # "iwS": "",
    # "jugendspr.": "",
    "kinderspr.": "childish",
    "kirchenlateinisch": "Church Latin",
    "klasslat.": "Classical Latin",
    "klassischlateinisch": "Classical Latin",
    "kPl.": "no-plural",
    "kein Plural": "no-plural",
    "kSg.": "no-singulative",
    "kSt.": "no-comparative",
    "landsch.": "regional",
    "lautm.": "onomatopoeic",
    "Ling.": "linguistics",
    "mA": "accusative",
    "md.": "Central German",
    "mdal.": "dialectal",
    "Med.": "medicine",  # topic
    # "meist": "mostly",
    # "meistens": "mostly",
    "metaphor.": "metaphoric",
    "meton.": "metonymically",
    "mG": "genitive",
    "mitteld.": "Central German",
    # "mitunter": "",
    "mlat.": "Medieval Latin",
    "mittellateinisch": "Medieval Latin",
    "mundartl.": "dialectal",
    "nDu.": "only-dual",
    "nigr.": "Niger",
    "nigrisch": "Niger",
    "nkLat.": "post-Classical Latin",
    "nachklassischlateinisch": "post-Classical Latin",
    "nlat.": "New Latin",
    "neulateinisch": "New Latin",
    "nordd.": "North German",
    "norddeutsch": "North German",
    "nordwestd.": "Northwestern Germany",
    "nPl.": "plural-only",
    "Österreich": "Austrian German",
    "österr.": "Austrian German",
    "österreichisch": "Austrian German",
    "ostfränkisch": "East Franconian German",
    "pej.": "pejorative",
    "poet.": "poetic",
    "PräpmG": "genitive prepositional",
    "PmG": "genitive prepositional",
    "reg.": "regional",
    "refl.": "reflexive",
    "reflexiv": "reflexive",
    # "respektive": "",
    "sal.": "casual",
    "salopp": "casual",
    "scherzh.": "jocular",
    "schriftspr.": "literary",
    # "schülerspr.": "",
    "schwäb.": "Swabian",
    "schwäbisch": "Swabian",
    "Schweiz": "Swiss Standard German",
    "schweiz.": "Swiss Standard German",
    "schweizerisch": "Swiss Standard German",
    "Schweizerdeutsch": "Swiss German",
    "schweizerdeutsch": "Swiss German",
    # "seemannsspr.": "",
    # NOTE(review): "haben"/"hebben" above map to "auxiliary"; confirm
    # whether "auxiliary verb" here is intentional.
    "sein": "auxiliary verb",
    # "sehr": "", # very
    "selten": "rare",
    "seltener": "rare",
    "seltener auch": "rare",
    "soldatenspr.": ["military", "slang"],
    # "sonderspr.": "",
    # "sonst": "",
    # "sowie": "",
    "spätlat.": "Late Latin",
    "spätlateinisch": "Late Latin",
    # "später": "",
    "speziell": "special",
    "südd.": "South German",
    "süddt.": "South German",
    # "techn.": "",
    # "teils": "",
    # "teilweise": "",
    "tlwva.": "outdated",
    "tlwvatd.": "outdated",
    "trans.": "transitive",
    "transitiv": "transitive",
    # "über": "",
    # "überwiegend": "mostly",
    "übertr.": "figurative",
    "übertragen": "figurative",
    "ugs.": "colloquial",
    "umgangssprachlich": "colloquial",
    # "und": "",
    "ungebr.": "uncommon",
    "unpers.": "impersonal",
    "unpersönlich": "impersonal",
    # "ursprünglich": "",
    "va.": "outdated",
    "vatd.": "outdated",
    "veraltend": "outdated",
    # "verh.": "",
    "volkst.": "popular",
    # "von": "",
    # "vor allem": "",
    # "vor allem in": "",
    "vul.": "vulgar",
    "vulg.": "vulgar",
    "vlat.": ["vulgar", "Latin"],
    "vulgärlat": ["vulgar", "Latin"],
    "vulgärlateinisch": ["vulgar", "Latin"],
    "wien.": "Vienna",
    "wienerisch": "Vienna",
    # "Wpräp": "",
    # "z. B.": "",
    # "z. T.": "",
    # "zijn": "",
    # "zum Beispiel": "",
    # "zum Teil": "",
    # "zumeist": "",
    "Kardinalzahl": "cardinal",
    "Sammelbegriff": "collective",
    "Fachsprache": "jargon",
    "formale Sprachen": "formal",
    "Programmiersprachen": "programming",
    "Rechnerarchitektur": "programming",
    "Geografie": "geography",
    "Geometrie": "geometry",
    "Finanzwesen": "finance",
    "juristisch": "law",
    "Physik": "physics",
}
# Grammatical gender labels: single-letter abbreviations and the full names
# used by the adjective declension page template.
GENDER_TAGS = {
    "n": "neuter",
    "m": "masculine",
    "f": "feminine",
    # Vorlage:Deklinationsseite Adjektiv
    "Maskulinum": "masculine",
    "Femininum": "feminine",
    "Neutrum": "neuter",
}
# Grammatical number labels.
NUMBER_TAGS = {
    # Vorlage:Deutsch Substantiv Übersicht
    "Singular": "singular",
    "Plural": "plural",
}
# Grammatical case labels.
CASE_TAGS = {
    # Vorlage:Deutsch Substantiv Übersicht
    "Nominativ": "nominative",
    "Genitiv": "genitive",
    "Dativ": "dative",
    "Akkusativ": "accusative",
}
# Adjective degree-of-comparison labels.
COMPARISON_TAGS = {
    # Vorlage:Deutsch Adjektiv Übersicht
    # Vorlage:Deklinationsseite Adjektiv
    "Positiv": "positive",
    "Komparativ": "comparative",
    "Superlativ": "superlative",
}
# Adjective declension class labels.
DECLENSION_TAGS = {
    # https://en.wikipedia.org/wiki/German_declension
    # Vorlage:Deklinationsseite Adjektiv
    "Starke Deklination": "strong",
    "Schwache Deklination": "weak",
    "Gemischte Deklination": "mixed",
}
# Miscellaneous inflection-table labels that fit none of the other groups.
OTHER_TAGS = {
    # Vorlage:Deklinationsseite Adjektiv
    "Prädikativ": "predicative",
    "erweiterte": "extended",
    "Höflichkeitsform": "honorific",
    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "nichterweitert": "not-extended",
    "erweitert": "extended",
}
# Verb tense labels.
TENSE_TAGS = {
    # Vorlage:Deutsch Verb Übersicht
    "Präsens": "present",
    "Präteritum": "past",
    "Perfekt": "perfect",
    "Futur I": "future-i",
    "Futur II": "future-ii",
    "Plusquamperfekt": "pluperfect",
}
# Verb mood labels. Both "Imperativ" and "Imperative" are accepted.
MOOD_TAGS = {
    # Vorlage:Deutsch Verb Übersicht
    # Vorlage:Deutsch Verb regelmäßig
    "Konjunktiv I": "subjunctive-i",
    "Konjunktiv II": "subjunctive-ii",
    "Imperativ": "imperative",
    "Imperative": "imperative",
    "Indikativ": "indicative",
}
# Non-finite verb form labels. Both "Infinitiv" and "Infinitive" are accepted.
VERB_FORM_TAGS = {
    # Vorlage:Deutsch Verb Übersicht
    "Partizip II": "participle-2",
    "Hilfsverb": "auxiliary",
    "Infinitive": "infinitive",
    "Infinitiv": "infinitive",
    "Partizipien": "participle",
}
# Verb voice labels.
VOICE_TAGS = {
    # Vorlage:Deutsch Verb unregelmäßig
    "Aktiv": "active",
    "Vorgangspassiv": "processual passive",
    "Zustandspassiv": "statal passive",
    "Passiv": "passive",
    "Gerundivum": "gerundive",
    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "Zustandsreflexiv": "statal reflexive",
}
# Person/number labels. Each value is a two-element list: the person tag
# followed by the number tag.
PERSON_TAGS = {
    # Vorlage:Deutsch Verb unregelmäßig
    "1. Person Singular": ["first-person", "singular"],
    "1. Person Plural": ["first-person", "plural"],
    "2. Person Singular": ["second-person", "singular"],
    "2. Person Plural": ["second-person", "plural"],
    "3. Person Singular": ["third-person", "singular"],
    "3. Person Plural": ["third-person", "plural"],
    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "Sg. 1. Pers.": ["first-person", "singular"],
    "Pl. 1. Pers.": ["first-person", "plural"],
    "Sg. 2. Pers.": ["second-person", "singular"],
    "Pl. 2. Pers.": ["second-person", "plural"],
    "Sg. 3. Pers.": ["third-person", "singular"],
    "Pl. 3. Pers.": ["third-person", "plural"],
}
# Usage and clause-type labels that appear in verb inflection tables.
INFLECTION_TABLE_TAGS = {
    # Vorlage:Deutsch Verb regelmäßig
    "ungebräuchlich": "uncommon",
    "veraltet": "archaic",
    # Vorlage:Deutsch Verb schwach trennbar reflexiv
    "Nebensatzkonjugation": "subordinate-clause",
    "Hauptsatzkonjugation": "main-clause",
}
# Single lookup table merging every tag dictionary defined in this module.
# With dict unpacking, a key present in several sources takes its value from
# the last one listed. The overlapping keys ("Genitiv", "Dativ", "Akkusativ"
# occur in both K_TEMPLATE_TAGS and CASE_TAGS) carry identical values, so the
# order does not currently change any result.
GRAMMATICAL_TAGS = {
    **K_TEMPLATE_TAGS,
    **GENDER_TAGS,
    **NUMBER_TAGS,
    **CASE_TAGS,
    **COMPARISON_TAGS,
    **DECLENSION_TAGS,
    **OTHER_TAGS,
    **TENSE_TAGS,
    **MOOD_TAGS,
    **VERB_FORM_TAGS,
    **VOICE_TAGS,
    **PERSON_TAGS,
    **INFLECTION_TABLE_TAGS,
}
# Maps a raw German subject label to a topic. A value is either a topic
# string or a dict with a "topic" key and a "tag" key, for labels that
# contribute both a topic and a tag.
K_TEMPLATE_TOPICS = {
    "Biologie": "biology",
    "Linguistik": "linguistics",
    "Wortbildung": "morphology",
    "Behörde": "government",
    "Astronomie": "astronomy",
    "Immobilienbranche": "real-estate",
    "Kunst": "arts",
    "Informatik": "computing",
    "Nautik": "nautical",
    "Sport": "sports",
    "Schuhwerk": "footwear",
    "Textilien": "textiles",
    "Zahlungsmittel": "payment-method",
    "Ökologie": "ecology",
    "Internet": "Internet",
    "Religion": "religion",
    "Militärsprache": "military",
    "Systematik": "systematics",
    "Zoologie": "zoology",
    "Seefahrt": "seafaring",
    "Soldatensprache": {"topic": "military", "tag": "slang"},
    "Botanik": "botany",
    "Marine": "navy",
    "Informationstechnologie": "computing",
    "Betriebswirtschaftslehre": "business",
    "Recht": "law",
    "Elektronik": "electronics",
    "Emotion": "emotion",
    "Mathematik": "mathematics",
    # NOTE(review): capital "L" differs from the lowercase style of the other
    # hyphenated topics; confirm whether "civil-law" was intended.
    "Bürgerliches Recht": "civil-Law",
    "Militär": "military",
    "Politik": "politics",
    "Werkzeug": "tools",
    "Medizin": "medicine",
    "Ornithologie": "ornithology",
    "Technik": "technology",
    "Waffentechnik": "weaponry",
    "Anatomie": "anatomy",
    "Fußball": "football",
    "Kartenspiel": "card-games",
    "Theoretische Informatik": "computing",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Move recognised raw tags on ``data`` into its tags and topics.

    Each entry of ``data.raw_tags`` is handled as follows:

    * If it is a key of ``GRAMMATICAL_TAGS``, the mapped tag (or every tag
      of a mapped list) is appended to ``data.tags``.
    * Otherwise, if it is a key of ``K_TEMPLATE_TOPICS`` and ``data`` has a
      ``topics`` attribute, the mapped topic is appended to ``data.topics``.
      A dict-valued entry contributes its ``"topic"`` to ``data.topics``
      and its ``"tag"`` to ``data.tags``.
    * Otherwise it is kept as a raw tag. This includes topic labels when
      ``data`` has no ``topics`` attribute.

    ``data`` is modified in place: ``data.raw_tags`` is replaced by a new
    list holding only the entries that were kept, in their original order.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in GRAMMATICAL_TAGS:
            tag = GRAMMATICAL_TAGS[raw_tag]
            if isinstance(tag, str):
                data.tags.append(tag)
            elif isinstance(tag, list):
                data.tags.extend(tag)
        elif raw_tag in K_TEMPLATE_TOPICS and hasattr(data, "topics"):
            topic = K_TEMPLATE_TOPICS[raw_tag]
            if isinstance(topic, str):
                data.topics.append(topic)
            elif isinstance(topic, dict):
                # A dict entry supplies a topic and a tag. Skip a missing
                # key rather than appending None to the list.
                topic_name = topic.get("topic")
                if topic_name is not None:
                    data.topics.append(topic_name)
                extra_tag = topic.get("tag")
                if extra_tag is not None:
                    data.tags.append(extra_tag)
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated