Coverage for src/wiktextract/extractor/de/tags.py: 77%
34 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from .models import WordEntry
# Sense tags
# https://de.wiktionary.org/wiki/Vorlage:K
# https://de.wiktionary.org/wiki/Vorlage:K/Abk
#
# Maps a German label (abbreviation or full word) to an English tag.
# A value is either a single tag string or a list of tag strings.
# Commented-out entries are labels that have no translation yet; they are
# kept here as a checklist and are not part of the mapping.
K_TEMPLATE_TAGS = {
    "Abl.": "ablative",
    "Ablativ": "ablative",
    "abw.": "derogatory",
    "abwertend": "derogatory",
    "AE": "US",
    "AmE": "US",
    "adv.": "adverbial",
    "Akkusativ": "accusative",
    "alemann.": "Alemannic",
    "alemannisch": "Alemannic",
    "allg.": "general",
    "allgemein": "general",
    "alltagsspr.": "colloquial",
    "amtsspr.": "officialese",
    # "ansonsten": "otherwise",  # combined with other text
    "attr.": "attributive",
    # "auch": "also",
    "bair.": "Bavarian",
    "bairisch": "Bavarian",
    "bar.": "Bavarian",
    "BE": "British",
    "BrE": "British",
    "Bedva.": "outdated",
    "Bedvatd.": "outdated",
    "veraltende Bedeutung": "outdated",
    # "bei": "",
    # "bes.": "especially",
    # "besonders": "especially",
    # "beziehungsweise": "",
    # "bzw.": "",
    # "bildungsspr.": "",
    # "bis": "",
    # "bisweilen": "",
    # "das": "",
    "Dativ": "dative",
    # "DDR": "",
    "Deutschland": "Germany",
    # "der": "",
    "dichter.": "poetic",
    # "die": "",
    "Dim.": "diminutive",
    "Dimin.": "diminutive",
    "Diminutiv": "diminutive",
    # "eher": "",
    "erzg.": "Erzgebirgisch",
    "erzgeb.": "Erzgebirgisch",
    "erzgebirgisch": "Erzgebirgisch",
    "euph.": "euphemistic",
    "fachspr.": "jargon",
    "fachsprachlich": "jargon",
    # Was the untranslated German word "familiär"; every other value in
    # this table is an English tag.
    "fam.": "familiar",
    "fig": "figurative",
    "fig.": "figurative",
    # "früher": "",
    # "gegenwartslateinisch": "",
    # NOTE(review): value is still the German word; choose an English tag
    # (e.g. an "elevated"/"formal" register tag) and confirm with consumers.
    "geh.": "gehoben",
    "Genitiv": "genitive",
    "gsm": "Swiss German",
    "häufig": "often",
    "haben": "auxiliary",
    "hebben": "auxiliary",
    "hauptsächlich": "primarily",
    "hist.": "historical",
    "ieS": "narrowly",
    "i.e.S.": "narrowly",
    "i. e. S.": "narrowly",
    # "im": "",
    # "in": "",
    # "in Bezug auf": "relational",
    "indekl.": "indeclinable",
    # "insbes.": "",
    "Instrumental": "instrumental",
    "intrans.": "intransitive",
    "intransitiv": "intransitive",
    # "iPl": "in plural",
    "iron.": "ironic",
    # "iwS": "",
    # "jugendspr.": "",
    "kinderspr.": "childish",
    "kirchenlateinisch": "Church Latin",
    "klasslat.": "Classical Latin",
    "klassischlateinisch": "Classical Latin",
    "kPl.": "no-plural",
    "kein Plural": "no-plural",
    "kSg.": "no-singulative",
    "kSt.": "no-comparative",
    "landsch.": "regional",
    "lautm.": "onomatopoeic",
    "Ling.": "linguistics",
    "mA": "accusative",
    "md.": "Central German",
    "mdal.": "dialectal",
    "Med.": "medicine",  # topic
    # "meist": "mostly",
    # "meistens": "mostly",
    "metaphor.": "metaphoric",
    "meton.": "metonymically",
    "mG": "genitive",
    "mitteld.": "Central German",
    # "mitunter": "",
    "mlat.": "Medieval Latin",
    "mittellateinisch": "Medieval Latin",
    "mundartl.": "dialectal",
    "nDu.": "only-dual",
    "nigr.": "Niger",
    "nigrisch": "Niger",
    "nkLat.": "post-Classical Latin",
    "nachklassischlateinisch": "post-Classical Latin",
    "nlat.": "New Latin",
    "neulateinisch": "New Latin",
    "nordd.": "North German",
    "norddeutsch": "North German",
    "nordwestd.": "Northwestern Germany",
    "nPl.": "plural-only",
    "Österreich": "Austrian German",
    "österr.": "Austrian German",
    "österreichisch": "Austrian German",
    "ostfränkisch": "East Franconian German",
    "pej.": "pejorative",
    "poet.": "poetic",
    "PräpmG": "genitive prepositional",
    "PmG": "genitive prepositional",
    "reg.": "regional",
    "refl.": "reflexive",
    "reflexiv": "reflexive",
    # "respektive": "",
    "sal.": "casual",
    "salopp": "casual",
    "scherzh.": "jocular",
    "schriftspr.": "literary",
    # "schülerspr.": "",
    "schwäb.": "Swabian",
    "schwäbisch": "Swabian",
    "Schweiz": "Swiss Standard German",
    "schweiz.": "Swiss Standard German",
    "schweizerisch": "Swiss Standard German",
    "Schweizerdeutsch": "Swiss German",
    "schweizerdeutsch": "Swiss German",
    # "seemannsspr.": "",
    # Same tag as "haben"/"hebben" (was "auxiliary verb").
    "sein": "auxiliary",
    # "sehr": "",  # very
    "selten": "rare",
    "seltener": "rare",
    "seltener auch": "rare",
    "soldatenspr.": ["military", "slang"],
    # "sonderspr.": "",
    # "sonst": "",
    # "sowie": "",
    "spätlat.": "Late Latin",
    "spätlateinisch": "Late Latin",
    # "später": "",
    "speziell": "special",
    "südd.": "South German",
    "süddt.": "South German",
    # "techn.": "",
    # "teils": "",
    # "teilweise": "",
    "tlwva.": "outdated",
    "tlwvatd.": "outdated",
    "trans.": "transitive",
    "transitiv": "transitive",
    # "über": "",
    # "überwiegend": "mostly",
    "übertr.": "figurative",
    "übertragen": "figurative",
    "ugs.": "colloquial",
    "umgangssprachlich": "colloquial",
    # "und": "",
    "ungebr.": "uncommon",
    "unpers.": "impersonal",
    "unpersönlich": "impersonal",
    # "ursprünglich": "",
    "va.": "outdated",
    "vatd.": "outdated",
    "veraltend": "outdated",
    # "verh.": "",
    "volkst.": "popular",
    # "von": "",
    # "vor allem": "",
    # "vor allem in": "",
    "vul.": "vulgar",
    "vulg.": "vulgar",
    # Vulgar Latin is a language variety, like "Late Latin" or
    # "Medieval Latin" above.  It was previously split into
    # ["vulgar", "Latin"], which attached the register tag "vulgar"
    # (used for "vul."/"vulg.") to ordinary Vulgar Latin senses.
    "vlat.": "Vulgar Latin",
    "vulgärlat": "Vulgar Latin",
    "vulgärlateinisch": "Vulgar Latin",
    "wien.": "Vienna",
    "wienerisch": "Vienna",
    # "Wpräp": "",
    # "z. B.": "",
    # "z. T.": "",
    # "zijn": "",
    # "zum Beispiel": "",
    # "zum Teil": "",
    # "zumeist": "",
    "Kardinalzahl": "cardinal",
    "Sammelbegriff": "collective",
    "Fachsprache": "jargon",
    "formale Sprachen": "formal",
    "Programmiersprachen": "programming",
    "Rechnerarchitektur": "programming",
    "Geografie": "geography",
    "Geometrie": "geometry",
    "Finanzwesen": "finance",
    "juristisch": "law",
    "Physik": "physics",
}
# Grammatical gender: one-letter codes plus the full German names.
GENDER_TAGS = {
    # one-letter gender codes
    "n": "neuter",
    "m": "masculine",
    "f": "feminine",
    # full names, from Vorlage:Deklinationsseite Adjektiv
    "Maskulinum": "masculine",
    "Femininum": "feminine",
    "Neutrum": "neuter",
}
# Grammatical number labels, from Vorlage:Deutsch Substantiv Übersicht.
NUMBER_TAGS = {
    "Singular": "singular",
    "Plural": "plural",
}
# The four German cases, from Vorlage:Deutsch Substantiv Übersicht.
CASE_TAGS = {
    "Nominativ": "nominative",
    "Genitiv": "genitive",
    "Dativ": "dative",
    "Akkusativ": "accusative",
}
# Degrees of comparison, from Vorlage:Deutsch Adjektiv Übersicht and
# Vorlage:Deklinationsseite Adjektiv.
COMPARISON_TAGS = {
    "Positiv": "positive",
    "Komparativ": "comparative",
    "Superlativ": "superlative",
}
# Adjective declension classes, from Vorlage:Deklinationsseite Adjektiv.
# Background: https://en.wikipedia.org/wiki/German_declension
DECLENSION_TAGS = {
    "Starke Deklination": "strong",
    "Schwache Deklination": "weak",
    "Gemischte Deklination": "mixed",
}
# Labels that do not fit any of the other tables.
OTHER_TAGS = {
    # from Vorlage:Deklinationsseite Adjektiv
    "Prädikativ": "predicative",
    "erweiterte": "extended",
    "Höflichkeitsform": "honorific",
    # from Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "nichterweitert": "not-extended",
    "erweitert": "extended",
}
# Verb tenses, from Vorlage:Deutsch Verb Übersicht.
TENSE_TAGS = {
    "Präsens": "present",
    "Präteritum": "past",
    "Perfekt": "perfect",
    "Futur I": "future-i",
    "Futur II": "future-ii",
    "Plusquamperfekt": "pluperfect",
}
# Verb moods, from Vorlage:Deutsch Verb Übersicht and
# Vorlage:Deutsch Verb regelmäßig.
MOOD_TAGS = {
    "Konjunktiv I": "subjunctive-i",
    "Konjunktiv II": "subjunctive-ii",
    # both spellings of the imperative label are accepted
    "Imperativ": "imperative",
    "Imperative": "imperative",
    "Indikativ": "indicative",
}
# Non-finite verb forms and the auxiliary-verb label, from
# Vorlage:Deutsch Verb Übersicht.
VERB_FORM_TAGS = {
    "Partizip II": "participle-2",
    "Hilfsverb": "auxiliary",
    # both spellings of the infinitive label are accepted
    "Infinitive": "infinitive",
    "Infinitiv": "infinitive",
    "Partizipien": "participle",
}
# Voice labels used in verb conjugation tables.
VOICE_TAGS = {
    # from Vorlage:Deutsch Verb unregelmäßig
    "Aktiv": "active",
    "Vorgangspassiv": "processual passive",
    "Zustandspassiv": "statal passive",
    "Passiv": "passive",
    "Gerundivum": "gerundive",
    # from Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "Zustandsreflexiv": "statal reflexive",
}
# Person/number labels; each one expands to two tags (person, number).
PERSON_TAGS = {
    # long form, from Vorlage:Deutsch Verb unregelmäßig
    "1. Person Singular": ["first-person", "singular"],
    "1. Person Plural": ["first-person", "plural"],
    "2. Person Singular": ["second-person", "singular"],
    "2. Person Plural": ["second-person", "plural"],
    "3. Person Singular": ["third-person", "singular"],
    "3. Person Plural": ["third-person", "plural"],
    # abbreviated form, from Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "Sg. 1. Pers.": ["first-person", "singular"],
    "Pl. 1. Pers.": ["first-person", "plural"],
    "Sg. 2. Pers.": ["second-person", "singular"],
    "Pl. 2. Pers.": ["second-person", "plural"],
    "Sg. 3. Pers.": ["third-person", "singular"],
    "Pl. 3. Pers.": ["third-person", "plural"],
}
# Remaining labels that occur in verb inflection tables.
INFLECTION_TABLE_TAGS = {
    # from Vorlage:Deutsch Verb regelmäßig
    "ungebräuchlich": "uncommon",
    "veraltet": "archaic",
    # from Vorlage:Deutsch Verb schwach trennbar reflexiv
    "Nebensatzkonjugation": "subordinate-clause",
    "Hauptsatzkonjugation": "main-clause",
}
# Single lookup table consulted by translate_raw_tags(): the union of all
# tag tables in this module.  Tables are unpacked in the order listed, so
# for a key present in more than one table the later table's value wins.
GRAMMATICAL_TAGS = {
    **K_TEMPLATE_TAGS,
    **GENDER_TAGS,
    **NUMBER_TAGS,
    **CASE_TAGS,
    **COMPARISON_TAGS,
    **DECLENSION_TAGS,
    **OTHER_TAGS,
    **TENSE_TAGS,
    **MOOD_TAGS,
    **VERB_FORM_TAGS,
    **VOICE_TAGS,
    **PERSON_TAGS,
    **INFLECTION_TABLE_TAGS,
}
# Subject-area labels mapped to English topic names.
# A value is either a topic string, or a dict with a "topic" and a "tag"
# entry when the label implies both (see "Soldatensprache").
K_TEMPLATE_TOPICS = {
    "Biologie": "biology",
    "Linguistik": "linguistics",
    "Wortbildung": "morphology",
    "Behörde": "government",
    "Astronomie": "astronomy",
    "Immobilienbranche": "real-estate",
    "Kunst": "arts",
    "Informatik": "computing",
    "Nautik": "nautical",
    "Sport": "sports",
    "Schuhwerk": "footwear",
    "Textilien": "textiles",
    "Zahlungsmittel": "payment-method",
    "Ökologie": "ecology",
    "Internet": "Internet",
    "Religion": "religion",
    "Militärsprache": "military",
    "Systematik": "systematics",
    "Zoologie": "zoology",
    "Seefahrt": "seafaring",
    "Soldatensprache": {"topic": "military", "tag": "slang"},
    "Botanik": "botany",
    "Marine": "navy",
    "Informationstechnologie": "computing",
    "Betriebswirtschaftslehre": "business",
    "Recht": "law",
    "Elektronik": "electronics",
    "Emotion": "emotion",
    "Mathematik": "mathematics",
    # Was "civil-Law"; hyphenated topic names here are all lower-case
    # ("real-estate", "payment-method", "card-games").
    "Bürgerliches Recht": "civil-law",
    "Militär": "military",
    "Politik": "politics",
    "Werkzeug": "tools",
    "Medizin": "medicine",
    "Ornithologie": "ornithology",
    "Technik": "technology",
    "Waffentechnik": "weaponry",
    "Anatomie": "anatomy",
    "Fußball": "football",
    "Kartenspiel": "card-games",
    "Theoretische Informatik": "computing",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Translate the German labels in ``data.raw_tags`` in place.

    Each label is looked up first in ``GRAMMATICAL_TAGS`` and then in
    ``K_TEMPLATE_TOPICS``:

    * a grammatical hit adds its tag (or every tag of a list value) to
      ``data.tags``;
    * a topic hit adds the topic to ``data.topics``; a dict value also
      adds its ``"tag"`` entry to ``data.tags``.  Topics are only
      considered when ``data`` has a ``topics`` attribute;
    * any other label is kept.

    On return ``data.raw_tags`` is replaced by a new list holding only
    the labels that were kept, in their original order.
    """
    untranslated = []
    for label in data.raw_tags:
        if label in GRAMMATICAL_TAGS:
            mapped = GRAMMATICAL_TAGS[label]
            if isinstance(mapped, str):
                data.tags.append(mapped)
            elif isinstance(mapped, list):
                data.tags.extend(mapped)
            # A recognised label is consumed even if its value has an
            # unexpected type.
            continue

        if label in K_TEMPLATE_TOPICS and hasattr(data, "topics"):
            mapped = K_TEMPLATE_TOPICS[label]
            if isinstance(mapped, str):
                data.topics.append(mapped)
            elif isinstance(mapped, dict):
                # One label that yields both a topic and a tag.
                data.topics.append(mapped.get("topic"))
                data.tags.append(mapped.get("tag"))
            continue

        untranslated.append(label)

    data.raw_tags = untranslated