Coverage for src/wiktextract/extractor/de/tags.py: 81%

1from .models import WordEntry

# Sense tags
# https://de.wiktionary.org/wiki/Vorlage:K
# https://de.wiktionary.org/wiki/Vorlage:K/Abk
#
# Raw label -> English tag (a string) or several tags (a list of strings).
# Commented-out entries are labels that deliberately have no mapping here.
K_TEMPLATE_TAGS = {
    "Abl.": "ablative",
    "Ablativ": "ablative",
    "abw.": "derogatory",
    "abwertend": "derogatory",
    "AE": "US",
    "AmE": "US",
    "adv.": "adverbial",
    "Akkusativ": "accusative",
    "alemann.": "Alemannic",
    "alemannisch": "Alemannic",
    "allg.": "general",
    "allgemein": "general",
    "alltagsspr.": "colloquial",
    "amtsspr.": "officialese",
    # "ansonsten": "otherwise",  # combined with other text
    "attr.": "attributive",
    # "auch": "also",
    "bair.": "Bavarian",
    "bairisch": "Bavarian",
    "bar.": "Bavarian",
    "direktional": "directional",
    "BE": "British",
    "BrE": "British",
    "Bedva.": "outdated",
    "Bedvatd.": "outdated",
    "besonders": "especially",
    "veraltende Bedeutung": "outdated",
    # "bei": "",
    # "bes.": "especially",
    # "beziehungsweise": "",
    # "bzw.": "",
    # "bildungsspr.": "",
    # "bis": "",
    # "bisweilen": "",
    # "das": "",
    "Dativ": "dative",
    # "DDR": "",
    "Deutschland": "Germany",
    # "der": "",
    "dichter.": "poetic",
    # "die": "",
    "Dim.": "diminutive",
    "Dimin.": "diminutive",
    "Diminutiv": "diminutive",
    # "eher": "",
    "erzg.": "Erzgebirgisch",
    "erzgeb.": "Erzgebirgisch",
    "erzgebirgisch": "Erzgebirgisch",
    "euph.": "euphemistic",
    "fachspr.": "jargon",
    "fachsprachlich": "jargon",
    # Was the untranslated German word "familiär".
    "fam.": "familiar",
    "fig": "figurative",
    "fig.": "figurative",
    # "früher": "",
    # "gegenwartslateinisch": "",
    # Was the untranslated German word "gehoben"; OTHER_TAGS maps the
    # spelled-out label "gehoben" to "literary", so the abbreviation agrees.
    "geh.": "literary",
    "Genitiv": "genitive",
    "gsm": "Swiss German",
    "häufig": "often",
    "haben": "auxiliary",
    "hebben": "auxiliary",
    "hauptsächlich": "primarily",
    "hist.": "historical",
    "ieS": "narrowly",
    "i.e.S.": "narrowly",
    "i. e. S.": "narrowly",
    # "im": "",
    # "in": "",
    # "in Bezug auf": "relational",
    "indekl.": "indeclinable",
    # "insbes.": "",
    "Instrumental": "instrumental",
    "intrans.": "intransitive",
    "intransitiv": "intransitive",
    # "iPl": "in plural",
    "iron.": "ironic",
    # "iwS": "",
    # "jugendspr.": "",
    "kinderspr.": "childish",
    "kirchenlateinisch": "Church Latin",
    "klasslat.": "Classical Latin",
    "klassischlateinisch": "Classical Latin",
    "kPl.": "no-plural",
    "kein Plural": "no-plural",
    "kSg.": "no-singulative",
    "kSt.": "no-comparative",
    "kurz für": "short-form",
    "landsch.": "regional",
    "landschaftlich": "regional",
    "lautm.": "onomatopoeic",
    "lokal": "regional",
    "Ling.": "linguistics",
    "mA": "accusative",
    "md.": "Central German",
    "mdal.": "dialectal",
    "Med.": "medicine",  # topic
    # "meist": "mostly",
    # "meistens": "mostly",
    "metaphor.": "metaphoric",
    "meton.": "metonymically",
    "mG": "genitive",
    "mitteld.": "Central German",
    "mit Dativ": "with-dative",
    "mit Akkusativ": "with-accusative",
    # "mitunter": "",
    "mlat.": "Medieval Latin",
    "mittellateinisch": "Medieval Latin",
    "mundartl.": "dialectal",
    "nDu.": "only-dual",
    "nigr.": "Niger",
    "nigrisch": "Niger",
    "nkLat.": "post-Classical Latin",
    "nachklassischlateinisch": "post-Classical Latin",
    "nlat.": "New Latin",
    "neulateinisch": "New Latin",
    "nordd.": "North German",
    "norddeutsch": "North German",
    "nordwestd.": "Northwestern Germany",
    "nPl.": "plural-only",
    "Österreich": "Austrian German",
    "örtlich": "regional",
    "österr.": "Austrian German",
    "österreichisch": "Austrian German",
    "ostfränkisch": "East Franconian German",
    "pej.": "pejorative",
    "personifizierend": "person",
    "poet.": "poetic",
    "PräpmG": "genitive prepositional",
    "PmG": "genitive prepositional",
    "reg.": "regional",
    "refl.": "reflexive",
    "reflexiv": "reflexive",
    # "respektive": "",
    "sal.": "casual",
    "salopp": "casual",
    "scherzh.": "jocular",
    "schriftspr.": "literary",
    # "schülerspr.": "",
    "schwäb.": "Swabian",
    "schwäbisch": "Swabian",
    "Schweiz": "Swiss Standard German",
    "schweiz.": "Swiss Standard German",
    "schweizerisch": "Swiss Standard German",
    "Schweizerdeutsch": "Swiss German",
    "schweizerdeutsch": "Swiss German",
    # "seemannsspr.": "",
    # NOTE(review): "haben"/"hebben" map to "auxiliary" while this maps to
    # "auxiliary verb" -- confirm the difference is intentional.
    "sein": "auxiliary verb",
    # "sehr": "",  # very
    "selten": "rare",
    "seltener": "rare",
    "seltener auch": "rare",
    "soldatenspr.": ["military", "slang"],
    # "sonderspr.": "",
    # "sonst": "",
    # "sowie": "",
    "spätlat.": "Late Latin",
    "spätlateinisch": "Late Latin",
    # "später": "",
    "speziell": "special",
    "südd.": "South German",
    "süddt.": "South German",
    # "techn.": "",
    # "teils": "",
    # "teilweise": "",
    "temporal": "temporal",
    "tlwva.": "outdated",
    "tlwvatd.": "outdated",
    "trans.": "transitive",
    "transitiv": "transitive",
    # "über": "",
    # "überwiegend": "mostly",
    "übertr.": "figurative",
    "übertragen": "figurative",
    "ugs.": "colloquial",
    "umgangssprachlich": "colloquial",
    # "und": "",
    "ungebr.": "uncommon",
    "unpers.": "impersonal",
    "unpersönlich": "impersonal",
    # "ursprünglich": "",
    "va.": "outdated",
    "vatd.": "outdated",
    "veraltend": "outdated",
    # "verh.": "",
    "volkst.": "popular",
    # "von": "",
    # "vor allem": "",
    # "vor allem in": "",
    "vul.": "vulgar",
    "vulg.": "vulgar",
    "vlat.": ["vulgar", "Latin"],
    "vulgärlat": ["vulgar", "Latin"],
    "vulgärlateinisch": ["vulgar", "Latin"],
    "wien.": "Vienna",
    "wienerisch": "Vienna",
    "Wpräp": "prepositional",
    # "z. B.": "",
    # "z. T.": "",
    # "zijn": "",
    # "zum Beispiel": "",
    # "zum Teil": "",
    # "zumeist": "",
    "Kardinalzahl": "cardinal",
    "Sammelbegriff": "collective",
    "Fachsprache": "jargon",
    "formale Sprachen": "formal",
    "Programmiersprachen": "programming",
    "Rechnerarchitektur": "programming",
    "Geografie": "geography",
    "Geometrie": "geometry",
    "Finanzwesen": "finance",
    "juristisch": "law",
    "Physik": "physics",
    "abstrakt": "abstract",
    "gegenständlich": "objective",
    "personifiziert": "personal",
    "kirchlich": "Ecclesiastical",
}

227

# Grammatical gender labels -> English gender tag.
GENDER_TAGS = {
    # Single-letter abbreviations
    "n": "neuter",
    "m": "masculine",
    "f": "feminine",
    # Vorlage:Deklinationsseite Adjektiv
    "Maskulinum": "masculine",
    "Femininum": "feminine",
    "Neutrum": "neuter",
}

237

# Grammatical number labels -> English number tag.
NUMBER_TAGS = {
    # Vorlage:Deutsch Substantiv Übersicht
    "Singular": "singular",
    "Plural": "plural",
    "Pl.": "plural",
    "Dual": "dual",
}

245

# Grammatical case labels -> English case tag(s).
# A list value means the label expands to several tags.
CASE_TAGS = {
    # Vorlage:Deutsch Substantiv Übersicht
    "Nominativ": "nominative",
    "Genitiv": "genitive",
    "Dativ": "dative",
    "Akkusativ": "accusative",
    # Template:Polnisch Substantiv Übersicht
    "Lokativ": "locative",
    "Vokativ": "vocative",
    "Dativ Singular": ["dative", "singular"],
    "Genitiv Singular": ["genitive", "singular"],
    # Template:Finnisch Substantiv Übersicht
    "Inessiv": "inessive",
    "Elativ": "elative",
    "Illativ": "illative",
    "Adessiv": "adessive",
    "Allativ": "allative",
    "Essiv": "essive",
    "Translativ": "translative",
    "Abessiv": "abessive",
    "Instruktiv": "instructive",
    "Komitativ": "comitative",
}

269

# Degree-of-comparison labels -> English tag.
COMPARISON_TAGS = {
    # Vorlage:Deutsch Adjektiv Übersicht
    # Vorlage:Deklinationsseite Adjektiv
    "Positiv": "positive",
    "Komparativ": "comparative",
    "Superlativ": "superlative",
}

277

# Declension-class headings -> English tag.
DECLENSION_TAGS = {
    # https://en.wikipedia.org/wiki/German_declension
    # Vorlage:Deklinationsseite Adjektiv
    "Starke Deklination": "strong",
    "Schwache Deklination": "weak",
    "Gemischte Deklination": "mixed",
}

285

# Miscellaneous labels that fit none of the more specific tables.
# A list value means the label expands to several tags.
OTHER_TAGS = {
    # Vorlage:Deklinationsseite Adjektiv
    "Prädikativ": "predicative",
    "erweiterte": "extended",
    "Höflichkeitsform": "honorific",
    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "nichterweitert": "not-extended",
    "erweitert": "extended",
    "zeitlich": "temporal",
    "indeklinabel": "indeclinable",
    "östlich": "Eastern",
    "westlich": "Western",
    "britisch": "British",
    "Substantive": "noun",
    "Substantiv": "noun",
    "historisch": "historical",
    "wörtlich": "literally",
    "Adjektiv": "adjective",
    "gehoben": "literary",
    "Nebenform von": "variant",
    "Verben": "verb",
    "regional": "regional",
    # Vorlage:CH&LI
    "Schweiz und Liechtenstein": ["Switzerland", "Liechtenstein"],
    "Switzerland and Liechtenstein": ["Switzerland", "Liechtenstein"],
    "traditionell": "traditional",
    "vereinfachte Schreibweise": "simplified",
}

314

# Tense and aspect labels -> English tag.
TENSE_TAGS = {
    # Vorlage:Deutsch Verb Übersicht
    "Präsens": "present",
    "Präteritum": "past",
    "Perfekt": "perfect",
    "Futur I": "future-i",
    "Futur II": "future-ii",
    "Plusquamperfekt": "pluperfect",
    # Template:Kroatisch Verb Übersicht
    "perfektiv": "perfective",
    "imperfektiv": "imperfective",
    "Imperfekt": "imperfect",
}

328

# Verb mood labels -> English tag.
MOOD_TAGS = {
    # Vorlage:Deutsch Verb Übersicht
    # Vorlage:Deutsch Verb regelmäßig
    "Konjunktiv I": "subjunctive-i",
    "Konjunktiv II": "subjunctive-ii",
    "Imperativ": "imperative",
    "Imperative": "imperative",
    "Indikativ": "indicative",
}

338

# Verb-form labels -> English tag(s).
# A list value means the label expands to several tags.
VERB_FORM_TAGS = {
    # Vorlage:Deutsch Verb Übersicht
    "Partizip II": "participle-2",
    "Hilfsverb": "auxiliary",
    "Infinitive": "infinitive",
    "Infinitiv": "infinitive",
    "Partizipien": "participle",
    "unregelmäßig": "irregular",
    "Aorist": "aorist",
    # Template:Dänisch Verb Übersicht
    "Partizip Perfekt": ["participle", "perfect"],
}

351

# Grammatical voice labels -> English tag.
VOICE_TAGS = {
    # Vorlage:Deutsch Verb unregelmäßig
    "Aktiv": "active",
    "Vorgangspassiv": "processual-passive",
    "Zustandspassiv": "statal-passive",
    "Passiv": "passive",
    "Gerundivum": "gerundive",
    # Vorlage:Deutsch Verb schwach untrennbar reflexiv
    "Zustandsreflexiv": "statal-reflexive",
}

362

# Person/number labels -> [person tag, number tag].
PERSON_TAGS = {
    # Vorlage:Deutsch Verb unregelmäßig (spelled-out form)
    "1. Person Singular": ["first-person", "singular"],
    "1. Person Plural": ["first-person", "plural"],
    "2. Person Singular": ["second-person", "singular"],
    "2. Person Plural": ["second-person", "plural"],
    "3. Person Singular": ["third-person", "singular"],
    "3. Person Plural": ["third-person", "plural"],
    # Vorlage:Deutsch Verb schwach untrennbar reflexiv (abbreviated form)
    "Sg. 1. Pers.": ["first-person", "singular"],
    "Pl. 1. Pers.": ["first-person", "plural"],
    "Sg. 2. Pers.": ["second-person", "singular"],
    "Pl. 2. Pers.": ["second-person", "plural"],
    "Sg. 3. Pers.": ["third-person", "singular"],
    "Pl. 3. Pers.": ["third-person", "plural"],
}

379

# Labels found in inflection-table templates -> English tag(s).
# A list value means the label expands to several tags.
INFLECTION_TABLE_TAGS = {
    # Vorlage:Deutsch Verb regelmäßig
    "ungebräuchlich": "uncommon",
    "veraltet": "archaic",
    # Vorlage:Deutsch Verb schwach trennbar reflexiv
    "Nebensatzkonjugation": "subordinate-clause",
    "Hauptsatzkonjugation": "main-clause",
    "regelmäßig": "regular",
    "untrennbar": "inseparable",
    "trennbar": "separable",
    # Vorlage:Deutsch Nachname Übersicht
    "Singular m": ["singular", "masculine"],
    "Singular f": ["singular", "feminine"],
    # Vorlage:Deklinationsseite Numerale
    "bestimmt": "definite",
    "unbestimmt": "indefinite",
    "mit Possessivpronomen": ["possessive", "pronoun"],
    # Template:Kroatisch Verb Übersicht
    "Partizip Präteritum Aktiv": ["past", "participle", "active"],
}

400

# Union of every tag table above, used for a single lookup per raw tag.
# Tables are merged in the listed order, so when a label appears in more
# than one table the value from the later table is the one kept.
GRAMMATICAL_TAGS = {
    label: mapped
    for table in (
        K_TEMPLATE_TAGS,
        GENDER_TAGS,
        NUMBER_TAGS,
        CASE_TAGS,
        COMPARISON_TAGS,
        DECLENSION_TAGS,
        OTHER_TAGS,
        TENSE_TAGS,
        MOOD_TAGS,
        VERB_FORM_TAGS,
        VOICE_TAGS,
        PERSON_TAGS,
        INFLECTION_TABLE_TAGS,
    )
    for label, mapped in table.items()
}

416

# Subject-area labels -> English topic.
# A value is either a topic string, or a dict {"topic": ..., "tag": ...}
# when the label implies both a topic and an additional tag.
K_TEMPLATE_TOPICS = {
    "Biologie": "biology",
    "Linguistik": "linguistics",
    "Wortbildung": "morphology",
    "Behörde": "government",
    "Astronomie": "astronomy",
    "Immobilienbranche": "real-estate",
    "Kunst": "arts",
    "Informatik": "computing",
    "Nautik": "nautical",
    "Sport": "sports",
    "Schuhwerk": "footwear",
    "Textilien": "textiles",
    "Zahlungsmittel": "payment-method",
    "Ökologie": "ecology",
    "Internet": "Internet",
    "Religion": "religion",
    "Militärsprache": "military",
    "Systematik": "systematics",
    "Zoologie": "zoology",
    "Seefahrt": "seafaring",
    "Soldatensprache": {"topic": "military", "tag": "slang"},
    "Botanik": "botany",
    "Marine": "navy",
    "Informationstechnologie": "computing",
    "Betriebswirtschaftslehre": "business",
    "Recht": "law",
    "Elektronik": "electronics",
    "Emotion": "emotion",
    "Mathematik": "mathematics",
    # Was "civil-Law"; lower-cased to match the other hyphenated topics.
    "Bürgerliches Recht": "civil-law",
    "Militär": "military",
    "Politik": "politics",
    "Werkzeug": "tools",
    "Medizin": "medicine",
    "Ornithologie": "ornithology",
    "Technik": "technology",
    "Waffentechnik": "weaponry",
    "Anatomie": "anatomy",
    "Fußball": "football",
    "Kartenspiel": "card-games",
    "Theoretische Informatik": "computing",
    "militärisch": "military",
    "Taxonomie": "taxonomy",
}

462

463

def translate_raw_tags(data: WordEntry) -> None:
    """Translate the recognised entries of ``data.raw_tags`` in place.

    Each raw tag is looked up in ``GRAMMATICAL_TAGS`` first and, failing
    that, in ``K_TEMPLATE_TOPICS`` (only when ``data`` has a ``topics``
    attribute). Translated values are appended to ``data.tags`` or
    ``data.topics`` without creating duplicates. Raw tags found in neither
    table are kept, in their original order, in ``data.raw_tags``.

    Args:
        data: Object exposing ``raw_tags`` and ``tags`` lists and,
            optionally, a ``topics`` list. It is mutated in place.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in GRAMMATICAL_TAGS:
            mapped = GRAMMATICAL_TAGS[raw_tag]
            # A table value is either one tag or a list of tags.
            for tag in [mapped] if isinstance(mapped, str) else mapped:
                if tag not in data.tags:
                    data.tags.append(tag)
        elif raw_tag in K_TEMPLATE_TOPICS and hasattr(data, "topics"):
            mapped = K_TEMPLATE_TOPICS[raw_tag]
            if isinstance(mapped, dict):
                topic, extra_tag = mapped["topic"], mapped["tag"]
            else:
                topic, extra_tag = mapped, None
            if topic not in data.topics:
                data.topics.append(topic)
            # The companion tag is checked independently of the topic:
            # previously it was skipped whenever the topic was already
            # present, silently losing the tag.
            if extra_tag is not None and extra_tag not in data.tags:
                data.tags.append(extra_tag)
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated