Coverage for src/wiktextract/extractor/pl/tags.py: 86%
31 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
from .models import WordEntry
# Help:Abbreviations used in Wiktionary
# https://pl.wiktionary.org/wiki/Pomoc:Skróty_używane_w_Wikisłowniku
# Category:Shortcut templates
# https://pl.wiktionary.org/wiki/Kategoria:Szablony_skrótów
#
# Maps a raw Polish Wiktionary label (abbreviation or full word) to the
# English tag(s) stored on an entry.  A value is either a single tag string
# or a list of tag strings that are all added for that label.
TAGS = {
    "abl.": "ablative",
    # "akust.": "",
    "amer.": "US",
    "aor.": "aorist",
    "arab.": "Arabic",
    "bałt.": "Baltic",
    "bask.": "Basque",
    "bezok.": "infinitive",
    "bezosob.": "impersonal",
    "bibl.": "Biblical",
    "blm": "no-plural",
    "blp": "no-singulative",
    "Bm": "Bokmål",
    "bośn.": "Bosnian",
    "brytań.": "British",
    "bułg.": "Bulgarian",
    "bwr.": "Bavarian",
    "celt.": "Celtic",
    "chiń.": "Chinese",
    "chorw.": "Croatian",
    "cs.": "Church-Slavonic",
    "czes.": "Czech",
    "depr.": "depreciative",
    "dial.": "dialectal",
    "dk": "perfective",
    "dosł.": "literally",
    "du": "dual",
    "dysfem.": "dysphemism",
    "dysk.": "discourse",
    "dźwięk.": "onomatopoeic",
    "egip.": "Egyptian",
    "ekspr.": "expressively",
    "el.": "Greek",
    "erud.": "eruditely",
    "eufem.": "euphemistic",
    "ew.": "alternative",
    "ezot.": "esoteric",
    "franc.": "French",
    "galic.": "Galician",
    "germ.": "Germanic",
    "gr.": "Ancient-Greek",
    "grec.": "Ancient-Greek",
    "grub.": "offensive",
    "grzecz.": "polite",
    "gw.": "dialectal",
    "hebr.": "Hebrew",
    "hin.": "Hindi",
    "hiszp.": "Spanish",
    "honor.": "honorific",
    "ims.": "participle",
    "ind.": "India",
    "infant.": "childish",
    "irl.": "Irish",
    "iron.": "ironic",
    "iterat.": "iterative",
    "jap.": "Japanese",
    "kanad.": "Canadian-English",
    "kanad. franc.": "Canadian-French",
    "kant.": "Cantonese",
    "katal.": "Catalan",
    "kathar.": "Katharevousa",
    "kaz.": "Kazakh",
    "kor.": "Korean",
    "kor. płd.": "South-Korean",
    "kor. płn.": "North-Korean",
    "korn.": "Cornish",
    "książk.": "literary",
    "lekcew.": "pejorative",
    "lewant. arab.": "Levantine-Arabic",
    "libij. arab.": "Libyan-Arabic",
    "licz.": "numeral",
    "licz. gł.": "cardinal",
    "licz. porz.": "ordinal",
    "litew.": "Lithuanian",
    "lm": "plural",
    "lm m": ["plural", "masculine"],
    "lm nm": ["plural", "nonvirile"],
    "lp": "singular",
    "lud.": "vernacular",
    "lwow.": ["Lviv", "dialectal"],
    "łac.": "Latin",
    "łac.kośc.": ["Ecclesiastical", "Latin"],
    "łot.": "Latvian",
    "m": "masculine",
    "mac.": "Macedonian",
    "malaj.": "Malay",
    "marok.": "Moroccan",
    "międzyr.": "interfix",
    "młodz.": "youth",
    "mong.": "Mongolian",
    "mong. klas.": "Classical-Mongolian",
    "moz.": "Mozambique",
    "m.-os.": ["masculine", "personal"],
    "mrz": ["masculine", "inanimate"],
    "mzw": ["masculine", "animate"],
    "n": "neuter",
    "nah": "Nahuatl",
    "nbk.": "Bokmål",
    "ndk": "imperfective",
    "neol.": "neologism",
    "neutr.": "neutral",
    "n.gr.": "Modern-Greek",
    "niderl.": "Dutch",
    "nieofic.": "unofficially",
    "niem.": "German",
    "niem. RFN": "Standard-German",
    "nieodm.": "uninflected",
    "nieos.": "impersonal",
    "niepopr.": "incorrectly",
    "n.łac.": "Neo-Latin",
    "nm.-os.": "nonvirile",
    "nn": "Nynorsk",
    "norw.": "Norwegian",
    "nowozel": "New-Zealand",
    "nprzech.": "intransitive",
    "nwh.": "Navajo",
    "nżw": "inanimate",
    "nord.": "Nordic",
    "obraź.": "offensive",
    "odczas.": "verbal",
    "odm.": "inflected",
    "odprzym.": "deadjectival",
    "odrzecz.": "substantival",
    "ofic.": "officially",
    "ogsłow.": "Common-Slavic",
    "określ.": "determiner",
    "os.": "person",
    "oset.": "Ossetian",
    "osm.": "Ottoman",
    "oznajm.": "indicative",
    "partyk.": "particle",
    "paszto": "Pashto",
    "Prt.": "partitive",
    "pejor.": "pejorative",
    "pers.": "Persian",
    "peryfr.": "periphrastic",
    "p.gr": "Late-Greek",
    "pieszcz.": "endearing",
    "p.łac.": "Late-Latin",
    "płdbraz.": "Brazil",
    "płnlap.": "Northern-Sámi",
    "podn.": "elevatedly",
    "poet.": "poetic",
    "pogard.": "scornfully",
    "pol.": "Polish",
    "poł.": "Polabian",
    "port.": "Portuguese",
    "posp.": "commonly",
    "postp.": "postpositional",
    "pot.": "colloquial",
    "pozn.": ["Poznań", "regional"],
    "pragerm.": "Proto-Germanic",
    "praindoeur.": "Proto-Indo-European",
    "pranord.": "Proto-Norse",
    "prasł.": "Proto-Slavic",
    "praturk.": "Proto-Turkic",
    "prawdop.": "presumably",
    "prow.": "Provençal",
    "przech.": "transitive",
    "przecz.": "negation",
    "przedr.": "prefix",
    "przen.": "metaphoric",
    "przest.": "obsolete",
    "przesz.": "past",
    "przyim.": "prepositional",
    "przym.": "adjective",
    "przyp.": "subjunctive",
    "M.": "nominative",
    "Nom.": "nominative",
    "D.": "genitive",
    "Gen.": "genitive",
    "C.": "dative",
    "Dat.": "dative",
    "B.": "accusative",
    "Akk.": "accusative",
    "N.": "instrumental",
    "Ms.": "locative",
    "W.": "vocative",
    "adess.": "adessive",
    "all.": "allative",
    "ess.": "essive",
    "part.": "partitive",
    "przyr.": "suffix",
    "przysł.": "adverb",
    "przysz.": "future",
    "psych.": "psychology",
    "pszcz.": "beekeeping",
    "p. uwsp.": "modern",  # "(p. uwsp.)" from template "uwsp"
    "polinez.": "Polynesian",
    "qu.": "Quechua",
    "quen.": "Quenya",
    "rzym.": "Roman",
    "słow.": "Slavic",
    "sumer.": "Sumerian",
    "rodz.": "gendered-article",
    "rodz. nieokr.": ["indefinite", "article"],
    "rodz. okr.": ["definite", "article"],
    "ros.": "Russian",
    "rozk.": "imperative",
    "bryt. (RP)": ["British", "Received-Pronunciation"],
    "rub.": "broadly",
    "rum.": "Romanian",
    "run.": "Kirundi",
    "rzad.": "rare",
    "rzecz.": "noun",
    "sanskr.": "Sanskrit",
    "serb.": "Serbian",
    "skr.": "abbreviation",
    "slang.": "slang",
    "słc.": "Slovak",
    "słń.": "Slovene",
    # "słowiń." abbreviates "słowiński" (Slovincian); "słoweński" (Slovene)
    # is the separate "słń." entry above.
    "słowiń.": "Slovincian",
    "somal.": "Somali",
    "sp.": "conjunction",
    "st.ang.": "Old-English",
    "staroż.": "Ancient",
    "st.cons.": "construct",
    "st.czes.": "Old-Czech",
    "st.duń.": "Old-Danish",
    "st.egip.": "Ancient-Egyptian",
    "st.franc.": "Old-French",
    "st.fryz.": "Old-Frisian",
    "st.gr.": "Ancient-Greek",
    "st.ind.": "Ancient-Indian",
    "st.irl.": "Old-Irish",
    "st.łac.": "Old-Latin",
    "st.nord.": "Old-Norse",
    "st.pers.": "Old-Persian",
    "st.pol.": "Old-Polish",
    "st.poł.": "Old-Slavic",
    "st.prus.": "Old-Prussian",
    "strbr": "passive",
    "strcz": "active",
    "strzwr": "middle",
    "st.rus.": "Old-Russian",
    "st.saks.": "Old-Saxon",
    "st.szw.": "Old-Swedish",
    "st.turk.": "Old-Turkish",
    "sus.": "Susu",
    "sw.": "Swahili",
    "swn.": "Old-High-German",
    "symbol.": "symbol",
    "syn.": "synonym",
    "szw.": "Swedish",
    "szwajc. franc.": ["French", "Switzerland"],
    "szwajc. niem.": ["German", "Switzerland"],
    "szwajc. wł.": ["Italian", "Switzerland"],
    "szwb.": "German",
    "śdn.": "Middle-Low-German",
    "śl.": "Silesian",
    "średnioang.": "Middle-English",
    "średniofranc.": "Middle-French",
    "śr.gr.": "Medieval-Greek",
    "śr.łac.": "Medieval-Latin",
    "śr.niderl.": "Middle-Dutch",
    "śr.pol.": "Middle-Polish",
    "śwn.": "Middle-High-German",
    "t.": "also",
    "taj.": "Thai",
    "tamil.": "Tamil",
    "tatar.": "Tatar",
    "tem. słow.": "word-forming",
    "ter.": "present",
    # NOTE(review): "East Timor" is the only tag value containing a space;
    # other multi-word tags are hyphenated -- confirm whether intended.
    "tim. port.": ["Portuguese", "East Timor"],
    "tłum.": "translation",
    "trad.": "Traditional",
    "tur.": "Turkish",
    "turkm.": "Turkmen",
    "tuw.": "Tuvan",
    "tyb.": "Tibetan",
    "tzm.": "Tamazight",
    "UK": "UK",
    "ukr.": "Ukrainian",
    "uproszcz.": "Simplified",
    "urd.": "Urdu",
    "urz.": "formal",
    "US": "US",
    "(p. uwsp.)": "modern-spelling",
    "uzb.": "Uzbek",
    "vo.": "Volapük",
    "w": "common",
    "wal.": "Welsh",
    "war.": "variant",
    "warsz.": ["Warsaw", "dialectal"],
    "wed.": "Vedic",
    "wenec.": "Venetian",
    "węg.": "Hungarian",
    "wiet.": "Vietnamese",
    "wilam.": "Vilamovian",
    "wł.": "Italian",
    "wsch.": ["Eastern", "dialectal"],
    "współcz.": "contemporary",
    "wulg.": "vulgar",
    "wych. z uż.": "archaic",
    "wykrz.": "interjection",
    "wyr. przyim.": ["prepositional", "phrase"],
    "zach.": ["Western", "dialectal"],
    "zaim.": "pronoun",
    "zaw.": "professional",
    "zaz.": "Zazaki",
    "zdrobn.": "diminutive",
    "zgrub.": "augmentative",
    "zw rz": "regimen",
    "zw zg": "concord",
    "zwł.": "especially",
    "ż": "feminine",
    "żart.": "humorous",
    "żmd.": "Samogitian",
    "żw": "animate",
    "żyd.": "Jewish",
    # Category:Acronym templates - grammar
    # https://pl.wiktionary.org/wiki/Kategoria:Szablony_skrótów_-_gramatyka
    # gender types in POS line
    "męski": "masculine",
    "męskozwierzęcy": ["masculine", "animate"],
    "męskorzeczowy": ["masculine", "inanimate"],
    "niepoliczalny": "uncountable",
    "nieżywotny": "inanimate",
    "nijaki": "neuter",
    "policzalny": "countable",
    "przechodni": "transitive",
    "żeński": "feminine",
    "żywotny": "animate",
    "dzierżawczy": "possessive",
    "niedokonany": "imperfective",
    "dokonany": "perfective",
    "relacyjny": "relational",
    # "odmiana-rzeczownik-polski" template
    "liczba pojedyncza": "singular",
    "liczba mnoga": "plural",
    "mianownik": "nominative",
    "dopełniacz": "genitive",
    "celownik": "dative",
    "biernik": "accusative",
    "narzędnik": "instrumental",
    "miejscownik": "locative",
    "wołacz": "vocative",
    # "odmiana-przymiotnik-polski" template
    "mos/mzw": ["masculine", "animate"],
    "mos": "masculine",
    "nmos": "nonvirile",
    "stopień wyższy": "comparative",
    "stopień najwyższy": "superlative",
    # "odmiana-czasownik-polski" template
    "1. os.": "first-person",
    "2. os.": "second-person",
    "3. os.": "third-person",
    "bezokolicznik": "infinitive",
    "czas teraźniejszy": "present",
    "czas przeszły": "past",
    "tryb rozkazujący": "imperative",
    "czas przyszły": "future",
    "czas przyszły prosty": "future",
    "czas zaprzeszły": "pluperfect",
    "forma bezosobowa": "impersonal",
    "czasu przeszłego": "past",
    "tryb przypuszczający": "conditional",
    "imiesłów przymiotnikowy czynny": ["active", "participle"],
    "imiesłów przymiotnikowy bierny": ["passive", "participle"],
    "imiesłów przysłówkowy współczesny": [
        "contemporary",
        "adverbial",
        "participle",
    ],
    "imiesłów przymiotnikowy przeszły": ["past", "participle"],
    "imiesłów przysłówkowy uprzedni": ["anterior", "adverbial", "participle"],
    "rzeczownik odczasownikowy": "gerund",
    # "odmiana-rzeczownik-esperanto" template
    "ununombro": "singular",
    "multenombro": "plural",
    "nominativo": "nominative",
    "akuzativo": "accusative",
    "multenombro (virtuala)": ["plural", "virtual"],
    # pos line
    "nieprzechodni": "intransitive",
    "czas.": "verb",
    "ndk.": "imperfective",
    "dk.": "perfective",
    "wspólny": "common",
    "męskoosobowy": "masculine",
    "daw.": "dated",
    "zwrotny": "reflexive",
    "czasownikowa": "verb",
    "nieprzechodnia": "intransitive",
    "słaby": "weak",
    "bryt.": "British-English",
    "niemęskoosobowy": "nonvirile",
    "nazwa własna": "proper-noun",
    "jakościowy": "qualitative",
    "policzalna": "countable",
    "mocny": "strong",
    "temat": "stem",
    "niedokonana": "imperfective",
    "transkrypcja w systemie Hepburna": "Hepburn-romanization",
}
# Maps a raw Polish Wiktionary subject-field abbreviation to the English
# topic name stored on an entry.  Every value is a single string.
# Commented-out entries are abbreviations without a mapping yet.
TOPICS = {
    "adm.": "administration",
    "agrot.": "agrotechnology",
    "alch.": "alchemy",
    "anat.": "anatomy",
    "antrop.": "anthropology",
    "arachn.": "arachnology",
    "archit.": "architecture",
    "archeol.": "archeology",
    "astr.": "astronomy",
    "astrol.": "astrology",
    "astronaut.": "astronautics",
    "bank.": "banking",
    # "bibliot.": "",
    "biochem.": "biochemistry",
    "biol.": "biology",
    # "biur.": "",
    "bot.": "botany",
    "bud.": "construction",
    "ceram.": "ceramics",
    "chem.": "chemistry",
    "choreogr.": "choreography",
    "cukiernictwo.": "confectionery",
    "cybern.": "cybernetics",
    # "daw.": "",
    "demogr.": "demography",
    "dendr.": "dendrology",
    "drewn.": "woodworking",
    "druk.": "printing",
    "dypl.": "diplomacy",
    "eduk.": "education",
    "ekol.": "ecology",
    "ekon.": "economics",
    "elektr.": "electricity",
    "elektron.": "electronics",
    "enol.": "oenology",
    "ent.": "entomology",
    "etn.": "ethnography",
    "etym.": "etymology",
    "fant.": "speculative-fiction",
    "farm.": "pharmacology",
    "felinol.": "felinology",
    "filatel.": "philately",
    "film.": "film",
    "filoz.": "philosophy",
    "finans.": "finance",
    "fitopatol.": "phytopathology",
    "fiz.": "physics",
    "fizj.": "physiology",
    "flis.": "timber-rafting",
    "folk.": "folklore",
    "fonet.": "phonetics",
    "form. słow.": "word-forming",
    "fot.": "photography",
    "fryzj.": "hairdressing",
    "garb.": "tanning",
    "gastr.": "gastronomy",
    "genet.": "genetics",
    "geod.": "geodesy",
    "geofiz.": "geophysics",
    "geogr.": "geography",
    "geol.": "geology",
    "geom.": "geometry",
    "gend. st.": "gender-studies",
    "ginek.": "gynaecology",
    "górn.": "mining",
    "gram.": "grammar",
    # NOTE(review): the only topic value containing a space; other
    # multi-word topics are hyphenated -- confirm whether intended.
    "gry komp.": "computer games",
    "hand.": "trade",
    "harc.": "scouting",
    "herald.": "heraldry",
    "herp.": "herpetology",
    "hig.": "hygienic",
    "hipol.": "hippology",
    "hist.": "history",
    "hotel.": "hotel-industry",
    "hutn.": "metallurgy",
    "hydraul.": "hydraulics",
    "hydrol.": "hydrology",
    "icht.": "ichthyology",
    "ikonogr.": "iconography",
    "inform.": "computer-science",
    "jedn. miar.": "units-of-measure",
    "jedn. monet.": "units-of-monetary",
    "jeźdz.": "equestrianism",
    "jęz.": "linguistics",
    "jubil.": "jewelry",
    "kartogr.": "cartography",
    "kolej.": "railways",
    "konserwat.": "conservation",
    "kosmet.": "cosmetics",
    "kośc.": "ecclesiastical",
    "kraw.": "tailoring",
    "krym.": "criminology",
    "kryptogr.": "cryptography",
    "krystal.": "crystallography",
    "księg.": "accounting",
    "kulin.": "culinary",
    "kult.": "cultural-studies",
    "kynol.": "cynology",
    "leśn.": "forestry",
    "liter.": "literature",
    "log.": "logic",
    "lotn.": "aviation",
    "łow.": "hunting",
    "mar.": "nautical",
    "mat.": "mathematics",
    "mebl.": "furniture",
    "mech.": "mechanics",
    "med.": "medicine",
    "met.": "metallurgy",
    "meteorol.": "meteorology",
    "metrol.": "metrology",
    "mikol.": "mycology",
    "mikrobiol.": "microbiology",
    "miner.": "mineralogy",
    "mit.": "mythology",
    "młyn.": "milling",
    "monet.": "monetary-unit",
    "mors.": "maritime",
    "mot.": "automotive",
    "muz.": "musicology",
    "myśl.": "hunting",
    "nauk.": "sciences",
    "nawig.": "navigation",
    "numizm.": "numismatics",
    "obuw.": "footwear",
    "oceanogr.": "oceanography",
    "odl.": "foundry",
    "odzież.": "clothing-industry",
    "opt.": "optics",
    "ornit.": "ornithology",
    "paleoantrop.": "paleoanthropology",
    "paleont.": "paleontology",
    "papier.": "papermaking",
    "pedag.": "pedagogy",
    "poczt.": "mail",
    "poligr.": "printing",
    "polit.": "political-science",
    "praw.": "law",
    "przestęp.": "criminal",
    "rad.": "radio",
    "reg.": "region",
    "rel.": "religion",
    "ręk.": "handicrafts",
    "roln.": "agriculture",
    "ryb.": "fishing",
    "rzem.": "crafts",
    "seks.": "sexology",
    "s.f.": "science-fiction",
    "socjol.": "sociology",
    "speleol.": "speleology",
    "społ.": "social",
    "sport.": "sports",
    "spoż.": "food",
    "stat.": "statistics",
    "stomat.": "stomatology",
    "szach.": "chess",
    "szt.": "art",
    "taur.": "bullfighting",
    "teatr.": "theater",
    "techn.": "technology",
    "telegr.": "telegraphy",
    "telekom.": "telecommunications",
    "telew.": "television",
    "teol.": "theology",
    "toksykol.": "toxicology",
    "topogr.": "topography",
    "transp.": "transport",
    "turyst.": "tourism",
    "typogr.": "typography",
    "urb.": "urbanism",
    "wet.": "veterinary",
    "wędk.": "fishing",
    "więz.": "prison",
    "wiośl.": "rowing",
    "włók.": "textiles",
    "wojsk.": "military",
    "zarz.": "management",
    "zeg.": "horology",
    "zool.": "zoology",
    "żegl.": "sailing",
    "ogrod.": "horticulture",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Translate ``data.raw_tags`` in place via ``check_tag``.

    Each raw tag is first looked up as a whole.  If that fails, it is split
    on whitespace and every part is looked up individually.  A raw tag is
    kept in ``data.raw_tags`` only when neither the whole string nor any of
    its parts was recognised; otherwise it is dropped (even if some of its
    parts were not recognised).

    ``data.raw_tags`` is replaced by a new list holding the untranslated
    raw tags in their original order.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if check_tag(data, raw_tag):
            continue
        # Build a list rather than feeding a generator to any(): check_tag
        # mutates `data`, so every part must be offered to it, not just the
        # parts up to the first hit.
        part_hits = [check_tag(data, part) for part in raw_tag.split()]
        if not any(part_hits):
            untranslated.append(raw_tag)
    data.raw_tags = untranslated
def check_tag(data: WordEntry, raw_tag: str) -> bool:
    """Record the translation of ``raw_tag`` on ``data`` if one is known.

    ``raw_tag`` is looked up in ``TAGS`` first (used when ``data`` has a
    ``tags`` attribute), then in ``TOPICS`` (used when ``data`` has a
    ``topics`` attribute).  The mapped value may be a single string or a
    list of strings; each value is appended to the target list unless it is
    already present, so neither tags nor topics accumulate duplicates.

    Returns ``True`` if a tag or topic mapping was found, ``False``
    otherwise.
    """
    if raw_tag in TAGS and hasattr(data, "tags"):
        mapped = TAGS[raw_tag]
        target = data.tags
    elif raw_tag in TOPICS and hasattr(data, "topics"):
        mapped = TOPICS[raw_tag]
        target = data.topics
    else:
        return False

    # Normalise the single-string form so both shapes share one code path.
    values = [mapped] if isinstance(mapped, str) else mapped
    for value in values:
        if value not in target:
            target.append(value)
    return True