Coverage for src/wiktextract/extractor/pl/tags.py: 86%
31 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from .models import WordEntry
3# Help:Abbreviations used in Wiktionary
4# https://pl.wiktionary.org/wiki/Pomoc:Skróty_używane_w_Wikisłowniku
5# Category:Shortcut templates
6# https://pl.wiktionary.org/wiki/Kategoria:Szablony_skrótów
# Lookup table from the labels found in Polish Wiktionary entries
# (abbreviations, gender/aspect words on the POS line, inflection-table
# headers) to the English tag(s) stored on the extracted data.
# A value is either a single tag string or a list of tag strings; multi-word
# tags are joined with hyphens.
# Entries that are commented out with an empty value have no translation yet.
TAGS = {
    "abl.": "ablative",
    # "akust.": "",
    "amer.": "US",
    "aor.": "aorist",
    "arab.": "Arabic",
    "bałt.": "Baltic",
    "bask.": "Basque",
    "bezok.": "infinitive",
    "bezosob.": "impersonal",
    "bibl.": "Biblical",
    "blm": "no-plural",
    "blp": "no-singulative",
    "Bm": "Bokmål",
    "bośn.": "Bosnian",
    "brytań.": "British",
    "bułg.": "Bulgarian",
    "bwr.": "Bavarian",
    "celt.": "Celtic",
    "chiń.": "Chinese",
    "chorw.": "Croatian",
    "cs.": "Church-Slavonic",
    "czes.": "Czech",
    "depr.": "depreciative",
    "dial.": "dialectal",
    "dk": "perfective",
    "dosł.": "literally",
    "du": "dual",
    "dysfem.": "dysphemism",
    "dysk.": "discourse",
    "dźwięk.": "onomatopoeic",
    "egip.": "Egyptian",
    "ekspr.": "expressively",
    "el.": "Greek",
    "erud.": "eruditely",
    "eufem.": "euphemistic",
    "ew.": "alternative",
    "ezot.": "esoteric",
    "franc.": "French",
    "galic.": "Galician",
    "germ.": "Germanic",
    "gr.": "Ancient-Greek",
    "grec.": "Ancient-Greek",
    "grub.": "offensive",
    "grzecz.": "polite",
    "gw.": "dialectal",
    "hebr.": "Hebrew",
    "hin.": "Hindi",
    "hiszp.": "Spanish",
    "honor.": "honorific",
    "ims.": "participle",
    "ind.": "India",
    "infant.": "childish",
    "irl.": "Irish",
    "iron.": "ironic",
    "iterat.": "iterative",
    "jap.": "Japanese",
    "kanad.": "Canadian-English",
    "kanad. franc.": "Canadian-French",
    "kant.": "Cantonese",
    "katal.": "Catalan",
    "kathar.": "Katharevousa",
    "kaz.": "Kazakh",
    "kor.": "Korean",
    "kor. płd.": "South-Korean",
    "kor. płn.": "North-Korean",
    "korn.": "Cornish",
    "książk.": "literary",
    "lekcew.": "pejorative",
    "lewant. arab.": "Levantine-Arabic",
    "libij. arab.": "Libyan-Arabic",
    "licz.": "numeral",
    "licz. gł.": "cardinal",
    "licz. porz.": "ordinal",
    "litew.": "Lithuanian",
    "lm": "plural",
    "lm m": ["plural", "masculine"],
    "lm nm": ["plural", "nonvirile"],
    "lp": "singular",
    "lud.": "vernacular",
    "lwow.": ["Lviv", "dialectal"],
    "łac.": "Latin",
    "łac.kośc.": ["Ecclesiastical", "Latin"],
    "łot.": "Latvian",
    "m": "masculine",
    "mac.": "Macedonian",
    "malaj.": "Malay",
    "marok.": "Moroccan",
    "międzyr.": "interfix",
    "młodz.": "youth",
    "mong.": "Mongolian",
    "mong. klas.": "Classical-Mongolian",
    "moz.": "Mozambique",
    "m.-os.": ["masculine", "personal"],
    "mrz": ["masculine", "inanimate"],
    "mzw": ["masculine", "animate"],
    "n": "neuter",
    "nah": "Nahuatl",
    "nbk.": "Bokmål",
    "ndk": "imperfective",
    "neol.": "neologism",
    "neutr.": "neutral",
    "n.gr.": "Modern-Greek",
    "niderl.": "Dutch",
    "nieofic.": "unofficially",
    "niem.": "German",
    "niem. RFN": "Standard-German",
    "nieodm.": "uninflected",
    "nieos.": "impersonal",
    "niepopr.": "incorrectly",
    "n.łac.": "Neo-Latin",
    "nm.-os.": "nonvirile",
    "nn": "Nynorsk",
    "norw.": "Norwegian",
    "nowozel": "New-Zealand",
    "nprzech.": "intransitive",
    "nwh.": "Navajo",
    "nżw": "inanimate",
    "nord.": "Nordic",
    "obraź": "offensive",
    "odczas.": "verbal",
    "odm.": "inflected",
    "odprzym.": "deadjectival",
    "odrzecz.": "substantival",
    "ofic.": "officially",
    "ogsłow.": "Common-Slavic",
    "określ.": "determiner",
    "os.": "person",
    "oset.": "Ossetian",
    "osm.": "Ottoman",
    "oznajm.": "indicative",
    "partyk.": "particle",
    "paszto": "Pashto",
    "Prt.": "partitive",
    "pejor.": "pejorative",
    "pers.": "Persian",
    "peryfr.": "periphrastic",
    "p.gr": "Late-Greek",
    "pieszcz.": "endearing",
    "p.łac.": "Late-Latin",
    "płdbraz.": "Brazil",
    "płnlap.": "Northern-Sámi",
    "podn.": "elevatedly",
    "poet.": "poetic",
    "pogard.": "scornfully",
    "pol.": "Polish",
    "poł.": "Polabian",
    "port.": "Portuguese",
    "posp.": "commonly",
    "postp.": "postpositional",
    "pot.": "colloquial",
    "pozn.": ["Poznań", "regional"],
    "pragerm.": "Proto-Germanic",
    "praindoeur.": "Proto-Indo-European",
    "pranord.": "Proto-Norse",
    "prasł.": "Proto-Slavic",
    "praturk.": "Proto-Turkic",
    "prawdop.": "presumably",
    "prow.": "Provençal",
    "przech.": "transitive",
    "przecz.": "negation",
    "przedr.": "prefix",
    "przen.": "metaphoric",
    "przest.": "obsolete",
    "przesz.": "past",
    "przyim.": "prepositional",
    "przym.": "adjective",
    "przyp.": "subjunctive",
    "M.": "nominative",
    "Nom.": "nominative",
    "D.": "genitive",
    "Gen.": "genitive",
    "C.": "dative",
    "Dat.": "dative",
    "B.": "accusative",
    "Akk.": "accusative",
    "N.": "instrumental",
    "Ms.": "locative",
    "W.": "vocative",
    "adess.": "adessive",
    "all.": "allative",
    "ess.": "essive",
    "part.": "partitive",
    "przyr.": "suffix",
    "przysł.": "adverb",
    "przysz.": "future",
    "psych.": "psychology",
    "pszcz.": "beekeeping",
    "p. uwsp.": "modern",  # "(p. uwsp.)" from template "uwsp"
    "polinez.": "Polynesian",
    "qu.": "Quechua",
    "quen.": "Quenya",
    "rzym.": "Roman",
    "słow.": "Slavic",
    "sumer.": "Sumerian",
    "rodz.": "gendered-article",
    "rodz. nieokr.": ["indefinite", "article"],
    "rodz. okr.": ["definite", "article"],
    "ros.": "Russian",
    "rozk.": "imperative",
    "bryt. (RP)": ["British", "Received-Pronunciation"],
    "rub.": "broadly",
    "rum.": "Romanian",
    "run.": "Kirundi",
    "rzad.": "rare",
    "rzecz.": "noun",
    "sanskr.": "Sanskrit",
    "serb.": "Serbian",
    "skr.": "abbreviation",
    "slang.": "slang",
    "słc.": "Slovak",
    "słń.": "Slovene",
    # "słowiń." abbreviates "słowiński" (Slovincian); Slovene is "słń." above.
    "słowiń.": "Slovincian",
    "somal.": "Somali",
    "sp.": "conjunction",
    "st.ang.": "Old-English",
    "staroż.": "Ancient",
    "st.cons.": "construct",
    "st.czes.": "Old-Czech",
    "st.duń.": "Old-Danish",
    "st.egip.": "Ancient-Egyptian",
    "st.franc.": "Old-French",
    "st.fryz.": "Old-Frisian",
    "st.gr.": "Ancient-Greek",
    "st.ind.": "Ancient-Indian",
    "st.irl.": "Old-Irish",
    "st.łac.": "Old-Latin",
    "st.nord.": "Old-Norse",
    "st.pers.": "Old-Persian",
    "st.pol.": "Old-Polish",
    "st.poł.": "Old-Slavic",
    "st.prus.": "Old-Prussian",
    "strbr": "passive",
    "strcz": "active",
    "strzwr": "middle",
    "st.rus.": "Old-Russian",
    "st.saks.": "Old-Saxon",
    "st.szw.": "Old-Swedish",
    "st.turk.": "Old-Turkish",
    "sus.": "Susu",
    "sw.": "Swahili",
    "swn.": "Old-High-German",
    "symbol.": "symbol",
    "syn.": "synonym",
    "szw.": "Swedish",
    "szwajc. franc.": ["French", "Switzerland"],
    "szwajc. niem.": ["German", "Switzerland"],
    "szwajc. wł.": ["Italian", "Switzerland"],
    "szwb.": "German",
    "śdn.": "Middle-Low-German",
    "śl.": "Silesian",
    "średnioang.": "Middle-English",
    "średniofranc.": "Middle-French",
    "śr.gr.": "Medieval-Greek",
    "śr.łac.": "Medieval-Latin",
    "śr.niderl.": "Middle-Dutch",
    "śr.pol.": "Middle-Polish",
    "śwn.": "Middle-High-German",
    "t.": "also",
    "taj.": "Thai",
    "tamil.": "Tamil",
    "tatar.": "Tatar",
    "tem. słow.": "word-forming",
    "ter.": "present",
    # Hyphen-joined like every other multi-word tag in this table.
    "tim. port.": ["Portuguese", "East-Timor"],
    "tłum.": "translation",
    "trad.": "Traditional",
    "tur.": "Turkish",
    "turkm.": "Turkmen",
    "tuw.": "Tuvan",
    "tyb.": "Tibetan",
    "tzm.": "Tamazight",
    "UK": "UK",
    "ukr.": "Ukrainian",
    "uproszcz.": "Simplified",
    "urd.": "Urdu",
    "urz.": "formal",
    "US": "US",
    "(p. uwsp.)": "modern-spelling",
    "uzb.": "Uzbek",
    "vo.": "Volapük",
    "w": "common",
    "wal.": "Welsh",
    "war.": "variant",
    "warsz.": ["Warsaw", "dialectal"],
    "wed.": "Vedic",
    "wenec.": "Venetian",
    "węg.": "Hungarian",
    "wiet.": "Vietnamese",
    "wilam.": "Vilamovian",
    "wł.": "Italian",
    "wsch.": ["Eastern", "dialectal"],
    "współcz.": "contemporary",
    "wulg.": "vulgar",
    "wych. z uż.": "archaic",
    "wykrz.": "interjection",
    "wyr. przyim.": ["prepositional", "phrase"],
    "zach.": ["Western", "dialectal"],
    "zaim.": "pronoun",
    "zaw.": "professional",
    "zaz.": "Zazaki",
    "zdrobn.": "diminutive",
    "zgrub.": "augmentative",
    "zw rz": "regimen",
    "zw zg": "concord",
    "zwł.": "especially",
    "ż": "feminine",
    "żart.": "humorous",
    "żmd.": "Samogitian",
    "żw": "animate",
    "żyd.": "Jewish",
    # Category:Acronym templates - grammar
    # https://pl.wiktionary.org/wiki/Kategoria:Szablony_skrótów_-_gramatyka
    # gender types in POS line
    "męski": "masculine",
    "męskozwierzęcy": ["masculine", "animate"],
    "męskorzeczowy": ["masculine", "inanimate"],
    "niepoliczalny": "uncountable",
    "nieżywotny": "inanimate",
    "nijaki": "neuter",
    "policzalny": "countable",
    "przechodni": "transitive",
    "żeński": "feminine",
    "żywotny": "animate",
    "dzierżawczy": "possessive",
    "niedokonany": "imperfective",
    "relacyjny": "relational",
    # "odmiana-rzeczownik-polski" template
    "liczba pojedyncza": "singular",
    "liczba mnoga": "plural",
    "mianownik": "nominative",
    "dopełniacz": "genitive",
    "celownik": "dative",
    "biernik": "accusative",
    "narzędnik": "instrumental",
    "miejscownik": "locative",
    "wołacz": "vocative",
    # "odmiana-przymiotnik-polski" template
    "mos/mzw": ["masculine", "animate"],
    "mos": "masculine",
    "nmos": "nonvirile",
    "stopień wyższy": "comparative",
    "stopień najwyższy": "superlative",
    # "odmiana-czasownik-polski" template
    "1. os.": "first-person",
    "2. os.": "second-person",
    "3. os.": "third-person",
    "bezokolicznik": "infinitive",
    "czas teraźniejszy": "present",
    "czas przeszły": "past",
    "tryb rozkazujący": "imperative",
    "czas przyszły": "future",
    "czas zaprzeszły": "pluperfect",
    "forma bezosobowa": "impersonal",
    "czasu przeszłego": "past",
    "tryb przypuszczający": "conditional",
    "imiesłów przymiotnikowy czynny": ["active", "participle"],
    "imiesłów przysłówkowy współczesny": [
        "contemporary",
        "adverbial",
        "participle",
    ],
    # "odmiana-rzeczownik-esperanto" template
    "ununombro": "singular",
    "multenombro": "plural",
    "nominativo": "nominative",
    "akuzativo": "accusative",
    "multenombro (virtuala)": ["plural", "virtual"],
    # pos line
    "nieprzechodni": "intransitive",
}
# Lookup table from Polish Wiktionary subject-field abbreviations to the
# English topic name stored on the extracted data.
# Every value is a single string; multi-word topics are joined with hyphens.
# Entries that are commented out with an empty value have no translation yet.
TOPICS = {
    "adm.": "administration",
    "agrot.": "agrotechnology",
    "alch.": "alchemy",
    "anat.": "anatomy",
    "antrop.": "anthropology",
    "arachn.": "arachnology",
    "archit.": "architecture",
    "archeol.": "archeology",
    "astr.": "astronomy",
    "astrol.": "astrology",
    "astronaut.": "astronautics",
    "bank.": "banking",
    # "bibliot.": "",
    "biochem.": "biochemistry",
    "biol.": "biology",
    # "biur.": "",
    "bot.": "botany",
    "bud.": "construction",
    "ceram.": "ceramics",
    "chem.": "chemistry",
    "choreogr.": "choreography",
    "cukiernictwo.": "confectionery",
    "cybern.": "cybernetics",
    # "daw.": "",
    "demogr.": "demography",
    "dendr.": "dendrology",
    "drewn.": "woodworking",
    "druk.": "printing",
    "dypl.": "diplomacy",
    "eduk.": "education",
    "ekol.": "ecology",
    "ekon.": "economics",
    "elektr.": "electricity",
    "elektron.": "electronics",
    "enol.": "oenology",
    "ent.": "entomology",
    "etn.": "ethnography",
    "etym.": "etymology",
    "fant.": "speculative-fiction",
    "farm.": "pharmacology",
    "felinol.": "felinology",
    "filatel.": "philately",
    "film.": "film",
    "filoz.": "philosophy",
    "finans.": "finance",
    "fitopatol.": "phytopathology",
    "fiz.": "physics",
    "fizj.": "physiology",
    "flis.": "timber-rafting",
    "folk.": "folklore",
    "fonet.": "phonetics",
    "form. słow.": "word-forming",
    "fot.": "photography",
    "fryzj.": "hairdressing",
    "garb.": "tanning",
    "gastr.": "gastronomy",
    "genet.": "genetics",
    "geod.": "geodesy",
    "geofiz.": "geophysics",
    "geogr.": "geography",
    "geol.": "geology",
    "geom.": "geometry",
    "gend. st.": "gender-studies",
    "ginek.": "gynaecology",
    "górn.": "mining",
    "gram.": "grammar",
    # Hyphen-joined like every other multi-word topic in this table.
    "gry komp.": "computer-games",
    "hand.": "trade",
    "harc.": "scouting",
    "herald.": "heraldry",
    "herp.": "herpetology",
    "hig.": "hygienic",
    "hipol.": "hippology",
    "hist.": "history",
    "hotel.": "hotel-industry",
    "hutn.": "metallurgy",
    "hydraul.": "hydraulics",
    "hydrol.": "hydrology",
    "icht.": "ichthyology",
    "ikonogr.": "iconography",
    "inform.": "computer-science",
    "jedn. miar.": "units-of-measure",
    "jedn. monet.": "units-of-monetary",
    "jeźdz.": "equestrianism",
    "jęz.": "linguistics",
    "jubil.": "jewelry",
    "kartogr.": "cartography",
    "kolej.": "railways",
    "konserwat.": "conservation",
    "kosmet.": "cosmetics",
    "kośc.": "ecclesiastical",
    "kraw.": "tailoring",
    "krym.": "criminology",
    "kryptogr.": "cryptography",
    "krystal.": "crystallography",
    "księg.": "accounting",
    "kulin.": "culinary",
    "kult.": "cultural-studies",
    "kynol.": "cynology",
    "leśn.": "forestry",
    "liter.": "literature",
    "log.": "logic",
    "lotn.": "aviation",
    "łow.": "hunting",
    "mar.": "nautical",
    "mat.": "mathematics",
    "mebl.": "furniture",
    "mech.": "mechanics",
    "med.": "medicine",
    "met.": "metallurgy",
    "meteorol.": "meteorology",
    "metrol.": "metrology",
    "mikol.": "mycology",
    "mikrobiol.": "microbiology",
    "miner.": "mineralogy",
    "mit.": "mythology",
    "młyn.": "milling",
    "monet.": "monetary-unit",
    "mors.": "maritime",
    "mot.": "automotive",
    "muz.": "musicology",
    "myśl.": "hunting",
    "nauk.": "sciences",
    "nawig.": "navigation",
    "numizm.": "numismatics",
    "obuw.": "footwear",
    "oceanogr.": "oceanography",
    "odl.": "foundry",
    "odzież.": "clothing-industry",
    "opt.": "optics",
    "ornit.": "ornithology",
    "paleoantrop.": "paleoanthropology",
    "paleont.": "paleontology",
    "papier.": "papermaking",
    "pedag.": "pedagogy",
    "poczt.": "mail",
    "poligr.": "printing",
    "polit.": "political-science",
    "praw.": "law",
    "przestęp.": "criminal",
    "rad.": "radio",
    "reg.": "region",
    "rel.": "religion",
    "ręk.": "handicrafts",
    "roln.": "agriculture",
    "ryb.": "fishing",
    "rzem.": "crafts",
    "seks.": "sexology",
    "s.f.": "science-fiction",
    "socjol.": "sociology",
    "speleol.": "speleology",
    "społ.": "social",
    "sport.": "sports",
    "spoż.": "food",
    "stat.": "statistics",
    "stomat.": "stomatology",
    "szach.": "chess",
    "szt.": "art",
    "taur.": "bullfighting",
    "teatr.": "theater",
    "techn.": "technology",
    "telegr.": "telegraphy",
    "telekom.": "telecommunications",
    "telew.": "television",
    "teol.": "theology",
    "toksykol.": "toxicology",
    "topogr.": "topography",
    "transp.": "transport",
    "turyst.": "tourism",
    "typogr.": "typography",
    "urb.": "urbanism",
    "wet.": "veterinary",
    "wędk.": "fishing",
    "więz.": "prison",
    "wiośl.": "rowing",
    "włók.": "textiles",
    "wojsk.": "military",
    "zarz.": "management",
    "zeg.": "horology",
    "zool.": "zoology",
    "żegl.": "sailing",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Translate the labels in ``data.raw_tags`` and keep only the unknown ones.

    Each label is first passed to ``check_tag`` as a whole. If that fails,
    the label is split on whitespace and every word is passed to
    ``check_tag`` separately. A label is kept in ``data.raw_tags`` only when
    neither the whole label nor any of its words was recognised; a label with
    at least one recognised word is dropped.

    ``data.raw_tags`` is replaced with the list of labels that were kept.
    """
    unresolved = []
    for raw_tag in data.raw_tags:
        if check_tag(data, raw_tag):
            continue
        # Build the full list instead of short-circuiting so that every
        # word is offered to check_tag, not just the first one that matches.
        word_hits = [check_tag(data, word) for word in raw_tag.split()]
        if not any(word_hits):
            unresolved.append(raw_tag)
    data.raw_tags = unresolved
def check_tag(data: WordEntry, raw_tag: str) -> bool:
    """Record the translation of ``raw_tag`` on ``data`` if it is known.

    ``raw_tag`` is looked up in ``TAGS`` first (used when ``data`` has a
    ``tags`` attribute) and otherwise in ``TOPICS`` (used when ``data`` has a
    ``topics`` attribute). The translated value(s) are appended to the
    matching list, skipping any value that is already present, so repeated
    or overlapping labels never produce duplicate tags or topics.

    Returns:
        ``True`` if ``raw_tag`` was found in one of the tables and ``data``
        has the corresponding attribute, ``False`` otherwise.
    """
    if raw_tag in TAGS and hasattr(data, "tags"):
        target, translated = data.tags, TAGS[raw_tag]
    elif raw_tag in TOPICS and hasattr(data, "topics"):
        target, translated = data.topics, TOPICS[raw_tag]
    else:
        return False

    # Table values are either one string or a list of strings; handle both
    # through the same de-duplicating append.
    if isinstance(translated, str):
        translated = [translated]
    for value in translated:
        if value not in target:
            target.append(value)
    return True