Coverage for src/wiktextract/extractor/pl/tags.py: 86%

1from .models import WordEntry

3# Help:Abbreviations used in Wiktionary

4# https://pl.wiktionary.org/wiki/Pomoc:Skróty_używane_w_Wikisłowniku

5# Category:Shortcut templates

6# https://pl.wiktionary.org/wiki/Kategoria:Szablony_skrótów

# Lookup table used by check_tag(): maps a raw label found on Polish
# Wiktionary (abbreviation templates, grammar terms, inflection-table
# headers) to the tag(s) appended to an entry's `tags` list.
# Each value is either a single tag string or a list of tag strings.
# Keys are matched exactly (case- and punctuation-sensitive).
TAGS = {
    "abl.": "ablative",
    # "akust.": "",
    "amer.": "US",
    "aor.": "aorist",
    "arab.": "Arabic",
    "bałt.": "Baltic",
    "bask.": "Basque",
    "bezok.": "infinitive",
    "bezosob.": "impersonal",
    "bibl.": "Biblical",
    "blm": "no-plural",
    "blp": "no-singulative",
    "Bm": "Bokmål",
    "bośn.": "Bosnian",
    "brytań.": "British",
    "bułg.": "Bulgarian",
    "bwr.": "Bavarian",
    "celt.": "Celtic",
    "chiń.": "Chinese",
    "chorw.": "Croatian",
    "cs.": "Church-Slavonic",
    "czes.": "Czech",
    "depr.": "depreciative",
    "dial.": "dialectal",
    "dk": "perfective",
    "dosł.": "literally",
    "du": "dual",
    "dysfem.": "dysphemism",
    "dysk.": "discourse",
    "dźwięk.": "onomatopoeic",
    "egip.": "Egyptian",
    "ekspr.": "expressively",
    "el.": "Greek",
    "erud.": "eruditely",
    "eufem.": "euphemistic",
    "ew.": "alternative",
    "ezot.": "esoteric",
    "franc.": "French",
    "galic.": "Galician",
    "germ.": "Germanic",
    "gr.": "Ancient-Greek",
    "grec.": "Ancient-Greek",
    "grub.": "offensive",
    "grzecz.": "polite",
    "gw.": "dialectal",
    "hebr.": "Hebrew",
    "hin.": "Hindi",
    "hiszp.": "Spanish",
    "honor.": "honorific",
    "ims.": "participle",
    "ind.": "India",
    "infant.": "childish",
    "irl.": "Irish",
    "iron.": "ironic",
    "iterat.": "iterative",
    "jap.": "Japanese",
    "kanad.": "Canadian-English",
    "kanad. franc.": "Canadian-French",
    "kant.": "Cantonese",
    "katal.": "Catalan",
    "kathar.": "Katharevousa",
    "kaz.": "Kazakh",
    "kor.": "Korean",
    "kor. płd.": "South-Korean",
    "kor. płn.": "North-Korean",
    "korn.": "Cornish",
    "książk.": "literary",
    "lekcew.": "pejorative",
    "lewant. arab.": "Levantine-Arabic",
    "libij. arab.": "Libyan-Arabic",
    "licz.": "numeral",
    "licz. gł.": "cardinal",
    "licz. porz.": "ordinal",
    "litew.": "Lithuanian",
    "lm": "plural",
    "lm m": ["plural", "masculine"],
    "lm nm": ["plural", "nonvirile"],
    "lp": "singular",
    "lud.": "vernacular",
    "lwow.": ["Lviv", "dialectal"],
    "łac.": "Latin",
    "łac.kośc.": ["Ecclesiastical", "Latin"],
    "łot.": "Latvian",
    "m": "masculine",
    "mac.": "Macedonian",
    "malaj.": "Malay",
    "marok.": "Moroccan",
    "międzyr.": "interfix",
    "młodz.": "youth",
    "mong.": "Mongolian",
    "mong. klas.": "Classical-Mongolian",
    "moz.": "Mozambique",
    "m.-os.": ["masculine", "personal"],
    "mrz": ["masculine", "inanimate"],
    "mzw": ["masculine", "animate"],
    "n": "neuter",
    "nah": "Nahuatl",
    "nbk.": "Bokmål",
    "ndk": "imperfective",
    "neol.": "neologism",
    "neutr.": "neutral",
    "n.gr.": "Modern-Greek",
    "niderl.": "Dutch",
    "nieofic.": "unofficially",
    "niem.": "German",
    "niem. RFN": "Standard-German",
    "nieodm.": "uninflected",
    "nieos.": "impersonal",
    "niepopr.": "incorrectly",
    "n.łac.": "Neo-Latin",
    "nm.-os.": "nonvirile",
    "nn": "Nynorsk",
    "norw.": "Norwegian",
    "nowozel": "New-Zealand",
    "nprzech.": "intransitive",
    "nwh.": "Navajo",
    "nżw": "inanimate",
    "nord.": "Nordic",
    "obraź.": "offensive",
    "odczas.": "verbal",
    "odm.": "inflected",
    "odprzym.": "deadjectival",
    "odrzecz.": "substantival",
    "ofic.": "officially",
    "ogsłow.": "Common-Slavic",
    "określ.": "determiner",
    "os.": "person",
    "oset.": "Ossetian",
    "osm.": "Ottoman",
    "oznajm.": "indicative",
    "partyk.": "particle",
    "paszto": "Pashto",
    "Prt.": "partitive",
    "pejor.": "pejorative",
    "pers.": "Persian",
    "peryfr.": "periphrastic",
    "p.gr": "Late-Greek",
    "pieszcz.": "endearing",
    "p.łac.": "Late-Latin",
    "płdbraz.": "Brazil",
    "płnlap.": "Northern-Sámi",
    "podn.": "elevatedly",
    "poet.": "poetic",
    "pogard.": "scornfully",
    "pol.": "Polish",
    "poł.": "Polabian",
    "port.": "Portuguese",
    "posp.": "commonly",
    "postp.": "postpositional",
    "pot.": "colloquial",
    "pozn.": ["Poznań", "regional"],
    "pragerm.": "Proto-Germanic",
    "praindoeur.": "Proto-Indo-European",
    "pranord.": "Proto-Norse",
    "prasł.": "Proto-Slavic",
    "praturk.": "Proto-Turkic",
    "prawdop.": "presumably",
    "prow.": "Provençal",
    "przech.": "transitive",
    "przecz.": "negation",
    "przedr.": "prefix",
    "przen.": "metaphoric",
    "przest.": "dated",
    "przesz.": "past",
    "przyim.": "prepositional",
    "przym.": "adjective",
    "przyp.": "subjunctive",
    "M.": "nominative",
    "Nom.": "nominative",
    "D.": "genitive",
    "Gen.": "genitive",
    "C.": "dative",
    "Dat.": "dative",
    "B.": "accusative",
    "Akk.": "accusative",
    "N.": "instrumental",
    "Ms.": "locative",
    "W.": "vocative",
    "adess.": "adessive",
    "all.": "allative",
    "ess.": "essive",
    "part.": "partitive",
    "przyr.": "suffix",
    "przysł.": "adverb",
    "przysz.": "future",
    "psych.": "psychology",
    "pszcz.": "beekeeping",
    "p. uwsp.": "modern",  # "(p. uwsp.)" from template "uwsp"
    "polinez.": "Polynesian",
    "qu.": "Quechua",
    "quen.": "Quenya",
    "rzym.": "Roman",
    "słow.": "Slavic",
    "sumer.": "Sumerian",
    "rodz.": "gendered-article",
    "rodz. nieokr.": ["indefinite", "article"],
    "rodz. okr.": ["definite", "article"],
    "ros.": "Russian",
    "rozk.": "imperative",
    "bryt. (RP)": ["British", "Received-Pronunciation"],
    "rub.": "broadly",
    "rum.": "Romanian",
    "run.": "Kirundi",
    "rzad.": "rare",
    "rzecz.": "noun",
    "sanskr.": "Sanskrit",
    "serb.": "Serbian",
    "skr.": "abbreviation",
    "slang.": "slang",
    "słc.": "Slovak",
    "słń.": "Slovene",
    # NOTE(review): "słń." and "słowiń." both map to "Slovene"; "słowiń."
    # may instead abbreviate "słowiński" (Slovincian) - confirm against the
    # abbreviation page linked at the top of this file.
    "słowiń.": "Slovene",
    "somal.": "Somali",
    "sp.": "conjunction",
    "st.ang.": "Old-English",
    "staroż.": "Ancient",
    "st.cons.": "construct",
    "st.czes.": "Old-Czech",
    "st.duń.": "Old-Danish",
    "st.egip.": "Ancient-Egyptian",
    "st.franc.": "Old-French",
    "st.fryz.": "Old-Frisian",
    "st.gr.": "Ancient-Greek",
    "st.ind.": "Ancient-Indian",
    "st.irl.": "Old-Irish",
    "st.łac.": "Old-Latin",
    "st.nord.": "Old-Norse",
    "st.pers.": "Old-Persian",
    "st.pol.": "Old-Polish",
    "st.poł.": "Old-Slavic",
    "st.prus.": "Old-Prussian",
    "strbr": "passive",
    "strcz": "active",
    "strzwr": "middle",
    "st.rus.": "Old-Russian",
    "st.saks.": "Old-Saxon",
    "st.szw.": "Old-Swedish",
    "st.turk.": "Old-Turkish",
    "sus.": "Susu",
    "sw.": "Swahili",
    "swn.": "Old-High-German",
    "symbol.": "symbol",
    "syn.": "synonym",
    "szw.": "Swedish",
    "szwajc. franc.": ["French", "Switzerland"],
    "szwajc. niem.": ["German", "Switzerland"],
    "szwajc. wł.": ["Italian", "Switzerland"],
    "szwb.": "German",
    "śdn.": "Middle-Low-German",
    "śl.": "Silesian",
    "średnioang.": "Middle-English",
    "średniofranc.": "Middle-French",
    "śr.gr.": "Medieval-Greek",
    "śr.łac.": "Medieval-Latin",
    "śr.niderl.": "Middle-Dutch",
    "śr.pol.": "Middle-Polish",
    "śwn.": "Middle-High-German",
    "t.": "also",
    "taj.": "Thai",
    "tamil.": "Tamil",
    "tatar.": "Tatar",
    "tem. słow.": "word-forming",
    "ter.": "present",
    # NOTE(review): "East Timor" contains a space, unlike the hyphenated
    # multi-word values elsewhere in this table - confirm this is intended.
    "tim. port.": ["Portuguese", "East Timor"],
    "tłum.": "translation",
    "trad.": "Traditional",
    "tur.": "Turkish",
    "turkm.": "Turkmen",
    "tuw.": "Tuvan",
    "tyb.": "Tibetan",
    "tzm.": "Tamazight",
    "UK": "UK",
    "ukr.": "Ukrainian",
    "uproszcz.": "Simplified",
    "urd.": "Urdu",
    "urz.": "formal",
    "US": "US",
    # NOTE(review): the bare key "p. uwsp." earlier in this table maps to
    # "modern" while this parenthesised form maps to "modern-spelling" -
    # confirm whether the two are meant to differ.
    "(p. uwsp.)": "modern-spelling",
    "uzb.": "Uzbek",
    "vo.": "Volapük",
    "w": "common",
    "wal.": "Welsh",
    "war.": "variant",
    "warsz.": ["Warsaw", "dialectal"],
    "wed.": "Vedic",
    "wenec.": "Venetian",
    "węg.": "Hungarian",
    "wiet.": "Vietnamese",
    "wilam.": "Vilamovian",
    "wł.": "Italian",
    "wsch.": ["Eastern", "dialectal"],
    "współcz.": "contemporary",
    "wulg.": "vulgar",
    "wych. z uż.": "archaic",
    "wykrz.": "interjection",
    "wyr. przyim.": ["prepositional", "phrase"],
    "zach.": ["Western", "dialectal"],
    "zaim.": "pronoun",
    "zaw.": "professional",
    "zaz.": "Zazaki",
    "zdrobn.": "diminutive",
    "zgrub.": "augmentative",
    "zw rz": "regimen",
    "zw zg": "concord",
    "zwł.": "especially",
    "ż": "feminine",
    "żart.": "humorous",
    "żmd.": "Samogitian",
    "żw": "animate",
    "żyd.": "Jewish",
    # Category:Acronym templates - grammar
    # https://pl.wiktionary.org/wiki/Kategoria:Szablony_skrótów_-_gramatyka
    # gender types in POS line
    "męski": "masculine",
    "męskozwierzęcy": ["masculine", "animate"],
    "męskorzeczowy": ["masculine", "inanimate"],
    "niepoliczalny": "uncountable",
    "nieżywotny": "inanimate",
    "nijaki": "neuter",
    "policzalny": "countable",
    "przechodni": "transitive",
    "żeński": "feminine",
    "żywotny": "animate",
    "dzierżawczy": "possessive",
    "niedokonany": "imperfective",
    "dokonany": "perfective",
    "relacyjny": "relational",
    # "odmiana-rzeczownik-polski" template
    "liczba pojedyncza": "singular",
    "liczba mnoga": "plural",
    "mianownik": "nominative",
    "dopełniacz": "genitive",
    "celownik": "dative",
    "biernik": "accusative",
    "narzędnik": "instrumental",
    "miejscownik": "locative",
    "wołacz": "vocative",
    # "odmiana-przymiotnik-polski" template
    "mos/mzw": ["masculine", "animate"],
    "mos": "masculine",
    "nmos": "nonvirile",
    "stopień wyższy": "comparative",
    "stopień najwyższy": "superlative",
    # "odmiana-czasownik-polski" template
    "1. os.": "first-person",
    "2. os.": "second-person",
    "3. os.": "third-person",
    "bezokolicznik": "infinitive",
    "czas teraźniejszy": "present",
    "czas przeszły": "past",
    "tryb rozkazujący": "imperative",
    "czas przyszły": "future",
    "czas przyszły prosty": "future",
    "czas zaprzeszły": "pluperfect",
    "forma bezosobowa": "impersonal",
    "czasu przeszłego": "past",
    "tryb przypuszczający": "conditional",
    "imiesłów przymiotnikowy czynny": ["active", "participle"],
    "imiesłów przymiotnikowy bierny": ["passive", "participle"],
    "imiesłów przysłówkowy współczesny": [
        "contemporary",
        "adverbial",
        "participle",
    ],
    "imiesłów przymiotnikowy przeszły": ["past", "participle"],
    "imiesłów przysłówkowy uprzedni": ["anterior", "adverbial", "participle"],
    "rzeczownik odczasownikowy": "gerund",
    # "odmiana-rzeczownik-esperanto" template
    "ununombro": "singular",
    "multenombro": "plural",
    "nominativo": "nominative",
    "akuzativo": "accusative",
    "multenombro (virtuala)": ["plural", "virtual"],
    # pos line
    "nieprzechodni": "intransitive",
    "czas.": "verb",
    "ndk.": "imperfective",
    "dk.": "perfective",
    "wspólny": "common",
    "męskoosobowy": "masculine",
    "daw.": "obsolete",
    "zwrotny": "reflexive",
    "czasownikowa": "verb",
    "nieprzechodnia": "intransitive",
    "słaby": "weak",
    "bryt.": "British-English",
    "niemęskoosobowy": "nonvirile",
    "nazwa własna": "proper-noun",
    "jakościowy": "qualitative",
    "policzalna": "countable",
    "mocny": "strong",
    "temat": "stem",
    "niedokonana": "imperfective",
    "transkrypcja w systemie Hepburna": "Hepburn-romanization",
}

403

# Lookup table used by check_tag(): maps a raw label to the topic name
# appended to an entry's `topics` list. It is consulted only when the label
# is not handled through TAGS. Keys are matched exactly.
# Entries that are commented out with an empty value have no topic assigned.
TOPICS = {
    "adm.": "administration",
    "agrot.": "agrotechnology",
    "alch.": "alchemy",
    "anat.": "anatomy",
    "antrop.": "anthropology",
    "arachn.": "arachnology",
    "archit.": "architecture",
    "archeol.": "archeology",
    "astr.": "astronomy",
    "astrol.": "astrology",
    "astronaut.": "astronautics",
    "bank.": "banking",
    # "bibliot.": "",
    "biochem.": "biochemistry",
    "biol.": "biology",
    # "biur.": "",
    "bot.": "botany",
    "bud.": "construction",
    "ceram.": "ceramics",
    "chem.": "chemistry",
    "choreogr.": "choreography",
    "cukiernictwo.": "confectionery",
    "cybern.": "cybernetics",
    # "daw.": "",
    "demogr.": "demography",
    "dendr.": "dendrology",
    "drewn.": "woodworking",
    "druk.": "printing",
    "dypl.": "diplomacy",
    "eduk.": "education",
    "ekol.": "ecology",
    "ekon.": "economics",
    "elektr.": "electricity",
    "elektron.": "electronics",
    "enol.": "oenology",
    "ent.": "entomology",
    "etn.": "ethnography",
    "etym.": "etymology",
    "fant.": "speculative-fiction",
    "farm.": "pharmacology",
    "felinol.": "felinology",
    "filatel.": "philately",
    "film.": "film",
    "filoz.": "philosophy",
    "finans.": "finance",
    "fitopatol.": "phytopathology",
    "fiz.": "physics",
    "fizj.": "physiology",
    "flis.": "timber-rafting",
    "folk.": "folklore",
    "fonet.": "phonetics",
    "form. słow.": "word-forming",
    "fot.": "photography",
    "fryzj.": "hairdressing",
    "garb.": "tanning",
    "gastr.": "gastronomy",
    "genet.": "genetics",
    "geod.": "geodesy",
    "geofiz.": "geophysics",
    "geogr.": "geography",
    "geol.": "geology",
    "geom.": "geometry",
    "gend. st.": "gender-studies",
    "ginek.": "gynaecology",
    "górn.": "mining",
    "gram.": "grammar",
    # NOTE(review): "computer games" contains a space, unlike the hyphenated
    # multi-word values elsewhere in this table - confirm this is intended.
    "gry komp.": "computer games",
    "hand.": "trade",
    "harc.": "scouting",
    "herald.": "heraldry",
    "herp.": "herpetology",
    "hig.": "hygienic",
    "hipol.": "hippology",
    "hist.": "history",
    "hotel.": "hotel-industry",
    "hutn.": "metallurgy",
    "hydraul.": "hydraulics",
    "hydrol.": "hydrology",
    "icht.": "ichthyology",
    "ikonogr.": "iconography",
    "inform.": "computer-science",
    "jedn. miar.": "units-of-measure",
    "jedn. monet.": "units-of-monetary",
    "jeźdz.": "equestrianism",
    "jęz.": "linguistics",
    "jubil.": "jewelry",
    "kartogr.": "cartography",
    "kolej.": "railways",
    "konserwat.": "conservation",
    "kosmet.": "cosmetics",
    "kośc.": "ecclesiastical",
    "kraw.": "tailoring",
    "krym.": "criminology",
    "kryptogr.": "cryptography",
    "krystal.": "crystallography",
    "księg.": "accounting",
    "kulin.": "culinary",
    "kult.": "cultural-studies",
    "kynol.": "cynology",
    "leśn.": "forestry",
    "liter.": "literature",
    "log.": "logic",
    "lotn.": "aviation",
    "łow.": "hunting",
    "mar.": "nautical",
    "mat.": "mathematics",
    "mebl.": "furniture",
    "mech.": "mechanics",
    "med.": "medicine",
    "met.": "metallurgy",
    "meteorol.": "meteorology",
    "metrol.": "metrology",
    "mikol.": "mycology",
    "mikrobiol.": "microbiology",
    "miner.": "mineralogy",
    "mit.": "mythology",
    "młyn.": "milling",
    "monet.": "monetary-unit",
    "mors.": "maritime",
    "mot.": "automotive",
    "muz.": "musicology",
    "myśl.": "hunting",
    "nauk.": "sciences",
    "nawig.": "navigation",
    "numizm.": "numismatics",
    "obuw.": "footwear",
    "oceanogr.": "oceanography",
    "odl.": "foundry",
    "odzież.": "clothing-industry",
    "opt.": "optics",
    "ornit.": "ornithology",
    "paleoantrop.": "paleoanthropology",
    "paleont.": "paleontology",
    "papier.": "papermaking",
    "pedag.": "pedagogy",
    "poczt.": "mail",
    "poligr.": "printing",
    "polit.": "political-science",
    "praw.": "law",
    "przestęp.": "criminal",
    "rad.": "radio",
    "reg.": "region",
    "rel.": "religion",
    "ręk.": "handicrafts",
    "roln.": "agriculture",
    "ryb.": "fishing",
    "rzem.": "crafts",
    "seks.": "sexology",
    "s.f.": "science-fiction",
    "socjol.": "sociology",
    "speleol.": "speleology",
    "społ.": "social",
    "sport.": "sports",
    "spoż.": "food",
    "stat.": "statistics",
    "stomat.": "stomatology",
    "szach.": "chess",
    "szt.": "art",
    "taur.": "bullfighting",
    "teatr.": "theater",
    "techn.": "technology",
    "telegr.": "telegraphy",
    "telekom.": "telecommunications",
    "telew.": "television",
    "teol.": "theology",
    "toksykol.": "toxicology",
    "topogr.": "topography",
    "transp.": "transport",
    "turyst.": "tourism",
    "typogr.": "typography",
    "urb.": "urbanism",
    "wet.": "veterinary",
    "wędk.": "fishing",
    "więz.": "prison",
    "wiośl.": "rowing",
    "włók.": "textiles",
    "wojsk.": "military",
    "zarz.": "management",
    "zeg.": "horology",
    "zool.": "zoology",
    "żegl.": "sailing",
    "ogrod.": "horticulture",
}

588

589

def translate_raw_tags(data: WordEntry) -> None:
    """Move every recognisable raw tag of ``data`` into its tags/topics.

    Each entry of ``data.raw_tags`` is first offered to ``check_tag`` as a
    whole.  If that fails, the entry is split on whitespace and every piece
    is offered to ``check_tag`` individually.  An entry stays in
    ``data.raw_tags`` only when neither the whole string nor any of its
    pieces was recognised; otherwise it is dropped.

    Args:
        data: Entry whose ``raw_tags`` list is rewritten in place (the
            attribute is rebound to a new list).
    """
    unresolved = []
    for label in data.raw_tags:
        if check_tag(data, label):
            continue
        # Build a list (not a generator) on purpose: every word must be
        # passed to check_tag so all recognised pieces are recorded, whereas
        # any() over a generator would stop at the first hit.
        word_hits = [check_tag(data, word) for word in label.split()]
        if not any(word_hits):
            unresolved.append(label)
    data.raw_tags = unresolved

601

602

def check_tag(data: WordEntry, raw_tag: str) -> bool:
    """Translate one raw label into tag(s) or topic(s) on ``data``.

    ``TAGS`` is consulted first; ``TOPICS`` is used only when the label is
    not handled through ``TAGS``.  A table is skipped when ``data`` lacks
    the corresponding attribute (``tags`` / ``topics``).

    Values already present in the target list are not added again.  This
    applies to topics as well as tags, so a label that occurs more than once
    no longer produces duplicate topics.

    Args:
        data: Entry whose ``tags`` or ``topics`` list is extended in place.
        raw_tag: Label to look up; matched exactly against the table keys.

    Returns:
        ``True`` if the label was found in a usable table (even when nothing
        new had to be added), ``False`` otherwise.
    """
    if raw_tag in TAGS and hasattr(data, "tags"):
        values, target = TAGS[raw_tag], data.tags
    elif raw_tag in TOPICS and hasattr(data, "topics"):
        values, target = TOPICS[raw_tag], data.topics
    else:
        return False

    # Table values are either one string or a list of strings; normalise so
    # a single loop handles both shapes.
    if isinstance(values, str):
        values = [values]
    for value in values:
        if value not in target:
            target.append(value)
    return True