Coverage for src / wiktextract / extractor / ms / tags.py: 64%

28 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-02 00:27 +0000

1from .models import WordEntry 

2 

# Gender / number / aspect abbreviations mapped to English tags.
# A value is either one tag (str) or several tags (list of str).
GENDER_TAGS = {
    # Modul:gender_and_number/data
    "m": "masculine",
    "f": "feminine",
    "n": "neuter",
    "c": "common",
    "neutral": "neutral",
    "bernyawa": "animate",
    "tak bernyawa": "inanimate",
    "haiwan": "animal-not-person",
    "peribadi": "personal",
    "tak peribadi": "impersonal",
    "vir": "virile",
    "nvir": "nonvirile",
    "mf": "singular",
    "du": "dual",
    "jm": "plural",
    "impf": "imperfective",
    "pf": "perfective",
    "takrifan sama": ["masculine", "feminine"],
    "mengikut keadaan": ["masculine", "feminine"],
}

25 

# Label text mapped to English tags.
# A value is either one tag (str) or several tags (list of str).
# Commented-out keys are labels that have no translation assigned yet.
# NOTE(review): "Soviet Union" contains a space while other multi-word
# values in this file are hyphenated - confirm whether that is intended.
LB_TAGS = {
    # Modul:labels/data
    "kependekan": "abbreviation",
    "akronim": "acronym",
    "transitif": "transitive",
    "tidak transitif": "intransitive",
    "jussive": "jussive",
    "arkaik": "archaic",
    "atelic": "imperfective",
    "kata bantu": ["auxiliary", "verb"],
    "nombor kardinal": "cardinal",
    "kausatif": "causative",
    # "berbilang": "",
    "kebudak-budakan": "childish",
    "chữ Nôm Vietnam": ["Chữ-Nôm", "Vietnam"],
    "hinaan": "offensive",
    "hinaan kaum": ["ethnic", "offensive"],
    "eufemisme": "euphemistic",
    "kiasan": "figuratively",
    "jenaka": "humorous",
    "tidak formal": "informal",
    "ironi": "ironic",
    "harfiah": "literally",
    "slanga perubatan": "slang",  # medicine
    "metonim": "metonymically",
    "neologisme": "neologism",
    "bentuk bukan baku": "nonstandard",
    "usang": "obsolete",
    "lapuk": "obsolete",
    "kata kasar": "impolite",
    "sopan": "polite",
    "pasca-Klasik": "post-Classical",
    "slanga penjara": "slang",  # prison
    # "hina agama": "",
    "slanga": "slang",
    "slanga sekolah": "slang",  # school
    # "hina diri": "",
    "slanga universiti": "slang",  # university
    "sinkop": "syncope",
    # "teknikal": "",  # technical
    "slanga mesej": "slang",  # message
    "lucah": "vulgar",
    "Amerika": "America",
    "Politik Malaysia": "Malaysia",
    "retorik": "rhetoric",
    "Kesatuan Soviet": "Soviet Union",
    "peribahasa": "proverb",
}

74 

# Raw tag text mapped to English tags.
# A value is either one tag (str) or several tags (list of str).
# NOTE(review): presumably these come from part-of-speech header lines,
# going by the table name - confirm against the caller.
POS_HEADER_TAGS = {
    "ejaan Jawi": "Jawi",
    "genitif": "genitive",
    "jamak": "plural",
    "terbilang dan tidak terbilang": ["countable", "uncountable"],
    "bentuk jamak": "plural",
    "Ejaan bahasa Urdu": "Urdu",
    "partitif": "partitive",
    "hanja": "hanja",
}

85 

# Pronunciation-related raw tag text (accent and romanization names)
# mapped to English tags.
# A value is either one tag (str) or several tags (list of str).
SOUND_TAGS = {
    "Received Pronunciation": "Received-Pronunciation",
    "General American": "General-American",
    "UK": "UK",
    "A.S.": "General-American",
    "Pinyin": "Pinyin",
    "Wade-Giles": "Wade-Giles",
    "Baku Korea Selatan": "SK-Standard",
    "Seoul": "Seoul",
    "Perumian Semakan": ["revised", "romanization"],
    "Perumian Semakan (translit.)": [
        "revised",
        "romanization",
        "transliteration",
    ],
    # The key uses an en dash; the emitted tag uses a plain hyphen.
    "McCune–Reischauer": "McCune-Reischauer",
    "Perumian Yale": ["Yale", "romanization"],
}

104 

105 

# Single lookup table used by translate_raw_tags(). Tables are merged left
# to right, so on a duplicate key the right-most table's value wins.
TAGS = {**GENDER_TAGS, **POS_HEADER_TAGS, **SOUND_TAGS, **LB_TAGS}

107 

108 

# Topical label text mapped to English topic names.
# A value is either one topic (str) or several topics (list of str).
# Commented-out keys are labels that have no translation assigned yet.
# NOTE(review): "computer graphics", "computer hardware",
# "signal processing" and "web design" contain a space while every other
# multi-word value here is hyphenated - confirm whether that is intended.
TOPICS = {
    # Modul:labels/data/topical
    "perakaunan": "accounting",
    "akustik": "acoustics",
    "lakonan": "acting",
    "periklanan": "advertising",
    "aeronautik": "aeronautics",
    "pertanian": "agriculture",
    "alkimia": "alchemy",
    "alkohol": "beverages",
    "algebra": "algebra",
    "geometri algebra": ["geometry", "algebra"],
    "perubatan alternatif": "alternative-medicine",
    "bola sepak Amerika": "American-football",
    "biokimia": "biochemistry",
    # "analisis": "analysis",
    "analytic geometry": "geometry",
    "kimia analisis": "chemistry",
    "anarkisme": "anarchism",
    "anatomi": "anatomy",
    "animasi": "anime",
    "anime": "anime",
    "antropologi": "anthropology",
    "araknologi": "arachnology",
    "arkeologi": "archeology",
    "memanah": "archery",
    "seni bina": "architecture",
    "mitologi Armenia": ["Armenia", "mythology"],
    "kecerdasan buatan": "artificial-intelligence",
    "seni": "arts",
    "uranography": "uranography",
    "astrologi": "astrology",
    "astronautik": "astronautics",
    "astronomi": "astronomy",
    "astrofizik": "astrophysics",
    "mitologi Asturia": ["Asturia", "mythology"],
    "olahraga": "sports",
    "auto racing": "racing",
    "automotif": "automotive",
    "penerbangan": "aviation",
    "backgammon": "backgammon",
    "bakteriologi": "bacteriology",
    "badminton": "badminton",
    "permainan bola": "ball-games",
    "balet": "ballet",
    "perbankan": "banking",
    "besbol": "baseball",
    "bola kerangjang": "basketball",
    "BDSM": "BDSM",
    "beekeeping": "beekeeping",
    "perjudian": "gambling",
    "Alkitab": ["Christianity", "biblical", "religion"],
    "biblical": "biblical",
    "billiards": "billiards",
    "bingo": "bingo",
    "biologi": "biology",
    "bioteknologi": "biotechnology",
    "birdwatching": "birdwatching",
    "blogging": "blogging",
    "permainan papan": "board-games",
    "board sports": "board-games",
    "bina badan": "bodybuilding",
    "botani": "botany",
    "boling": "bowling",
    "tinju": "boxing",
    "brewing": "brewing",
    "bridge": "bridge",
    "penyiaran": "broadcasting",
    "briologi": "bryology",
    "Buddhisme": "Buddhism",
    "bullfighting": "bullfighting",
    "perniagaan": "commerce",
    "kalkulus": "calculus",
    "Canadian football": "football",
    "zoologi anjing": ["zoology", "dogs"],
    "kardiologi": "cardiology",
    "cartography": "cartography",
    "category theory": "category-theory",
    "caving": "caving",
    "Celtic mythology": "Celtic-mythology",
    "seramik": "ceramics",
    "cheerleading": "cheerleading",
    "kejuruteraan kimia": "chemistry-engineering",
    "kimia": "chemistry",
    "catur": "chess",
    "mitologi Cina": "Chinese-mythology",
    "Kristian": "Christianity",
    "cinematography": "cinematography",
    "mekanik klasik": "classical-mechanics",
    "classical studies": "classical-studies",
    "climatology": "climatology",
    "climbing": "climbing",
    "clinical psychology": "clinical-psychology",
    "combinatorics": "combinatorics",
    "comedy": "comedy",
    "komik": "comics",
    "komunikasi": "communications",
    "komunisme": "communism",
    "analisis kompleks": "complex-analysis",
    "permainan komputer": "computer-games",
    "grafik komputer": "computer graphics",
    "perkakasan komputer": "computer hardware",
    "sains komputer": "computer-sciences",
    "pengkomputan": "computing",
    "computing theory": "computing-theory",
    "conchology": "conchology",
    "pembinaan": "construction",
    "memasak": "cooking",
    "hak cipta": "copyright",
    "kosmetik": "cosmetics",
    "kriket": "cricket",
    "criminology": "criminology",
    "cryptography": "cryptography",
    "cryptozoology": "cryptozoology",
    "crystallography": "crystallography",
    "curling": "curling",
    "numismatik": "numismatics",
    "cycling": "cycling",
    "cytology": "cytology",
    "dance": "dance",
    "darts": "darts",
    "pangkalan data": "databases",
    "demoscene": "demoscene",
    "pergigian": "dentistry",
    "dermatologi": "dermatology",
    "diplomasi": "diplomacy",
    "pathology": "pathology",
    "diving": "diving",
    "domino": "dominoes",
    "drama": "drama",
    "dressage": "dressage",
    "penghasilan makanan": "food-manufacture",
    # "sains bumi": "",
    "ekologi": "ecology",
    "ekonomi": "economy",
    "pendidikan": "education",
    "kejuruteraan elektrik": "electrical-engineering",
    "keelektrikan": "electricity",
    "keelektromagnetan": "electromagnetism",
    "elektronik": "electronic",
    "embriologi": "embryology",
    "perubatan kecemasan": "emergency-medicine",
    "kejuruteraan": "engineering",
    "entomologi": "entomology",
    "enzim": "enzyme",
    "epidemiologi": "epidemiology",
    "epistemologi": "epistemology",
    "etika": "ethics",
    "etnografi": "ethnography",
    "senaman": "exercise",
    "falconry": "falconry",
    "fesyen": "fashion",
    "kimia organik": "organic-chemistry",
    "filem": "film",
    "kewangan": "finance",
    "memancing": "fishing",
    "dinamik bendalir": "fluid-dynamics",
    "perhutanan": "forestry",
    "perabot": "furniture",
    "genealogi": "genealogy",
    "genetik": "genetics",
    "geografi": "geography",
    "geologi": "geology",
    "geometri": "geometry",
    "geomorfologi": "geomorphology",
    "gerontologi": "gerontology",
    "golf": "golf",
    "kerajaan": "government",
    "tatabahasa": "grammar",
    "gimnastik": "gymnastics",
    "ginekologi": "gynecology",
    "hematologi": "hematology",
    "Hinduisme": "Hinduism",
    "historiografi": "historiography",
    "sejarah": "history",
    "hoki": "hockey",
    "homeopati": "homeopathy",
    "hormon": "hormone",
    "lumba kuda": "horse-racing",
    "horticulture": "horticulture",
    "sumber manusia": "human-resources",
    "kemanusiaan": "humanity",
    "perburuan": "hunting",
    "hidrologi": "hydrology",
    "hoki ais": "ice-hockey",
    "imunokimia": "immunochemistry",
    "imunologi": "immunology",
    "sains maklumat": "information-science",
    "teori maklumat": "information-theory",
    "fizik": "physics",
    "kewartawanan": "journalism",
    "judo": "judo",
    "undang-undang": "law",
    "leksikografi": "lexicography",
    "likenologi": "lichenology",
    "limnologi": "limnology",
    "linguistik": "linguistics",
    "kesusasteraan": "literature",
    "logik": "logic",
    "malakologi": "malacology",
    "pemasaran": "marketing",
    "Marxisme": "Marxism",
    "sains bahan": "material-science",
    "matematik": "mathematics",
    "mekanik": "mechanics",
    "perubatan": "medicine",
    "metalurgi": "metallurgy",
    "meteorologi": "meteorology",
    "metrologi": "metrology",
    "mikrobiologi": "microbiology",
    "ketenteraan": "military",
    "mineralogi": "mineralogy",
    "perlombongan": "mining",
    "wang": "money",
    "otot": "muscle",
    "muzik": "music",
    "alat muzik": "musical-instrument",
    "mikologi": "mycology",
    "mitologi": "mythology",
    "nanoteknologi": "nanotechnology",
    "nautika": "nautical",
    "Nazisme": "Nazism",
    "neuroanatomi": "neuroanatomy",
    "neurologi": "neurology",
    "neurosains": "neuroscience",
    "fizik nuklear": "nuclear-physics",
    "teori nombor": "number-theory",
    "oseanografi": "oceanography",
    "onkologi": "oncology",
    "permainan dalam talian": "video-games",
    "optik": "optics",
    "sebatian organik": "organic-compound",
    "ornitologi": "ornithology",
    "ortodontik": "orthodontics",
    "paleontologi": "paleontology",
    "parapsikologi": "parapsychology",
    "fizik zarah": "particle-physics",
    "pempasteuran": "pasteurization",
    "patologi": "pathology",
    "petrokimia": "petrochemical",
    "petrologi": "petrology",
    "farmakologi": "pharmacology",
    "farmasi": "pharmacy",
    "filateli": "philately",
    "falsafah": "philosophy",
    "fonetik": "phonetics",
    "fonologi": "phonology",
    "fotografi": "photography",
    "kimia fizik": ["physics", "chemistry"],
    "fisiologi": "physiology",
    "planetologi": "planetology",
    "toksikologi": "toxicology",
    "sains politik": "political-science",
    "politik": "politics",
    "Politik Malaysia": "politics",
    "pornografi": "pornography",
    "percetakan": "printing",
    "teori kebarangkalian": "probability-theory",
    "pengaturcaraan": "programming",
    "undang-undang hartanah": ["real-estate", "law"],
    "psikiatri": "psychiatry",
    "psikoanalisis": "psychoanalysis",
    "psikologi": "psychology",
    "psikoterapi": "psychotherapy",
    "penerbitan": "publishing",
    "mekanik kuantum": "quantum-mechanics",
    "pengangkutan rel": "rail-transport",
    "agama": "religion",
    "robotik": "robotics",
    "Roman Katolik": "Roman-Catholicism",
    "mitologi Rom": "Roman-mythology",
    "ragbi": "rugby",
    "cereka sains": "science-fiction",
    "sains": "sciences",
    "seismologi": "seismology",
    "semantik": "semantics",
    "semiotik": "semiotics",
    "teori set": "set-theory",
    "menjahit": "sewing",
    "keseksualan": "sexuality",
    "pemprosesan isyarat": "signal processing",
    "menyanyi": "singing",
    "snuker": "snooker",
    "bola sepak": "soccer",
    "sains sosial": "social-science",
    "sosialisme": "socialism",
    "media sosial": "social-media",
    "sosiolinguistik": "sociolinguistics",
    "sosiologi": "sociology",
    "bola lisut": "softball",
    "perisian": "software",
    "kejuruteraan perisian": "software-engineering",
    "sains tanah": "soil-science",
    "bunyi": "sound",
    "kejuruteraan bunyi": "sound-engineering",
    "sains angkasa": "space-science",
    "spektroskopi": "spectroscopy",
    "sukan": "sports",
    "skuasy": "squash",
    "statistik": "statistics",
    "pasaran saham": "stock-market",
    "subbudaya": "subculture",
    "kesufian": "Sufism",
    "pembedahan": "surgery",
    "berenang": "swimming",
    "teori sistem": "systems-theory",
    "percukaian": "taxation",
    "taksonomi": "taxonomy",
    "teknologi": "technology",
    "telekomunikasi": "telecommunications",
    "televisyen": "television",
    "tenis": "tennis",
    "tekstil": "textiles",
    "teater": "theater",
    "teologi": "theology",
    "termodinamik": "thermodynamics",
    "topologi": "topology",
    "pelancongan": "tourism",
    "perdagangan": "commerce",
    "pengangkutan": "transport",
    "trigonometri": "trigonometry",
    "tipografi": "typography",
    "kenderaan": "vehicles",
    "perubatan veterinar": ["veterinary", "medicine"],
    "genre permainan video": "video-games",
    "permainan video": "video-games",
    "virologi": "virology",
    "volkanologi": "volcanology",
    "bola tampar": "volleyball",
    "senjata": "weapon",
    "cuaca": "weather",
    "reka bentuk web": "web design",
    "angkat berat": "weightlifting",
    "wain": "wine",
    "pertukangan kayu": "carpentry",
    "gusti": "wrestling",
    "Islam": "Islam",
}

447 

448 

def _add_translation(target: list, translation) -> None:
    """Append a single translated value, or extend with a list of them."""
    if isinstance(translation, str):
        target.append(translation)
    elif isinstance(translation, list):
        target.extend(translation)
    # Any other value type is ignored, as in the lookup tables' contract
    # of "str or list of str".


def translate_raw_tags(data: WordEntry) -> None:
    """Move recognised entries of ``data.raw_tags`` into tags and topics.

    Each raw tag is looked up in ``TAGS`` and in ``TOPICS``. A match in
    ``TAGS`` adds the translation to ``data.tags``; a match in ``TOPICS``
    adds it to ``data.topics``. A raw tag present in both tables feeds
    both lists. A table is consulted only when ``data`` has the
    corresponding attribute.

    ``data`` is modified in place: ``data.raw_tags`` is replaced by a new
    list holding only the raw tags that matched nothing, in their original
    order. Translations are appended as-is; no de-duplication is done.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        translated = False
        # Tags are handled before topics, so the append order is stable.
        for table, field in ((TAGS, "tags"), (TOPICS, "topics")):
            if raw_tag in table and hasattr(data, field):
                translated = True
                _add_translation(getattr(data, field), table[raw_tag])
        if not translated:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated