Coverage for src/wiktextract/extractor/ms/tags.py: 64%

28 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1from .models import WordEntry 

2 

# Gender/number style codes mapped to English tag names.
# A value is either one tag (str) or several tags (list of str).
GENDER_TAGS = {
    # Modul:gender_and_number/data
    "m": "masculine",
    "f": "feminine",
    "n": "neuter",
    "c": "common",
    "neutral": "neutral",
    "bernyawa": "animate",
    "tak bernyawa": "inanimate",
    "haiwan": "animal-not-person",
    "peribadi": "personal",
    "tak peribadi": "impersonal",
    "vir": "virile",
    "nvir": "nonvirile",
    "mf": "singular",
    "du": "dual",
    "jm": "plural",
    "impf": "imperfective",
    "pf": "perfective",
    "takrifan sama": ["masculine", "feminine"],
    "mengikut keadaan": ["masculine", "feminine"],
}

25 

# Label names mapped to English tag names.
# A value is either one tag (str) or several tags (list of str).
# Multi-word tags are hyphen-joined so that no tag contains a space,
# matching the other tag tables in this module.
# Commented-out entries are labels that have no translation yet.
LB_TAGS = {
    # Modul:labels/data
    "kependekan": "abbreviation",
    "akronim": "acronym",
    "transitif": "transitive",
    "tidak transitif": "intransitive",
    "jussive": "jussive",
    "arkaik": "archaic",
    "atelic": "imperfective",
    "kata bantu": ["auxiliary", "verb"],
    "nombor kardinal": "cardinal",
    "kausatif": "causative",
    # "berbilang": "",
    "kebudak-budakan": "childish",
    "chữ Nôm Vietnam": ["Chữ-Nôm", "Vietnam"],
    "hinaan": "offensive",
    "hinaan kaum": ["ethnic", "offensive"],
    "eufemisme": "euphemistic",
    "kiasan": "figuratively",
    "jenaka": "humorous",
    "tidak formal": "informal",
    "ironi": "ironic",
    "harfiah": "literally",
    "slanga perubatan": "slang",  # medicine
    "metonim": "metonymically",
    "neologisme": "neologism",
    "bentuk bukan baku": "nonstandard",
    "usang": "obsolete",
    "lapuk": "obsolete",
    "kata kasar": "impolite",
    "sopan": "polite",
    "pasca-Klasik": "post-Classical",
    "slanga penjara": "slang",  # prison
    # "hina agama": "",
    "slanga": "slang",
    "slanga sekolah": "slang",  # school
    # "hina diri": "",
    "slanga universiti": "slang",  # university
    "sinkop": "syncope",
    # "teknikal": "",  # technical
    "slanga mesej": "slang",  # message
    "lucah": "vulgar",
    "Amerika": "America",
    "Politik Malaysia": "Malaysia",
    "retorik": "rhetoric",
    "Kesatuan Soviet": "Soviet-Union",
    "peribahasa": "proverb",
}

74 

# Part-of-speech header labels mapped to English tag names.
# A value is either one tag (str) or several tags (list of str).
POS_HEADER_TAGS = {
    "ejaan Jawi": "Jawi",
    "genitif": "genitive",
    "jamak": "plural",
    "terbilang dan tidak terbilang": ["countable", "uncountable"],
    "bentuk jamak": "plural",
    "Ejaan bahasa Urdu": "Urdu",
    "partitif": "partitive",
}

84 

# Pronunciation labels mapped to tag names.
# Note that both "General American" and "A.S." map to the same tag.
SOUND_TAGS = {
    "Received Pronunciation": "Received-Pronunciation",
    "General American": "General-American",
    "UK": "UK",
    "A.S.": "General-American",
    "Pinyin": "Pinyin",
    "Wade-Giles": "Wade-Giles",
}

93 

94 

# Single lookup table combining every tag table above.
# If a key occurs in more than one table, the right-most table wins,
# so LB_TAGS has the highest precedence.
TAGS = GENDER_TAGS | POS_HEADER_TAGS | SOUND_TAGS | LB_TAGS

96 

97 

# Topical label names mapped to English topic names.
# A value is either one topic (str) or several topics (list of str).
# Multi-word topics are hyphen-joined so that no topic contains a space.
# Commented-out entries are labels that have no translation yet.
TOPICS = {
    # Modul:labels/data/topical
    "perakaunan": "accounting",
    "akustik": "acoustics",
    "lakonan": "acting",
    "periklanan": "advertising",
    "aeronautik": "aeronautics",
    "pertanian": "agriculture",
    "alkimia": "alchemy",
    "alkohol": "beverages",
    "algebra": "algebra",
    "geometri algebra": ["geometry", "algebra"],
    "perubatan alternatif": "alternative-medicine",
    "bola sepak Amerika": "American-football",
    "biokimia": "biochemistry",
    # "analisis": "analysis",
    "analytic geometry": "geometry",
    "kimia analisis": "chemistry",
    "anarkisme": "anarchism",
    "anatomi": "anatomy",
    "animasi": "anime",
    "anime": "anime",
    "antropologi": "anthropology",
    "araknologi": "arachnology",
    "arkeologi": "archeology",
    "memanah": "archery",
    "seni bina": "architecture",
    "mitologi Armenia": ["Armenia", "mythology"],
    "kecerdasan buatan": "artificial-intelligence",
    "seni": "arts",
    "uranography": "uranography",
    "astrologi": "astrology",
    "astronautik": "astronautics",
    "astronomi": "astronomy",
    "astrofizik": "astrophysics",
    "mitologi Asturia": ["Asturia", "mythology"],
    "olahraga": "sports",
    "auto racing": "racing",
    "automotif": "automotive",
    "penerbangan": "aviation",
    "backgammon": "backgammon",
    "bakteriologi": "bacteriology",
    "badminton": "badminton",
    "permainan bola": "ball-games",
    "balet": "ballet",
    "perbankan": "banking",
    "besbol": "baseball",
    "bola kerangjang": "basketball",
    "BDSM": "BDSM",
    "beekeeping": "beekeeping",
    "perjudian": "gambling",
    "Alkitab": ["Christianity", "biblical", "religion"],
    "biblical": "biblical",
    "billiards": "billiards",
    "bingo": "bingo",
    "biologi": "biology",
    "bioteknologi": "biotechnology",
    "birdwatching": "birdwatching",
    "blogging": "blogging",
    "permainan papan": "board-games",
    # NOTE(review): "board sports" maps to "board-games"; this looks like
    # it may have been meant as a sports topic - confirm before changing.
    "board sports": "board-games",
    "bina badan": "bodybuilding",
    "botani": "botany",
    "boling": "bowling",
    "tinju": "boxing",
    "brewing": "brewing",
    "bridge": "bridge",
    "penyiaran": "broadcasting",
    "briologi": "bryology",
    "Buddhisme": "Buddhism",
    "bullfighting": "bullfighting",
    "perniagaan": "commerce",
    "kalkulus": "calculus",
    "Canadian football": "football",
    "zoologi anjing": ["zoology", "dogs"],
    "kardiologi": "cardiology",
    "cartography": "cartography",
    "category theory": "category-theory",
    "caving": "caving",
    "Celtic mythology": "Celtic-mythology",
    "seramik": "ceramics",
    "cheerleading": "cheerleading",
    "kejuruteraan kimia": "chemistry-engineering",
    "kimia": "chemistry",
    "catur": "chess",
    "mitologi Cina": "Chinese-mythology",
    "Kristian": "Christianity",
    "cinematography": "cinematography",
    "mekanik klasik": "classical-mechanics",
    "classical studies": "classical-studies",
    "climatology": "climatology",
    "climbing": "climbing",
    "clinical psychology": "clinical-psychology",
    "combinatorics": "combinatorics",
    "comedy": "comedy",
    "komik": "comics",
    "komunikasi": "communications",
    "komunisme": "communism",
    "analisis kompleks": "complex-analysis",
    "permainan komputer": "computer-games",
    "grafik komputer": "computer-graphics",
    "perkakasan komputer": "computer-hardware",
    "sains komputer": "computer-sciences",
    "pengkomputan": "computing",
    "computing theory": "computing-theory",
    "conchology": "conchology",
    "pembinaan": "construction",
    "memasak": "cooking",
    "hak cipta": "copyright",
    "kosmetik": "cosmetics",
    "kriket": "cricket",
    "criminology": "criminology",
    "cryptography": "cryptography",
    "cryptozoology": "cryptozoology",
    "crystallography": "crystallography",
    "curling": "curling",
    "numismatik": "numismatics",
    "cycling": "cycling",
    "cytology": "cytology",
    "dance": "dance",
    "darts": "darts",
    "pangkalan data": "databases",
    "demoscene": "demoscene",
    "pergigian": "dentistry",
    "dermatologi": "dermatology",
    "diplomasi": "diplomacy",
    "pathology": "pathology",
    "diving": "diving",
    "domino": "dominoes",
    "drama": "drama",
    "dressage": "dressage",
    "penghasilan makanan": "food-manufacture",
    # "sains bumi": "",
    "ekologi": "ecology",
    "ekonomi": "economy",
    "pendidikan": "education",
    "kejuruteraan elektrik": "electrical-engineering",
    "keelektrikan": "electricity",
    "keelektromagnetan": "electromagnetism",
    "elektronik": "electronic",
    "embriologi": "embryology",
    "perubatan kecemasan": "emergency-medicine",
    "kejuruteraan": "engineering",
    "entomologi": "entomology",
    "enzim": "enzyme",
    "epidemiologi": "epidemiology",
    "epistemologi": "epistemology",
    "etika": "ethics",
    "etnografi": "ethnography",
    "senaman": "exercise",
    "falconry": "falconry",
    "fesyen": "fashion",
    "kimia organik": "organic-chemistry",
    "filem": "film",
    "kewangan": "finance",
    "memancing": "fishing",
    "dinamik bendalir": "fluid-dynamics",
    "perhutanan": "forestry",
    "perabot": "furniture",
    "genealogi": "genealogy",
    "genetik": "genetics",
    "geografi": "geography",
    "geologi": "geology",
    "geometri": "geometry",
    "geomorfologi": "geomorphology",
    "gerontologi": "gerontology",
    "golf": "golf",
    "kerajaan": "government",
    "tatabahasa": "grammar",
    "gimnastik": "gymnastics",
    "ginekologi": "gynecology",
    "hematologi": "hematology",
    "Hinduisme": "Hinduism",
    "historiografi": "historiography",
    "sejarah": "history",
    "hoki": "hockey",
    "homeopati": "homeopathy",
    "hormon": "hormone",
    "lumba kuda": "horse-racing",
    "horticulture": "horticulture",
    "sumber manusia": "human-resources",
    "kemanusiaan": "humanity",
    "perburuan": "hunting",
    "hidrologi": "hydrology",
    "hoki ais": "ice-hockey",
    "imunokimia": "immunochemistry",
    "imunologi": "immunology",
    "sains maklumat": "information-science",
    "teori maklumat": "information-theory",
    "fizik": "physics",
    "kewartawanan": "journalism",
    "judo": "judo",
    "undang-undang": "law",
    "leksikografi": "lexicography",
    "likenologi": "lichenology",
    "limnologi": "limnology",
    "linguistik": "linguistics",
    "kesusasteraan": "literature",
    "logik": "logic",
    "malakologi": "malacology",
    "pemasaran": "marketing",
    "Marxisme": "Marxism",
    "sains bahan": "material-science",
    "matematik": "mathematics",
    "mekanik": "mechanics",
    "perubatan": "medicine",
    "metalurgi": "metallurgy",
    "meteorologi": "meteorology",
    "metrologi": "metrology",
    "mikrobiologi": "microbiology",
    "ketenteraan": "military",
    "mineralogi": "mineralogy",
    "perlombongan": "mining",
    "wang": "money",
    "otot": "muscle",
    "muzik": "music",
    "alat muzik": "musical-instrument",
    "mikologi": "mycology",
    "mitologi": "mythology",
    "nanoteknologi": "nanotechnology",
    "nautika": "nautical",
    "Nazisme": "Nazism",
    "neuroanatomi": "neuroanatomy",
    "neurologi": "neurology",
    "neurosains": "neuroscience",
    "fizik nuklear": "nuclear-physics",
    "teori nombor": "number-theory",
    "oseanografi": "oceanography",
    "onkologi": "oncology",
    "permainan dalam talian": "video-games",
    "optik": "optics",
    "sebatian organik": "organic-compound",
    "ornitologi": "ornithology",
    "ortodontik": "orthodontics",
    "paleontologi": "paleontology",
    "parapsikologi": "parapsychology",
    "fizik zarah": "particle-physics",
    "pempasteuran": "pasteurization",
    "patologi": "pathology",
    "petrokimia": "petrochemical",
    "petrologi": "petrology",
    "farmakologi": "pharmacology",
    "farmasi": "pharmacy",
    "filateli": "philately",
    "falsafah": "philosophy",
    "fonetik": "phonetics",
    "fonologi": "phonology",
    "fotografi": "photography",
    "kimia fizik": ["physics", "chemistry"],
    "fisiologi": "physiology",
    "planetologi": "planetology",
    "toksikologi": "toxicology",
    "sains politik": "political-science",
    "politik": "politics",
    "Politik Malaysia": "politics",
    "pornografi": "pornography",
    "percetakan": "printing",
    "teori kebarangkalian": "probability-theory",
    "pengaturcaraan": "programming",
    "undang-undang hartanah": ["real-estate", "law"],
    "psikiatri": "psychiatry",
    "psikoanalisis": "psychoanalysis",
    "psikologi": "psychology",
    "psikoterapi": "psychotherapy",
    "penerbitan": "publishing",
    "mekanik kuantum": "quantum-mechanics",
    "pengangkutan rel": "rail-transport",
    "agama": "religion",
    "robotik": "robotics",
    "Roman Katolik": "Roman-Catholicism",
    "mitologi Rom": "Roman-mythology",
    "ragbi": "rugby",
    "cereka sains": "science-fiction",
    "sains": "sciences",
    "seismologi": "seismology",
    "semantik": "semantics",
    "semiotik": "semiotics",
    "teori set": "set-theory",
    "menjahit": "sewing",
    "keseksualan": "sexuality",
    "pemprosesan isyarat": "signal-processing",
    "menyanyi": "singing",
    "snuker": "snooker",
    "bola sepak": "soccer",
    "sains sosial": "social-science",
    "sosialisme": "socialism",
    "media sosial": "social-media",
    "sosiolinguistik": "sociolinguistics",
    "sosiologi": "sociology",
    "bola lisut": "softball",
    "perisian": "software",
    "kejuruteraan perisian": "software-engineering",
    "sains tanah": "soil-science",
    "bunyi": "sound",
    "kejuruteraan bunyi": "sound-engineering",
    "sains angkasa": "space-science",
    "spektroskopi": "spectroscopy",
    "sukan": "sports",
    "skuasy": "squash",
    "statistik": "statistics",
    "pasaran saham": "stock-market",
    "subbudaya": "subculture",
    "kesufian": "Sufism",
    "pembedahan": "surgery",
    "berenang": "swimming",
    "teori sistem": "systems-theory",
    "percukaian": "taxation",
    "taksonomi": "taxonomy",
    "teknologi": "technology",
    "telekomunikasi": "telecommunications",
    "televisyen": "television",
    "tenis": "tennis",
    "tekstil": "textiles",
    "teater": "theater",
    "teologi": "theology",
    "termodinamik": "thermodynamics",
    "topologi": "topology",
    "pelancongan": "tourism",
    "perdagangan": "commerce",
    "pengangkutan": "transport",
    "trigonometri": "trigonometry",
    "tipografi": "typography",
    "kenderaan": "vehicles",
    "perubatan veterinar": ["veterinary", "medicine"],
    "genre permainan video": "video-games",
    "permainan video": "video-games",
    "virologi": "virology",
    "volkanologi": "volcanology",
    "bola tampar": "volleyball",
    "senjata": "weapon",
    "cuaca": "weather",
    "reka bentuk web": "web-design",
    "angkat berat": "weightlifting",
    "wain": "wine",
    "pertukangan kayu": "carpentry",
    "gusti": "wrestling",
    "Islam": "Islam",
}

436 

437 

def translate_raw_tags(data: WordEntry) -> None:
    """Move recognised entries of ``data.raw_tags`` into tags and topics.

    Each raw tag is looked up in ``TAGS`` and in ``TOPICS``. A match in
    ``TAGS`` is added to ``data.tags`` and a match in ``TOPICS`` is added
    to ``data.topics``; a raw tag found in both tables is added to both.
    A table is only consulted when ``data`` has the corresponding
    attribute. A translation may be a single string or a list of strings.

    ``data.raw_tags`` is replaced by a new list holding only the raw tags
    that matched neither table, in their original order.
    """
    # (lookup table, name of the attribute on ``data`` that receives it)
    destinations = ((TAGS, "tags"), (TOPICS, "topics"))
    unmatched = []
    for label in data.raw_tags:
        matched = False
        for table, field in destinations:
            if label not in table or not hasattr(data, field):
                continue
            matched = True
            translated = table[label]
            target = getattr(data, field)
            if isinstance(translated, str):
                target.append(translated)
            elif isinstance(translated, list):
                target.extend(translated)
        if not matched:
            unmatched.append(label)
    data.raw_tags = unmatched
458 data.raw_tags = raw_tags