Coverage for src / wiktextract / extractor / ms / tags.py: 64%
28 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-02 00:27 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-02 00:27 +0000
1from .models import WordEntry
# Gender, number, animacy and aspect codes mapped to output tags.
# A value is either a single tag string or a list of tag strings that are
# all applied together.
GENDER_TAGS = {
    # Modul:gender_and_number/data
    "m": "masculine",
    "f": "feminine",
    "n": "neuter",
    "c": "common",
    "neutral": "neutral",
    "bernyawa": "animate",
    "tak bernyawa": "inanimate",
    "haiwan": "animal-not-person",
    "peribadi": "personal",
    "tak peribadi": "impersonal",
    "vir": "virile",
    "nvir": "nonvirile",
    # NOTE(review): "mf" presumably abbreviates "mufrad" (singular) and
    # "jm" "jamak" (plural) — confirm against the wiki module.
    "mf": "singular",
    "du": "dual",
    "jm": "plural",
    "impf": "imperfective",
    "pf": "perfective",
    # Both of the following labels expand to the same pair of tags.
    "takrifan sama": ["masculine", "feminine"],
    "mengikut keadaan": ["masculine", "feminine"],
}
26LB_TAGS = {
27 # Modul:labels/data
28 "kependekan": "abbreviation",
29 "akronim": "acronym",
30 "transitif": "transitive",
31 "tidak transitif": "intransitive",
32 "jussive": "jussive",
33 "arkaik": "archaic",
34 "atelic": "imperfective",
35 "kata bantu": ["auxiliary", "verb"],
36 "nombor kardinal": "cardinal",
37 "kausatif": "causative",
38 # "berbilang": "",
39 "kebudak-budakan": "childish",
40 "chữ Nôm Vietnam": ["Chữ-Nôm", "Vietnam"],
41 "hinaan": "offensive",
42 "hinaan kaum": ["ethnic", "offensive"],
43 "eufemisme": "euphemistic",
44 "kiasan": "figuratively",
45 "jenaka": "humorous",
46 "tidak formal": "informal",
47 "ironi": "ironic",
48 "harfiah": "literally",
49 "slanga perubatan": "slang", # medicine
50 "metonim": "metonymically",
51 "neologisme": "neologism",
52 "bentuk bukan baku": "nonstandard",
53 "usang": "obsolete",
54 "lapuk": "obsolete",
55 "kata kasar": "impolite",
56 "sopan": "polite",
57 "pasca-Klasik": "post-Classical",
58 "slanga penjara": "slang", # prison
59 # "hina agama": "",
60 "slanga": "slang",
61 "slanga sekolah": "slang", # school
62 # "hina diri": "",
63 "slanga universiti": "slang", # university
64 "sinkop": "syncope",
65 # "teknikal": "", # technical
66 "slanga mesej": "slang", # message
67 "lucah": "vulgar",
68 "Amerika": "America",
69 "Politik Malaysia": "Malaysia",
70 "retorik": "rhetoric",
71 "Kesatuan Soviet": "Soviet Union",
72 "peribahasa": "proverb",
73}
# Labels mapped to output tags; judging by the name these come from
# part-of-speech header lines (NOTE(review): confirm against the caller).
# A value is either a single tag string or a list of tag strings.
POS_HEADER_TAGS = {
    "ejaan Jawi": "Jawi",
    "genitif": "genitive",
    "jamak": "plural",
    "terbilang dan tidak terbilang": ["countable", "uncountable"],
    "bentuk jamak": "plural",
    "Ejaan bahasa Urdu": "Urdu",
    "partitif": "partitive",
    "hanja": "hanja",
}
# Pronunciation/romanization labels mapped to output tags.
# A value is either a single tag string or a list of tag strings.
SOUND_TAGS = {
    "Received Pronunciation": "Received-Pronunciation",
    "General American": "General-American",
    "UK": "UK",
    # "A.S." is given the same tag as "General American".
    "A.S.": "General-American",
    "Pinyin": "Pinyin",
    "Wade-Giles": "Wade-Giles",
    "Baku Korea Selatan": "SK-Standard",
    "Seoul": "Seoul",
    "Perumian Semakan": ["revised", "romanization"],
    "Perumian Semakan (translit.)": [
        "revised",
        "romanization",
        "transliteration",
    ],
    # The key uses an en dash; the tag uses an ASCII hyphen.
    "McCune–Reischauer": "McCune-Reischauer",
    "Perumian Yale": ["Yale", "romanization"],
}
# Single lookup table combining all tag tables.  With dict unpacking, a key
# that occurs in more than one source table takes its value from the
# right-most table that defines it.
TAGS = {**GENDER_TAGS, **POS_HEADER_TAGS, **SOUND_TAGS, **LB_TAGS}
# Topical labels mapped to output topics.
# A value is either a single topic string or a list of topic strings that are
# all applied together.  Multi-word topics are written with hyphens instead of
# spaces.  Commented-out keys are labels that have no translation yet.
TOPICS = {
    # Modul:labels/data/topical
    "perakaunan": "accounting",
    "akustik": "acoustics",
    "lakonan": "acting",
    "periklanan": "advertising",
    "aeronautik": "aeronautics",
    "pertanian": "agriculture",
    "alkimia": "alchemy",
    "alkohol": "beverages",
    "algebra": "algebra",
    "geometri algebra": ["geometry", "algebra"],
    "perubatan alternatif": "alternative-medicine",
    "bola sepak Amerika": "American-football",
    "biokimia": "biochemistry",
    # "analisis": "analysis",
    "analytic geometry": "geometry",
    "kimia analisis": "chemistry",
    "anarkisme": "anarchism",
    "anatomi": "anatomy",
    "animasi": "anime",
    "anime": "anime",
    "antropologi": "anthropology",
    "araknologi": "arachnology",
    "arkeologi": "archeology",
    "memanah": "archery",
    "seni bina": "architecture",
    "mitologi Armenia": ["Armenia", "mythology"],
    "kecerdasan buatan": "artificial-intelligence",
    "seni": "arts",
    "uranography": "uranography",
    "astrologi": "astrology",
    "astronautik": "astronautics",
    "astronomi": "astronomy",
    "astrofizik": "astrophysics",
    "mitologi Asturia": ["Asturia", "mythology"],
    "olahraga": "sports",
    "auto racing": "racing",
    "automotif": "automotive",
    "penerbangan": "aviation",
    "backgammon": "backgammon",
    "bakteriologi": "bacteriology",
    "badminton": "badminton",
    "permainan bola": "ball-games",
    "balet": "ballet",
    "perbankan": "banking",
    "besbol": "baseball",
    # "bola kerangjang" is a misspelling kept for backward compatibility;
    # "bola keranjang" is the standard Malay spelling.
    "bola kerangjang": "basketball",
    "bola keranjang": "basketball",
    "BDSM": "BDSM",
    "beekeeping": "beekeeping",
    "perjudian": "gambling",
    "Alkitab": ["Christianity", "biblical", "religion"],
    "biblical": "biblical",
    "billiards": "billiards",
    "bingo": "bingo",
    "biologi": "biology",
    "bioteknologi": "biotechnology",
    "birdwatching": "birdwatching",
    "blogging": "blogging",
    "permainan papan": "board-games",
    "board sports": "board-games",
    "bina badan": "bodybuilding",
    "botani": "botany",
    "boling": "bowling",
    "tinju": "boxing",
    "brewing": "brewing",
    "bridge": "bridge",
    "penyiaran": "broadcasting",
    "briologi": "bryology",
    "Buddhisme": "Buddhism",
    "bullfighting": "bullfighting",
    "perniagaan": "commerce",
    "kalkulus": "calculus",
    "Canadian football": "football",
    "zoologi anjing": ["zoology", "dogs"],
    "kardiologi": "cardiology",
    "cartography": "cartography",
    "category theory": "category-theory",
    "caving": "caving",
    "Celtic mythology": "Celtic-mythology",
    "seramik": "ceramics",
    "cheerleading": "cheerleading",
    "kejuruteraan kimia": "chemistry-engineering",
    "kimia": "chemistry",
    "catur": "chess",
    "mitologi Cina": "Chinese-mythology",
    "Kristian": "Christianity",
    "cinematography": "cinematography",
    "mekanik klasik": "classical-mechanics",
    "classical studies": "classical-studies",
    "climatology": "climatology",
    "climbing": "climbing",
    "clinical psychology": "clinical-psychology",
    "combinatorics": "combinatorics",
    "comedy": "comedy",
    "komik": "comics",
    "komunikasi": "communications",
    "komunisme": "communism",
    "analisis kompleks": "complex-analysis",
    "permainan komputer": "computer-games",
    "grafik komputer": "computer-graphics",
    "perkakasan komputer": "computer-hardware",
    "sains komputer": "computer-sciences",
    "pengkomputan": "computing",
    "computing theory": "computing-theory",
    "conchology": "conchology",
    "pembinaan": "construction",
    "memasak": "cooking",
    "hak cipta": "copyright",
    "kosmetik": "cosmetics",
    "kriket": "cricket",
    "criminology": "criminology",
    "cryptography": "cryptography",
    "cryptozoology": "cryptozoology",
    "crystallography": "crystallography",
    "curling": "curling",
    "numismatik": "numismatics",
    "cycling": "cycling",
    "cytology": "cytology",
    "dance": "dance",
    "darts": "darts",
    "pangkalan data": "databases",
    "demoscene": "demoscene",
    "pergigian": "dentistry",
    "dermatologi": "dermatology",
    "diplomasi": "diplomacy",
    "pathology": "pathology",
    "diving": "diving",
    "domino": "dominoes",
    "drama": "drama",
    "dressage": "dressage",
    "penghasilan makanan": "food-manufacture",
    # "sains bumi": "",
    "ekologi": "ecology",
    "ekonomi": "economy",
    "pendidikan": "education",
    "kejuruteraan elektrik": "electrical-engineering",
    "keelektrikan": "electricity",
    "keelektromagnetan": "electromagnetism",
    "elektronik": "electronic",
    "embriologi": "embryology",
    "perubatan kecemasan": "emergency-medicine",
    "kejuruteraan": "engineering",
    "entomologi": "entomology",
    "enzim": "enzyme",
    "epidemiologi": "epidemiology",
    "epistemologi": "epistemology",
    "etika": "ethics",
    "etnografi": "ethnography",
    "senaman": "exercise",
    "falconry": "falconry",
    "fesyen": "fashion",
    "kimia organik": "organic-chemistry",
    "filem": "film",
    "kewangan": "finance",
    "memancing": "fishing",
    "dinamik bendalir": "fluid-dynamics",
    "perhutanan": "forestry",
    "perabot": "furniture",
    "genealogi": "genealogy",
    "genetik": "genetics",
    "geografi": "geography",
    "geologi": "geology",
    "geometri": "geometry",
    "geomorfologi": "geomorphology",
    "gerontologi": "gerontology",
    "golf": "golf",
    "kerajaan": "government",
    "tatabahasa": "grammar",
    "gimnastik": "gymnastics",
    "ginekologi": "gynecology",
    "hematologi": "hematology",
    "Hinduisme": "Hinduism",
    "historiografi": "historiography",
    "sejarah": "history",
    "hoki": "hockey",
    "homeopati": "homeopathy",
    "hormon": "hormone",
    "lumba kuda": "horse-racing",
    "horticulture": "horticulture",
    "sumber manusia": "human-resources",
    "kemanusiaan": "humanity",
    "perburuan": "hunting",
    "hidrologi": "hydrology",
    "hoki ais": "ice-hockey",
    "imunokimia": "immunochemistry",
    "imunologi": "immunology",
    "sains maklumat": "information-science",
    "teori maklumat": "information-theory",
    "fizik": "physics",
    "kewartawanan": "journalism",
    "judo": "judo",
    "undang-undang": "law",
    "leksikografi": "lexicography",
    "likenologi": "lichenology",
    "limnologi": "limnology",
    "linguistik": "linguistics",
    "kesusasteraan": "literature",
    "logik": "logic",
    "malakologi": "malacology",
    "pemasaran": "marketing",
    "Marxisme": "Marxism",
    "sains bahan": "material-science",
    "matematik": "mathematics",
    "mekanik": "mechanics",
    "perubatan": "medicine",
    "metalurgi": "metallurgy",
    "meteorologi": "meteorology",
    "metrologi": "metrology",
    "mikrobiologi": "microbiology",
    "ketenteraan": "military",
    "mineralogi": "mineralogy",
    "perlombongan": "mining",
    "wang": "money",
    "otot": "muscle",
    "muzik": "music",
    "alat muzik": "musical-instrument",
    "mikologi": "mycology",
    "mitologi": "mythology",
    "nanoteknologi": "nanotechnology",
    "nautika": "nautical",
    "Nazisme": "Nazism",
    "neuroanatomi": "neuroanatomy",
    "neurologi": "neurology",
    "neurosains": "neuroscience",
    "fizik nuklear": "nuclear-physics",
    "teori nombor": "number-theory",
    "oseanografi": "oceanography",
    "onkologi": "oncology",
    "permainan dalam talian": "video-games",
    "optik": "optics",
    "sebatian organik": "organic-compound",
    "ornitologi": "ornithology",
    "ortodontik": "orthodontics",
    "paleontologi": "paleontology",
    "parapsikologi": "parapsychology",
    "fizik zarah": "particle-physics",
    "pempasteuran": "pasteurization",
    "patologi": "pathology",
    "petrokimia": "petrochemical",
    "petrologi": "petrology",
    "farmakologi": "pharmacology",
    "farmasi": "pharmacy",
    "filateli": "philately",
    "falsafah": "philosophy",
    "fonetik": "phonetics",
    "fonologi": "phonology",
    "fotografi": "photography",
    "kimia fizik": ["physics", "chemistry"],
    "fisiologi": "physiology",
    "planetologi": "planetology",
    "toksikologi": "toxicology",
    "sains politik": "political-science",
    "politik": "politics",
    "Politik Malaysia": "politics",
    "pornografi": "pornography",
    "percetakan": "printing",
    "teori kebarangkalian": "probability-theory",
    "pengaturcaraan": "programming",
    "undang-undang hartanah": ["real-estate", "law"],
    "psikiatri": "psychiatry",
    "psikoanalisis": "psychoanalysis",
    "psikologi": "psychology",
    "psikoterapi": "psychotherapy",
    "penerbitan": "publishing",
    "mekanik kuantum": "quantum-mechanics",
    "pengangkutan rel": "rail-transport",
    "agama": "religion",
    "robotik": "robotics",
    "Roman Katolik": "Roman-Catholicism",
    "mitologi Rom": "Roman-mythology",
    "ragbi": "rugby",
    "cereka sains": "science-fiction",
    "sains": "sciences",
    "seismologi": "seismology",
    "semantik": "semantics",
    "semiotik": "semiotics",
    "teori set": "set-theory",
    "menjahit": "sewing",
    "keseksualan": "sexuality",
    "pemprosesan isyarat": "signal-processing",
    "menyanyi": "singing",
    "snuker": "snooker",
    "bola sepak": "soccer",
    "sains sosial": "social-science",
    "sosialisme": "socialism",
    "media sosial": "social-media",
    "sosiolinguistik": "sociolinguistics",
    "sosiologi": "sociology",
    "bola lisut": "softball",
    "perisian": "software",
    "kejuruteraan perisian": "software-engineering",
    "sains tanah": "soil-science",
    "bunyi": "sound",
    "kejuruteraan bunyi": "sound-engineering",
    "sains angkasa": "space-science",
    "spektroskopi": "spectroscopy",
    "sukan": "sports",
    "skuasy": "squash",
    "statistik": "statistics",
    "pasaran saham": "stock-market",
    "subbudaya": "subculture",
    "kesufian": "Sufism",
    "pembedahan": "surgery",
    "berenang": "swimming",
    "teori sistem": "systems-theory",
    "percukaian": "taxation",
    "taksonomi": "taxonomy",
    "teknologi": "technology",
    "telekomunikasi": "telecommunications",
    "televisyen": "television",
    "tenis": "tennis",
    "tekstil": "textiles",
    "teater": "theater",
    "teologi": "theology",
    "termodinamik": "thermodynamics",
    "topologi": "topology",
    "pelancongan": "tourism",
    "perdagangan": "commerce",
    "pengangkutan": "transport",
    "trigonometri": "trigonometry",
    "tipografi": "typography",
    "kenderaan": "vehicles",
    "perubatan veterinar": ["veterinary", "medicine"],
    "genre permainan video": "video-games",
    "permainan video": "video-games",
    "virologi": "virology",
    "volkanologi": "volcanology",
    "bola tampar": "volleyball",
    "senjata": "weapon",
    "cuaca": "weather",
    "reka bentuk web": "web-design",
    "angkat berat": "weightlifting",
    "wain": "wine",
    "pertukangan kayu": "carpentry",
    "gusti": "wrestling",
    "Islam": "Islam",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Translate the labels in ``data.raw_tags`` into tags and topics.

    Every raw tag found in ``TAGS`` contributes its translation to
    ``data.tags`` and every raw tag found in ``TOPICS`` contributes its
    translation to ``data.topics``; a label present in both tables feeds
    both lists.  A table is only consulted when ``data`` has the matching
    attribute.  Translations already present in the target list are not
    added a second time, so raw tags sharing a translation do not produce
    duplicates.

    ``data`` is modified in place: on return ``data.raw_tags`` holds only
    the raw tags that could not be translated, in their original order.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        translated = False
        for table, attr_name in ((TAGS, "tags"), (TOPICS, "topics")):
            if raw_tag not in table or not hasattr(data, attr_name):
                continue
            translated = True
            entry = table[raw_tag]
            # Table values are either one string or a list of strings.
            values = [entry] if isinstance(entry, str) else entry
            target = getattr(data, attr_name)
            for value in values:
                if value not in target:
                    target.append(value)
        if not translated:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated