Coverage for src/wiktextract/extractor/ms/tags.py: 64%
28 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from .models import WordEntry
# Translation table for gender/number labels.
# Each key is a label as it appears in a raw tag; each value is the English
# tag to record instead, either a single string or a list of strings when one
# label expands to several tags.  Merged into TAGS further down the file.
GENDER_TAGS = {
    # Modul:gender_and_number/data
    "m": "masculine",
    "f": "feminine",
    "n": "neuter",
    "c": "common",
    "neutral": "neutral",
    "bernyawa": "animate",
    "tak bernyawa": "inanimate",
    "haiwan": "animal-not-person",
    "peribadi": "personal",
    "tak peribadi": "impersonal",
    "vir": "virile",
    "nvir": "nonvirile",
    # NOTE(review): presumably "mf" abbreviates "mufrad" (singular) rather
    # than "masculine/feminine" -- confirm against the wiki module.
    "mf": "singular",
    "du": "dual",
    "jm": "plural",  # presumably short for "jamak" -- TODO confirm
    "impf": "imperfective",
    "pf": "perfective",
    # Labels that expand to more than one tag.
    "takrifan sama": ["masculine", "feminine"],
    "mengikut keadaan": ["masculine", "feminine"],
}
26LB_TAGS = {
27 # Modul:labels/data
28 "kependekan": "abbreviation",
29 "akronim": "acronym",
30 "transitif": "transitive",
31 "tidak transitif": "intransitive",
32 "jussive": "jussive",
33 "arkaik": "archaic",
34 "atelic": "imperfective",
35 "kata bantu": ["auxiliary", "verb"],
36 "nombor kardinal": "cardinal",
37 "kausatif": "causative",
38 # "berbilang": "",
39 "kebudak-budakan": "childish",
40 "chữ Nôm Vietnam": ["Chữ-Nôm", "Vietnam"],
41 "hinaan": "offensive",
42 "hinaan kaum": ["ethnic", "offensive"],
43 "eufemisme": "euphemistic",
44 "kiasan": "figuratively",
45 "jenaka": "humorous",
46 "tidak formal": "informal",
47 "ironi": "ironic",
48 "harfiah": "literally",
49 "slanga perubatan": "slang", # medicine
50 "metonim": "metonymically",
51 "neologisme": "neologism",
52 "bentuk bukan baku": "nonstandard",
53 "usang": "obsolete",
54 "lapuk": "obsolete",
55 "kata kasar": "impolite",
56 "sopan": "polite",
57 "pasca-Klasik": "post-Classical",
58 "slanga penjara": "slang", # prison
59 # "hina agama": "",
60 "slanga": "slang",
61 "slanga sekolah": "slang", # school
62 # "hina diri": "",
63 "slanga universiti": "slang", # university
64 "sinkop": "syncope",
65 # "teknikal": "", # technical
66 "slanga mesej": "slang", # message
67 "lucah": "vulgar",
68 "Amerika": "America",
69 "Politik Malaysia": "Malaysia",
70 "retorik": "rhetoric",
71 "Kesatuan Soviet": "Soviet Union",
72 "peribahasa": "proverb",
73}
# Translation table for labels that, judging by the name, come from the
# part-of-speech header line (assumption -- the caller is not visible here).
# Keys are the labels as written on the wiki; values are the English tag
# (string) or list of tags.  Merged into TAGS further down the file.
POS_HEADER_TAGS = {
    "ejaan Jawi": "Jawi",
    "genitif": "genitive",
    "jamak": "plural",
    "terbilang dan tidak terbilang": ["countable", "uncountable"],
    "bentuk jamak": "plural",
    "Ejaan bahasa Urdu": "Urdu",
    "partitif": "partitive",
}
# Translation table for pronunciation / transcription-system labels.
# Values are the hyphenated English tag names.  Merged into TAGS further
# down the file.
SOUND_TAGS = {
    "Received Pronunciation": "Received-Pronunciation",
    "General American": "General-American",
    "UK": "UK",
    # NOTE(review): "A.S." is presumably "Amerika Syarikat" (United States);
    # it is deliberately mapped to the same tag as "General American".
    "A.S.": "General-American",
    "Pinyin": "Pinyin",
    "Wade-Giles": "Wade-Giles",
}
# Single lookup table used by translate_raw_tags(): the union of all tag
# tables.  Tables are applied left to right, so when the same label occurs in
# more than one table the value from the later table wins.
TAGS = {
    label: translation
    for table in (GENDER_TAGS, POS_HEADER_TAGS, SOUND_TAGS, LB_TAGS)
    for label, translation in table.items()
}
# Translation table for topical (subject-area) labels.
# Each key is a label as it appears in a raw tag; each value is the English
# topic (string) or list of topics recorded in its place.  Emitted topics
# never contain spaces: multi-word topics are hyphenated
# (e.g. "computer-games").  Commented-out entries have no agreed translation
# yet and are therefore left untranslated in ``raw_tags``.
TOPICS = {
    # Modul:labels/data/topical
    "perakaunan": "accounting",
    "akustik": "acoustics",
    "lakonan": "acting",
    "periklanan": "advertising",
    "aeronautik": "aeronautics",
    "pertanian": "agriculture",
    "alkimia": "alchemy",
    "alkohol": "beverages",
    "algebra": "algebra",
    "geometri algebra": ["geometry", "algebra"],
    "perubatan alternatif": "alternative-medicine",
    "bola sepak Amerika": "American-football",
    "biokimia": "biochemistry",
    # "analisis": "analysis",
    "analytic geometry": "geometry",
    "kimia analisis": "chemistry",
    "anarkisme": "anarchism",
    "anatomi": "anatomy",
    "animasi": "anime",
    "anime": "anime",
    "antropologi": "anthropology",
    "araknologi": "arachnology",
    "arkeologi": "archeology",
    "memanah": "archery",
    "seni bina": "architecture",
    "mitologi Armenia": ["Armenia", "mythology"],
    "kecerdasan buatan": "artificial-intelligence",
    "seni": "arts",
    "uranography": "uranography",
    "astrologi": "astrology",
    "astronautik": "astronautics",
    "astronomi": "astronomy",
    "astrofizik": "astrophysics",
    "mitologi Asturia": ["Asturia", "mythology"],
    "olahraga": "sports",
    "auto racing": "racing",
    "automotif": "automotive",
    "penerbangan": "aviation",
    "backgammon": "backgammon",
    "bakteriologi": "bacteriology",
    "badminton": "badminton",
    "permainan bola": "ball-games",
    "balet": "ballet",
    "perbankan": "banking",
    "besbol": "baseball",
    # NOTE(review): "kerangjang" looks like a misspelling of "keranjang".
    # The original key is kept for backward compatibility and the standard
    # spelling is accepted as well.
    "bola kerangjang": "basketball",
    "bola keranjang": "basketball",
    "BDSM": "BDSM",
    "beekeeping": "beekeeping",
    "perjudian": "gambling",
    "Alkitab": ["Christianity", "biblical", "religion"],
    "biblical": "biblical",
    "billiards": "billiards",
    "bingo": "bingo",
    "biologi": "biology",
    "bioteknologi": "biotechnology",
    "birdwatching": "birdwatching",
    "blogging": "blogging",
    "permainan papan": "board-games",
    "board sports": "board-games",
    "bina badan": "bodybuilding",
    "botani": "botany",
    "boling": "bowling",
    "tinju": "boxing",
    "brewing": "brewing",
    "bridge": "bridge",
    "penyiaran": "broadcasting",
    "briologi": "bryology",
    "Buddhisme": "Buddhism",
    "bullfighting": "bullfighting",
    "perniagaan": "commerce",
    "kalkulus": "calculus",
    "Canadian football": "football",
    "zoologi anjing": ["zoology", "dogs"],
    "kardiologi": "cardiology",
    "cartography": "cartography",
    "category theory": "category-theory",
    "caving": "caving",
    "Celtic mythology": "Celtic-mythology",
    "seramik": "ceramics",
    "cheerleading": "cheerleading",
    "kejuruteraan kimia": "chemistry-engineering",
    "kimia": "chemistry",
    "catur": "chess",
    "mitologi Cina": "Chinese-mythology",
    "Kristian": "Christianity",
    "cinematography": "cinematography",
    "mekanik klasik": "classical-mechanics",
    "classical studies": "classical-studies",
    "climatology": "climatology",
    "climbing": "climbing",
    "clinical psychology": "clinical-psychology",
    "combinatorics": "combinatorics",
    "comedy": "comedy",
    "komik": "comics",
    "komunikasi": "communications",
    "komunisme": "communism",
    "analisis kompleks": "complex-analysis",
    "permainan komputer": "computer-games",
    # Hyphenated like every other multi-word topic in this table.
    "grafik komputer": "computer-graphics",
    "perkakasan komputer": "computer-hardware",
    "sains komputer": "computer-sciences",
    "pengkomputan": "computing",
    "computing theory": "computing-theory",
    "conchology": "conchology",
    "pembinaan": "construction",
    "memasak": "cooking",
    "hak cipta": "copyright",
    "kosmetik": "cosmetics",
    "kriket": "cricket",
    "criminology": "criminology",
    "cryptography": "cryptography",
    "cryptozoology": "cryptozoology",
    "crystallography": "crystallography",
    "curling": "curling",
    "numismatik": "numismatics",
    "cycling": "cycling",
    "cytology": "cytology",
    "dance": "dance",
    "darts": "darts",
    "pangkalan data": "databases",
    "demoscene": "demoscene",
    "pergigian": "dentistry",
    "dermatologi": "dermatology",
    "diplomasi": "diplomacy",
    "pathology": "pathology",
    "diving": "diving",
    "domino": "dominoes",
    "drama": "drama",
    "dressage": "dressage",
    "penghasilan makanan": "food-manufacture",
    # "sains bumi": "",
    "ekologi": "ecology",
    "ekonomi": "economy",
    "pendidikan": "education",
    "kejuruteraan elektrik": "electrical-engineering",
    "keelektrikan": "electricity",
    "keelektromagnetan": "electromagnetism",
    "elektronik": "electronic",
    "embriologi": "embryology",
    "perubatan kecemasan": "emergency-medicine",
    "kejuruteraan": "engineering",
    "entomologi": "entomology",
    "enzim": "enzyme",
    "epidemiologi": "epidemiology",
    "epistemologi": "epistemology",
    "etika": "ethics",
    "etnografi": "ethnography",
    "senaman": "exercise",
    "falconry": "falconry",
    "fesyen": "fashion",
    "kimia organik": "organic-chemistry",
    "filem": "film",
    "kewangan": "finance",
    "memancing": "fishing",
    "dinamik bendalir": "fluid-dynamics",
    "perhutanan": "forestry",
    "perabot": "furniture",
    "genealogi": "genealogy",
    "genetik": "genetics",
    "geografi": "geography",
    "geologi": "geology",
    "geometri": "geometry",
    "geomorfologi": "geomorphology",
    "gerontologi": "gerontology",
    "golf": "golf",
    "kerajaan": "government",
    "tatabahasa": "grammar",
    "gimnastik": "gymnastics",
    "ginekologi": "gynecology",
    "hematologi": "hematology",
    "Hinduisme": "Hinduism",
    "historiografi": "historiography",
    "sejarah": "history",
    "hoki": "hockey",
    "homeopati": "homeopathy",
    "hormon": "hormone",
    "lumba kuda": "horse-racing",
    "horticulture": "horticulture",
    "sumber manusia": "human-resources",
    "kemanusiaan": "humanity",
    "perburuan": "hunting",
    "hidrologi": "hydrology",
    "hoki ais": "ice-hockey",
    "imunokimia": "immunochemistry",
    "imunologi": "immunology",
    "sains maklumat": "information-science",
    "teori maklumat": "information-theory",
    "fizik": "physics",
    "kewartawanan": "journalism",
    "judo": "judo",
    "undang-undang": "law",
    "leksikografi": "lexicography",
    "likenologi": "lichenology",
    "limnologi": "limnology",
    "linguistik": "linguistics",
    "kesusasteraan": "literature",
    "logik": "logic",
    "malakologi": "malacology",
    "pemasaran": "marketing",
    "Marxisme": "Marxism",
    "sains bahan": "material-science",
    "matematik": "mathematics",
    "mekanik": "mechanics",
    "perubatan": "medicine",
    "metalurgi": "metallurgy",
    "meteorologi": "meteorology",
    "metrologi": "metrology",
    "mikrobiologi": "microbiology",
    "ketenteraan": "military",
    "mineralogi": "mineralogy",
    "perlombongan": "mining",
    "wang": "money",
    "otot": "muscle",
    "muzik": "music",
    "alat muzik": "musical-instrument",
    "mikologi": "mycology",
    "mitologi": "mythology",
    "nanoteknologi": "nanotechnology",
    "nautika": "nautical",
    "Nazisme": "Nazism",
    "neuroanatomi": "neuroanatomy",
    "neurologi": "neurology",
    "neurosains": "neuroscience",
    "fizik nuklear": "nuclear-physics",
    "teori nombor": "number-theory",
    "oseanografi": "oceanography",
    "onkologi": "oncology",
    "permainan dalam talian": "video-games",
    "optik": "optics",
    "sebatian organik": "organic-compound",
    "ornitologi": "ornithology",
    "ortodontik": "orthodontics",
    "paleontologi": "paleontology",
    "parapsikologi": "parapsychology",
    "fizik zarah": "particle-physics",
    "pempasteuran": "pasteurization",
    "patologi": "pathology",
    "petrokimia": "petrochemical",
    "petrologi": "petrology",
    "farmakologi": "pharmacology",
    "farmasi": "pharmacy",
    "filateli": "philately",
    "falsafah": "philosophy",
    "fonetik": "phonetics",
    "fonologi": "phonology",
    "fotografi": "photography",
    "kimia fizik": ["physics", "chemistry"],
    "fisiologi": "physiology",
    "planetologi": "planetology",
    "toksikologi": "toxicology",
    "sains politik": "political-science",
    "politik": "politics",
    # Also present in LB_TAGS, so this label yields both a tag and a topic.
    "Politik Malaysia": "politics",
    "pornografi": "pornography",
    "percetakan": "printing",
    "teori kebarangkalian": "probability-theory",
    "pengaturcaraan": "programming",
    "undang-undang hartanah": ["real-estate", "law"],
    "psikiatri": "psychiatry",
    "psikoanalisis": "psychoanalysis",
    "psikologi": "psychology",
    "psikoterapi": "psychotherapy",
    "penerbitan": "publishing",
    "mekanik kuantum": "quantum-mechanics",
    "pengangkutan rel": "rail-transport",
    "agama": "religion",
    "robotik": "robotics",
    "Roman Katolik": "Roman-Catholicism",
    "mitologi Rom": "Roman-mythology",
    "ragbi": "rugby",
    "cereka sains": "science-fiction",
    "sains": "sciences",
    "seismologi": "seismology",
    "semantik": "semantics",
    "semiotik": "semiotics",
    "teori set": "set-theory",
    "menjahit": "sewing",
    "keseksualan": "sexuality",
    "pemprosesan isyarat": "signal-processing",
    "menyanyi": "singing",
    "snuker": "snooker",
    "bola sepak": "soccer",
    "sains sosial": "social-science",
    "sosialisme": "socialism",
    "media sosial": "social-media",
    "sosiolinguistik": "sociolinguistics",
    "sosiologi": "sociology",
    "bola lisut": "softball",
    "perisian": "software",
    "kejuruteraan perisian": "software-engineering",
    "sains tanah": "soil-science",
    "bunyi": "sound",
    "kejuruteraan bunyi": "sound-engineering",
    "sains angkasa": "space-science",
    "spektroskopi": "spectroscopy",
    "sukan": "sports",
    "skuasy": "squash",
    "statistik": "statistics",
    "pasaran saham": "stock-market",
    "subbudaya": "subculture",
    "kesufian": "Sufism",
    "pembedahan": "surgery",
    "berenang": "swimming",
    "teori sistem": "systems-theory",
    "percukaian": "taxation",
    "taksonomi": "taxonomy",
    "teknologi": "technology",
    "telekomunikasi": "telecommunications",
    "televisyen": "television",
    "tenis": "tennis",
    "tekstil": "textiles",
    "teater": "theater",
    "teologi": "theology",
    "termodinamik": "thermodynamics",
    "topologi": "topology",
    "pelancongan": "tourism",
    "perdagangan": "commerce",
    "pengangkutan": "transport",
    "trigonometri": "trigonometry",
    "tipografi": "typography",
    "kenderaan": "vehicles",
    "perubatan veterinar": ["veterinary", "medicine"],
    "genre permainan video": "video-games",
    "permainan video": "video-games",
    "virologi": "virology",
    "volkanologi": "volcanology",
    "bola tampar": "volleyball",
    "senjata": "weapon",
    "cuaca": "weather",
    "reka bentuk web": "web-design",
    "angkat berat": "weightlifting",
    "wain": "wine",
    "pertukangan kayu": "carpentry",
    "gusti": "wrestling",
    "Islam": "Islam",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Move recognised labels from ``data.raw_tags`` into tags and topics.

    Each entry of ``data.raw_tags`` is looked up in ``TAGS`` and ``TOPICS``.
    A hit in ``TAGS`` is added to ``data.tags`` and a hit in ``TOPICS`` to
    ``data.topics``, but only when ``data`` actually has that attribute.
    A table value may be a single string or a list of strings.  A label
    found in both tables contributes to both attributes.

    ``data`` is modified in place: afterwards ``data.raw_tags`` holds only
    the labels that could not be translated, in their original order.
    """
    # (lookup table, name of the attribute that receives the translation)
    lookups = ((TAGS, "tags"), (TOPICS, "topics"))
    unresolved = []
    for label in data.raw_tags:
        matched = False
        for table, attr_name in lookups:
            if label not in table or not hasattr(data, attr_name):
                continue
            matched = True
            translated = table[label]
            target = getattr(data, attr_name)
            if isinstance(translated, str):
                target.append(translated)
            elif isinstance(translated, list):
                target.extend(translated)
        if not matched:
            # Keep labels nobody recognised so they are not silently lost.
            unresolved.append(label)
    data.raw_tags = unresolved