Coverage for src/wiktextract/extractor/ko/tags.py: 90%
20 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from .models import WordEntry
# Label sources on Korean Wiktionary:
# https://ko.wiktionary.org/wiki/모듈:labels/data/topical
# https://ko.wiktionary.org/wiki/모듈:labels/data
#
# Maps a raw Korean gloss label to its English tag.
GLOSS_TAGS = {
    "인명": "name",
    "고어": "archaic",
    "구식": "archaic",
    # Labels seen in the data that have no English tag assigned yet:
    # "대명동사": "",
    # "말고름": "",
    "비유": "metaphoric",
    "사어": "obsolete",  # dead language
    "유아어": "baby-talk",
    "자동사": "intransitive",
    "직역": "literally",
    "타동사": "transitive",
    "드물게": "rare",
    "원래의 의미": "naturally",
    "문학적": "literary",
    "해학적": "humorous",
    "완곡적": "euphemistic",
}
# Maps a raw pronunciation label to its English tag(s).
# A value is either a single tag (str) or several tags (list of str).
SOUND_TAGS = {
    # Labels from 틀:ko-IPA (Template:ko-IPA)
    "Revised Romanization": ["revised", "romanization"],
    "Revised Romanization (translit.)": [
        "revised",
        "romanization",
        "transliteration",
    ],
    "McCune-Reischauer": "McCune-Reischauer",
    "Yale Romanization": ["Yale", "romanization"],
    "표준어/서울": ["SK-Standard", "Seoul"],
    # Labels from 틀:ja-pron (Template:ja-pron)
    "도쿄": "Tokyo",
    # Labels from 틀:발음 듣기 (audio pronunciation template) and 틀:IPA
    "영국": "UK",
    "미국": "US",
    "영": "UK",
    "미": "US",
    "표준": "standard",
    "남부": "South",
    "북부": "North",
    "고대": "archaic",
    "동부": "East",
    "서부": "West",
    "포르투갈": "Portugal",
    "이집트": "Egypt",
    "시리아": "Syria",
    "브라질": "Brazil",
    "독일": "Germany",
    "현대": "modern",
    "캐나다": "Canada",
    "하노이": "Hanoi",
    "브라질 남부": "Southern-Brazil",
    "벨기에": "Belgium",
    "이란": "Iran",
    "파리": "Paris",
    "모로코": "Morocco",
    "베를린": "Berlin",
    "비격식체": "informal",
    "민난어 장저우": ["Min-Nan", "Zhangzhou"],
}
# Maps a raw label from an entry's headword line to its English tag.
HEADER_TAGS = {
    # Labels from 틀:한국어_동사 (Korean verb template)
    "부정사형": "infinitive",
    "연결어미형": "sequential",
    "명사형": "noun",
    "사동사": "causative",
}
# Maps a raw label found in translation lists to its English tag(s).
# The same labels also occur in linkage lists.
# A value is either a single tag (str) or several tags (list of str).
TRANSLATION_TAGS = {
    "남성": "masculine",
    "여성": "feminine",
    "라틴": "Latin",
    "중성": "neuter",
    "간체": "Simplified Chinese",
    "번체": "Traditional Chinese",
    "번체자": "Traditional Chinese",
    "오스트리아": "Austria",
    "표준어": "standard",
    "히브리 문자": ["Hebrew", "letter"],
    "아랍 문자": ["Arabic", "letter"],
    "복수형": "plural",
    "단수": "singular",
    "복수": "plural",
    "불완료체": "imperfect",
    "완료체": "completive",
    "양성": "masculine",
    "바이에른 방언": ["Bavarian", "dialectal"],
    "광둥어": "Cantonese",
    "오스트레일리아": "Australia",
    "글라골 문자": ["Glagolitic", "letter"],
    "속어": "slang",
    "멕시코 속어": ["Mexico", "slang"],
    "에스파냐": "Spain",
    "가타카나": "katakana",
    "고어": "archaic",
    "쯔놈": "Chu-Nom",
    "형용사": "adjective",
    "사투리": "dialectal",
    "약자": "abbreviation",
    "동사": "verb",
    "드문 단어": "rare",
}
# Combined lookup table used by translate_raw_tags(). When the same key occurs
# in more than one source dict, the entry from the dict listed later wins.
TAGS = {**GLOSS_TAGS, **SOUND_TAGS, **HEADER_TAGS, **TRANSLATION_TAGS}
# Maps a raw Korean subject label to its English topic name.
TOPICS = {
    "금융": "finance",
    "광고": "advertising",
    "군사": "military",
    "어류": "fish",
    "물리": "physics",
    "법률": "law",
    "식물": "botany",
    "역사": "history",
    "의류": "clothing",
    "의학": "medicine",
    "전기": "electricity",
    # Disabled entry, kept for reference:
    # "조류": "birds",
    "지리": "geography",
    "프로그래밍": "programming",
    "컴퓨터": "computer",
    "해부학": "anatomy",
    "정치": "politics",
    "종교": "religion",
    "가톨릭": "Catholicism",
    "축구": "football",
    # Disabled entry, kept for reference:
    # "체육": "physical-education",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Move recognised raw tags of *data* into its ``tags`` / ``topics`` lists.

    Each entry of ``data.raw_tags`` is handled as follows:

    * If it is a key of ``TAGS``, the mapped value is added to ``data.tags``
      (appended when it is a string, extended when it is a list).
    * Otherwise, if *data* has a ``topics`` attribute and the entry is a key
      of ``TOPICS``, the mapped topic is appended to ``data.topics``.
    * Otherwise the entry is kept as a raw tag.

    ``data.raw_tags`` is replaced by a new list holding only the entries that
    were kept. The function mutates *data* in place and returns ``None``.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in TAGS:
            tr_tag = TAGS[raw_tag]
            if isinstance(tr_tag, str):
                data.tags.append(tr_tag)
            elif isinstance(tr_tag, list):
                data.tags.extend(tr_tag)
        # Not every model passed in has a ``topics`` field, hence the check.
        elif hasattr(data, "topics") and raw_tag in TOPICS:
            data.topics.append(TOPICS[raw_tag])
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated