Coverage for src/wiktextract/extractor/ko/tags.py: 90%
20 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-30 10:31 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-30 10:31 +0000
1from .models import WordEntry
# Reference pages on ko.wiktionary:
#   https://ko.wiktionary.org/wiki/모듈:labels/data/topical
#   https://ko.wiktionary.org/wiki/모듈:labels/data
#
# Raw Korean gloss label -> English tag.
GLOSS_TAGS = {
    "인명": "name",
    "고어": "archaic",
    "구식": "archaic",
    # Disabled entries: these labels have an empty target and are not mapped.
    # "대명동사": "",
    # "말고름": "",
    "비유": "metaphoric",
    "사어": "obsolete",  # dead language
    "유아어": "baby-talk",
    "자동사": "intransitive",
    "직역": "literally",
    "타동사": "transitive",
    "드물게": "rare",
    "원래의 의미": "naturally",
    "문학적": "literary",
    "해학적": "humorous",
    "완곡적": "euphemistic",
    "가산": "countable",
    "불가산": "uncountable",
}
# Raw pronunciation label -> English tag; a value is either a single tag
# (str) or several tags (list of str).
SOUND_TAGS = {
    # Template ko-IPA (wiki page "틀:ko-IPA")
    "Revised Romanization": ["revised", "romanization"],
    "Revised Romanization (translit.)": ["revised", "romanization", "transliteration"],
    "McCune-Reischauer": "McCune-Reischauer",
    "Yale Romanization": ["Yale", "romanization"],
    "표준어/서울": ["SK-Standard", "Seoul"],
    # Template ja-pron (wiki page "틀:ja-pron")
    "도쿄": "Tokyo",
    # Audio and IPA templates (wiki pages "틀:발음 듣기", "틀:IPA")
    "영국": "UK",
    "미국": "US",
    "영": "UK",
    "미": "US",
    "표준": "standard",
    "남부": "South",
    "북부": "North",
    "고대": "archaic",
    "동부": "East",
    "서부": "West",
    "포르투갈": "Portugal",
    "이집트": "Egypt",
    "시리아": "Syria",
    "브라질": "Brazil",
    "독일": "Germany",
    "현대": "modern",
    "캐나다": "Canada",
    "하노이": "Hanoi",
    "브라질 남부": "Southern-Brazil",
    "벨기에": "Belgium",
    "이란": "Iran",
    "파리": "Paris",
    "모로코": "Morocco",
    "베를린": "Berlin",
    "비격식체": "informal",
    "민난어 장저우": ["Min-Nan", "Zhangzhou"],
}
# Raw headword-line label -> English tag.
HEADER_TAGS = {
    # Korean verb template (wiki page "틀:한국어_동사")
    "부정사형": "infinitive",
    "연결어미형": "sequential",
    "명사형": "noun",
    "사동사": "causative",
}
# Raw translation label -> English tag (str) or tags (list of str).
# These labels also occur in linkage lists.
TRANSLATION_TAGS = {
    "남성": "masculine",
    "여성": "feminine",
    "라틴": "Latin",
    "중성": "neuter",
    "간체": "Simplified-Chinese",
    "번체": "Traditional-Chinese",
    "번체자": "Traditional-Chinese",
    "오스트리아": "Austria",
    "표준어": "standard",
    "히브리 문자": ["Hebrew", "letter"],
    "아랍 문자": ["Arabic", "letter"],
    "복수형": "plural",
    "단수": "singular",
    "복수": "plural",
    "불완료체": "imperfect",
    "완료체": "completive",
    "양성": "masculine",
    "바이에른 방언": ["Bavarian", "dialectal"],
    "광둥어": "Cantonese",
    "오스트레일리아": "Australia",
    "글라골 문자": ["Glagolitic", "letter"],
    "속어": "slang",
    "멕시코 속어": ["Mexico", "slang"],
    "에스파냐": "Spain",
    "가타카나": "katakana",
    "고어": "archaic",
    "쯔놈": "Chu-Nom",
    "형용사": "adjective",
    "사투리": "dialectal",
    "약자": "abbreviation",
    "동사": "verb",
    "드문 단어": "rare",
}
# Combined lookup table: the four category tables merged in the order
# listed (on a shared key the later table's value is the one kept),
# followed by a few labels that are only defined here.
TAGS = {
    **GLOSS_TAGS,
    **SOUND_TAGS,
    **HEADER_TAGS,
    **TRANSLATION_TAGS,
    # Template:zh-forms
    "정체": "Traditional-Chinese",
    # Same value as the "간체" entry already merged in from TRANSLATION_TAGS.
    "간체": "Simplified-Chinese",
    # Template:zh-x
    "대만 관화": "Taiwanese-Mandarin",
    "표준 중국어": "Standard-Chinese",
    "한어병음": "Pinyin",
    "광저우 광둥어": "Guangzhou-Cantonese",
    "월병": "Jyutping",
}
# Raw Korean subject label -> English topic name.
TOPICS = {
    "금융": "finance",
    "광고": "advertising",
    "군사": "military",
    "어류": "fish",
    "물리": "physics",
    "법률": "law",
    "식물": "botany",
    "역사": "history",
    "의류": "clothing",
    "의학": "medicine",
    "전기": "electricity",
    # Disabled entry, kept for reference:
    # "조류": "birds",
    "지리": "geography",
    "프로그래밍": "programming",
    "컴퓨터": "computer",
    "해부학": "anatomy",
    "정치": "politics",
    "종교": "religion",
    "가톨릭": "Catholicism",
    "축구": "football",
    # Disabled entry, kept for reference:
    # "체육": "physical-education",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Move recognised raw tags of *data* into its tags/topics, in place.

    Each entry of ``data.raw_tags`` is handled in order:

    * If it is a key of ``TAGS``, its translation is added to ``data.tags``
      (appended when the translation is a ``str``, extended when it is a
      ``list``).  ``TAGS`` is checked first, so such an entry is never
      looked up in ``TOPICS``.
    * Otherwise, if *data* has a ``topics`` attribute and the entry is a key
      of ``TOPICS``, the topic name is appended to ``data.topics``.
    * Otherwise the entry is kept.

    Finally ``data.raw_tags`` is rebound to a new list holding only the
    kept entries, in their original order.
    """
    leftover = []
    for label in data.raw_tags:
        if label in TAGS:
            mapped = TAGS[label]
            if isinstance(mapped, str):
                data.tags.append(mapped)
            elif isinstance(mapped, list):
                data.tags.extend(mapped)
            # A value of any other type adds nothing, but the label still
            # counts as handled and is not kept.
            continue
        if hasattr(data, "topics") and label in TOPICS:
            data.topics.append(TOPICS[label])
            continue
        leftover.append(label)
    data.raw_tags = leftover