Coverage for src/wiktextract/extractor/zh/tags.py: 100%
31 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from .models import WordEntry
2from .topics import LABEL_TOPICS
# Grammatical gender labels as written on zh.wiktionary, mapped to English tags.
GENDER_TAGS: dict[str, str] = {
    "陰性": "feminine",
    "陽性": "masculine",
    "中性": "neuter",
}
# Grammatical number labels, including forms combined with definiteness,
# case or gender, mapped to English tag strings.
NUMBER_TAGS: dict[str, str] = {
    "單數": "singular",
    "複數": "plural",
    "定單數": "definite singular",
    "不定複數": "indefinite plural",
    "定複數": "definite plural",
    "斜格複數": "oblique plural",
    "主格單數": "nominative singular",
    "主格複數": "nominative plural",
    "屬格單數": "genitive singular",
    "屬格複數": "genitive plural",
    "陰性單數": "feminine singular",
    "陽性單數": "masculine singular",
    "陰性複數": "feminine plural",
    "陽性複數": "masculine plural",
    "中性複數": "neuter plural",
    "中性單數": "neuter singular",
}
# Countability labels.
# Reference: https://en.wikipedia.org/wiki/Count_noun
COUNT_TAGS: dict[str, str] = {
    "可數": "countable",
    "不可數": "uncountable",
}
# Grammatical labels that do not belong to any of the other tables.
OTHER_TAGS: dict[str, str] = {
    "指小詞": "diminutive",
    "變格類型": "declension pattern",
}
# Verb labels: transitivity and the verb-object construction.
VERB_TAGS: dict[str, str] = {
    "及物": "transitive",
    "不及物": "intransitive",
    "动宾结构": "verb-object",
}
# Japanese stem form names. Two pairs of keys are spelling variants of the
# same form name and map to the same tag.
# Reference: https://en.wikipedia.org/wiki/Japanese_grammar#Stem_forms
JA_STEM_FORMS: dict[str, str] = {
    "未然形": "imperfective",
    "連用形": "continuative",
    "終止形": "terminal",
    "連體形": "attributive",
    "連体形": "attributive",
    "假定形": "hypothetical",
    "仮定形": "hypothetical",
    "命令形": "imperative",
}
# Labels for derived verb forms (voice and related form names).
# Reference: https://en.wikipedia.org/wiki/Voice_(grammar)
VOICE_TAGS: dict[str, str] = {
    "被動形": "passive",
    "使役形": "causative",
    "可能形": "potential",
    "意志形": "volitional",
    "否定形": "negative",
    "否定連用形": "negative continuative",
    "尊敬形": "formal",
    "完成形": "perfective",
    "接續形": "conjunctive",
    "條件形": "hypothetical conditional",
}
# Degrees of comparison.
# Reference: https://en.wikipedia.org/wiki/Comparison_(grammar)
COMPARISON_TAGS: dict[str, str] = {
    "原级": "positive",
    "比較級": "comparative",
    "最高級": "superlative",
}
# All grammatical tag tables merged into one lookup. When a key occurs in
# more than one table, the table merged later supplies the value.
GRAMMATICAL_TAGS: dict[str, str] = (
    GENDER_TAGS
    | NUMBER_TAGS
    | COUNT_TAGS
    | OTHER_TAGS
    | VERB_TAGS
    | JA_STEM_FORMS
    | VOICE_TAGS
    | COMPARISON_TAGS
)
# Label text produced by the label/qualifier templates, mapped to English
# tags. A value is either a single string or a list of tag strings.
# Sources:
# https://zh.wiktionary.org/wiki/Template:Label
# https://zh.wiktionary.org/wiki/Module:Labels/data
# https://zh.wiktionary.org/wiki/Template:Qualifier
# https://zh.wiktionary.org/wiki/Template:古
# https://zh.wiktionary.org/wiki/Template:注释
LABEL_TAGS = {
    "棄用": "obsolete",
    "比喻": "figuratively",
    "古": "archaic",
    "陽": "masculine",
    "陰": "feminine",
    "喻": "figuratively",
    "書": "literary",
    "口": "colloquial",
    "俚": "slang",
    "俗": "slang",
    "方": "dialectal",
    "废": "obsolete",
    "貶": "derogatory",
    "罕": "rare",
    "引": "broadly",
    "現已罕用": "archaic",
    # Module:Labels/data
    "back slang": "slang",
    "synecdochically": "synecdoche",
    "不再自由造詞": "idiomatic",
    "不及物": "intransitive",
    "不可數": "uncountable",
    "不定": "indefinite",
    "不常見": "uncommon",
    "不推薦使用": "proscribed",
    "中性": "neuter",
    "中間被動語態": "mediopassive",
    "中間語態": "middle",
    "主動語態": "active",
    "主要用於否定": "usually with-negation",
    "交互": "reciprocal",
    "以單數形式": "singular",
    "以複數形式": "in-plural",
    "作定語": "attributive",
    "作格": "ergative",
    "作表語": "predicative",
    "使役": "causative",
    "俗語": "idiomatic",
    "俚語": "slang",
    "兒童用語": "childish",
    "公文": "bureaucratese",
    "冒犯": "offensive",
    "分詞": "participle",
    "前古典": "pre-Classical",
    "助動詞": "auxiliary",
    "助記符": "mnemonic",
    "及物": "transitive",
    "反問句": "rhetoric",
    "反身": "reflexive",
    "口語": "colloquial",
    "古舊": "archaic",
    "可數": "countable",
    "同性戀俚語": "slang LGBT",
    "名詞化": "noun-from-verb",
    # Previously "singular singular-only singular"; the repeated word was
    # dropped so the value parallels "唯複" below.
    "唯單": "singular singular-only",
    "唯複": "plural plural-only",
    "國際音標": "IPA",
    "基數詞": "cardinal",
    "大寫": "capitalized",
    "委婉": "euphemistic",
    "字面義": "literally",
    "完整": "perfect",
    "完整體": "perfective",
    "定語": "attributive",
    "實詞": "substantive",
    "尊敬": "honorific",
    "常用複數": "plural-normally",
    "幽默": "humorous",
    "序數詞": "ordinal",
    "廣義來說": "broadly",
    "引申": "broadly",
    "弱祈使式": "jussive",
    "強調": "emphatic",
    "後古典": "obsolete",
    "性別中立": "gender-neutral",
    "情態": "modal",
    "愛稱": "endearing",
    "所有格代詞": "possessive pronoun without-noun",
    "押韻俚語": "slang",
    "抽象名詞": "abstract-noun",
    "擬態詞": "ideophonic",
    "擬聲詞": "onomatopoeic",
    "新詞": "neologism",
    "方言": "dialectal",
    "書面": "literary",
    "有比較級": "comparable",
    "有生": "animate",
    "正式": "formal",
    "歷史": "historical",
    "比喻義": "figuratively",
    "無人稱": "impersonal",
    "無比較級": "not-comparable",
    "無生": "inanimate",
    "焦點": "focus",
    "狹義": "narrowly",
    "監獄俚語": "slang",
    "直陳語氣": "indicative",
    "短信": "Internet",
    "祈使語氣": "imperative",
    "禮貌": "polite",
    "種族歧視語": "slur",
    "粉絲用語": "slang lifestyle",
    "粗俗": "vulgar",
    "系動詞": "copulative",
    "網路用語": "Internet",
    "縮寫": "abbreviation",
    "罕用": "rare",
    "臨時語": "nonce-word",
    "虛擬語氣": "subjunctive",
    "表語": "predicative",
    "被動語態": "passive",
    "視覺方言": "pronunciation-spelling",
    "親切": "familiar",
    "詈語": "expletive",
    "詩歌": "poetic",
    "誇飾": "excessive",
    "語中音省略": "syncope",
    "諷刺": "sarcastic",
    "謙遜": "humble",
    "貶義": "derogatory",
    "轉喻義": "metonymically",
    "返璞詞": "retronym",
    "過時": "dated",
    "陰性": "feminine",
    "陽性": "masculine",
    "雙及物動詞": "ditransitive",
    "靜態動詞": "stative",
    "非完整": "imperfect",
    "非完整體": "imperfective",
    "非常罕用": "rare",
    "非標準": "nonstandard",
    "非標準形式": "nonstandard",
    "非正式": "informal",
    "首字母縮略詞": "initialism",
    "駭客語": "Leet Internet",
    "高語域": "honorific",
    "中醫": "Traditional-Chinese-Medicine",
    "修辭學": "rhetoric",
    "印度教": "Hinduism",
    "摩門教": "Mormonism",
    # NOTE(review): "物理" normally means "physics"; confirm that "particle"
    # is the intended tag here.
    "物理": "particle",
    "猶太教": "Judaism",
    "納粹主義": "Nazism",
    "網際網路": "Internet",
    "耆那教": "Jainism",
    "聖經": "Biblical",
    "解剖學": "anatomy",
    "貴格會": "Quakerism",
    "錫克教": "Sikhism",
    "馬克思主義": "Marxism",
    # also from Module:Labels/data, but translated manually
    "喃字": "Chu-Nom",
    "反身代詞": "reflexive",
    "字面意義": "literally",
    "成語": "Chengyu",
    "及物、不及物": ["transitive", "intransitive"],
    "集合名詞": "collective",
    "控制動詞": "control-verb",
    "省略": "ellipsis",
    "分數": "fractional",
    "以雙數形式": "dual",
    "主要用於否定複數": ["negative", "plural"],
    "數詞縮寫": ["numeral", "abbreviation"],
    "主要用於肯定": "positive",
}
# Labels emitted by the Chinese example-sentence template.
# https://zh.wiktionary.org/wiki/Template:Zh-x
# https://zh.wiktionary.org/wiki/Module:Zh-usex/data
ZH_X_TAGS = {
    "繁體": "Traditional Chinese",
    "簡體": "Simplified Chinese",
    "繁體和簡體": ["Traditional Chinese", "Simplified Chinese"],
    "漢語拼音": "Pinyin",
    "粵拼": "Jyutping",
    "現代標準漢語": "Standard Chinese",
    "文言文": "Classical Chinese",
    "官話白話文": "Written vernacular Chinese",
    "粵語": "Cantonese",
    "吳語": "Wu",
    "廣州話": "Cantonese",
}
# Chinese variety names used by the classifier template.
# https://zh.wiktionary.org/wiki/Template:zh-mw
# https://zh.wiktionary.org/wiki/Module:Zh/templates
ZH_TAGS = {
    "官話": "Mandarin",
    "贛語": "Gan",
    "客家話": "Hakka",
    "晉語": "Jin",
    "閩北語": "Northern Min",
    "閩東語": "Eastern Min",
    "閩南語": "Southern Min",
    "潮州話": "Teochew",
    "湘語": "Xiang",
}
# Romanization and transcription system names shown by the Chinese
# pronunciation template.
# https://zh.wiktionary.org/wiki/Template:Zh-pron
# https://zh.wiktionary.org/wiki/Module:Zh-pron
ZH_PRON_TAGS = {
    "拼音": "Pinyin",
    "注音": "Bopomofo",
    "潮州話拼音": "Peng'im",
    "上海": "Shanghai",
    "吳語學堂拼音": "Wugniu",
    "通用拼音": "Tongyong-Pinyin",
    "威妥瑪拼音": "Wade–Giles",
    "耶魯官話拼音": "Yale",
    "國語羅馬字": "Gwoyeu-Romatsyh",
    "西里爾字母轉寫": "Palladius",
    "漢語國際音標": "Sinological-IPA",
    "耶魯粵拼": ["Yale", "Jyutping"],
    "廣州話拼音": ["Cantonese", "Pinyin"],
    "廣東拼音": "Guangdong-Romanization",
    "國際音標": "IPA",
    "模仿白話字": "POJ",
}
# Single lookup table combining every tag table in this module. When a key
# occurs in more than one table, the table merged later supplies the value.
ALL_TAGS = (
    GRAMMATICAL_TAGS
    | LABEL_TAGS
    | ZH_X_TAGS
    | ZH_TAGS
    | ZH_PRON_TAGS
)
def translate_raw_tags(data: WordEntry) -> WordEntry:
    """Move recognised raw tags of *data* into its ``tags`` / ``topics``.

    Each entry of ``data.raw_tags`` is handled as follows:

    * found in ``ALL_TAGS``: the translated tag (or every tag of a
      list-valued translation) is added to ``data.tags`` unless it is
      already present;
    * otherwise found in ``LABEL_TOPICS`` and *data* has a ``topics``
      attribute: the topic is added to ``data.topics`` unless it is
      already present;
    * otherwise the raw tag is kept.

    ``data.raw_tags`` is replaced by the list of raw tags that were kept.
    *data* is modified in place and also returned.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in ALL_TAGS:
            translated = ALL_TAGS[raw_tag]
            # A table value is either one tag string or a list of tags;
            # normalise to a list so both kinds get the same duplicate check
            # (list values used to be appended without one).
            if isinstance(translated, str):
                translated = [translated]
            if isinstance(translated, list):
                for tag in translated:
                    if tag not in data.tags:
                        data.tags.append(tag)
        elif raw_tag in LABEL_TOPICS and hasattr(data, "topics"):
            topic = LABEL_TOPICS[raw_tag]
            if topic not in data.topics:
                data.topics.append(topic)
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated
    return data
# Gender/number argument codes accepted by the translation and headword
# templates, mapped to English tag strings.
# NOTE(review): several values contain more than one word (for example
# "singular number" and "masculine feminine"); confirm that callers expect
# each of them as a single string.
# https://zh.wiktionary.org/wiki/Template:T
# https://zh.wiktionary.org/wiki/Template:Head
# https://zh.wiktionary.org/wiki/Module:Gender_and_number
TEMPLATE_TAG_ARGS = {
    "f": "feminine",
    "m": "masculine",
    "n": "neuter",
    "c": "common",
    # Animacy
    "an": "animate",
    "in": "inanimate",
    # Animal (for Ukrainian, Belarusian, Polish)
    "anml": "animal",
    # Personal (for Ukrainian, Belarusian, Polish)
    "pr": "personal",
    # Nonpersonal not currently used
    "np": "nonpersonal",
    # Virility (for Polish)
    "vr": "virile",
    "nv": "nonvirile",
    # Numbers
    "s": "singular number",
    "d": "dual number",
    "p": "plural number",
    # Verb qualifiers
    "impf": "imperfective aspect",
    "pf": "perfective aspect",
    "mf": "masculine feminine",
}