Coverage for src/wiktextract/extractor/zh/tags.py: 100%
32 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1from .models import WordEntry
2from .topics import LABEL_TOPICS
# Chinese grammatical-gender labels mapped to English gender tags.
# Merged into GRAMMATICAL_TAGS below.
GENDER_TAGS: dict[str, str] = {
    "陰性": "feminine",
    "阴性": "feminine",
    "陰性形式": "feminine",
    "陰性等價詞": "feminine",
    "陽性": "masculine",
    "陽性形式": "masculine",
    "中性": "neuter",
    "中性形式": "neuter",
}
# Grammatical-number labels, alone or combined with definiteness, case or
# gender. A list value means one Chinese label expands to several tags.
NUMBER_TAGS: dict[str, str | list[str]] = {
    "單數": "singular",
    "单数": "singular",
    "複數": "plural",
    "复数": "plural",
    "定單數": ["definite", "singular"],
    "定单数": ["definite", "singular"],
    "不定單數": ["indefinite", "singular"],
    "不定单数": ["indefinite", "singular"],
    "不定複數": ["indefinite", "plural"],
    "不定复数": ["indefinite", "plural"],
    "定複數": ["definite", "plural"],
    "斜格複數": ["oblique", "plural"],
    "主格單數": ["nominative", "singular"],
    "主格複數": ["nominative", "plural"],
    "屬格單數": ["genitive", "singular"],
    "屬格複數": ["genitive", "plural"],
    "陰性單數": ["feminine", "singular"],
    "陽性單數": ["masculine", "singular"],
    "陰性複數": ["feminine", "plural"],
    "陽性複數": ["masculine", "plural"],
    "中性複數": ["neuter", "plural"],
    "中性單數": ["neuter", "singular"],
    "賓格單數": ["accusative", "singular"],
    "賓格複數": ["accusative", "plural"],
    "無複數": "no-plural",
}
# Countability labels.
# https://en.wikipedia.org/wiki/Count_noun
COUNT_TAGS: dict[str, str] = {
    "可數": "countable",
    "不可數": "uncountable",
}
# Miscellaneous labels that do not fit the other grammatical tables.
# Values may be a single tag or a list of tags, hence the union type.
OTHER_TAGS: dict[str, str | list[str]] = {
    "指小詞": "diminutive",
    "指小": "diminutive",
    "變格類型": "declension-pattern-of",
    "屬格": "genitive",
    "部分格": "partitive",
    "個人": "person",
    "無屈折": "indeclinable",
    "諺文": "hangeul",
    "漢字": "hanja",
    # Template:cs-proper noun
    "相關形容詞": ["relational", "adjective"],
    "關係形容詞": ["relational", "adjective"],
    "居民稱謂詞": "demonym",
    "女性居民稱謂詞": ["feminine", "demonym"],
    "定賓格": ["definite", "accusative"],
    "定宾格": ["definite", "accusative"],
    "拉丁字母拼寫": "romanization",
    "定指賓格": ["definite", "accusative"],
    "前元音和諧變體": "front-vowel-harmony",
}
# Verb-related labels (transitivity, aspect, inflection class, mood).
VERB_TAGS: dict[str, str] = {
    "及物": "transitive",
    "不及物": "intransitive",
    "动宾结构": "verb-object",
    "非完": "imperfective",
    "完": "perfective",
    "強變化": "strong",
    "動名詞": "supine",
    "命令式": "imperative",
}
# Japanese verb stem-form names; some keys appear in two character variants
# that map to the same tag.
# https://en.wikipedia.org/wiki/Japanese_grammar#Stem_forms
JA_STEM_FORMS: dict[str, str] = {
    "未然形": "imperfective",
    "連用形": "continuative",
    "終止形": "terminal",
    "連體形": "attributive",
    "連体形": "attributive",
    "假定形": "hypothetical",
    "仮定形": "hypothetical",
    "命令形": "imperative",
}
# Labels ending in 形 mapped to voice tags and to related tags such as
# "negative", "formal" and "perfective". List values expand to several tags.
# https://en.wikipedia.org/wiki/Voice_(grammar)
VOICE_TAGS: dict[str, str | list[str]] = {
    "被動形": "passive",
    "使役形": "causative",
    "可能形": "potential",
    "意志形": "volitional",
    "否定形": "negative",
    "否定連用形": ["negative", "continuative"],
    "尊敬形": "formal",
    "完成形": "perfective",
    "接續形": "conjunctive",
    "條件形": ["hypothetical", "conditional"],
}
# Degrees of comparison.
COMPARISON_TAGS: dict[str, str] = {
    # https://en.wikipedia.org/wiki/Comparison_(grammar)
    "原级": "positive",
    "比較級": "comparative",
    "最高級": "superlative",
}
# Tense and verb-form labels, including labels attributed to the headword
# templates named in the inline comments. Values are a single tag or a list
# of tags.
TENSE_TAGS: dict[str, str | list[str]] = {
    "過去時": "preterite",
    "過去式": "past",
    "過去分詞": ["past", "participle"],
    "現在時": "present",
    "第三人稱單數現在時": ["third-person", "singular", "present"],
    "助動詞": "auxiliary",
    # Template:de-verb
    "弱變化": "weak",
    "弱变化": "weak",
    "第三人稱單數簡單現在時": ["third-person", "singular", "present"],
    "現在分詞": ["present", "participle"],
    "一般過去時及過去分詞": ["past", "participle"],
    # Template:it-verb
    "第一人稱單數 現在時": ["first-person", "singular", "present"],
    "第一人稱單數 先過去時": ["first-person", "singular", "past", "historic"],
    # Template:de-adj
    "強變化主格陽性單數": ["strong", "nominative", "masculine", "singular"],
    # Template:la-verb
    "现在时不定式": ["present", "infinitive"],
    "完成时主动式": ["perfect", "active"],
    "目的动名词": "supine",
}
# Union of all grammatical tables above. Later tables win on duplicate keys.
# Several of the merged tables hold list values, so the value type is a
# union rather than plain ``str``.
GRAMMATICAL_TAGS: dict[str, str | list[str]] = {
    **GENDER_TAGS,
    **NUMBER_TAGS,
    **COUNT_TAGS,
    **OTHER_TAGS,
    **VERB_TAGS,
    **JA_STEM_FORMS,
    **VOICE_TAGS,
    **COMPARISON_TAGS,
    **TENSE_TAGS,
}
# Usage/register/grammar labels mapped to English tags. Values are a single
# tag or a list of tags.
# https://zh.wiktionary.org/wiki/Template:Label
# https://zh.wiktionary.org/wiki/Module:Labels/data
# https://zh.wiktionary.org/wiki/Template:Qualifier
# https://zh.wiktionary.org/wiki/Template:古
# https://zh.wiktionary.org/wiki/Template:注释
LABEL_TAGS: dict[str, str | list[str]] = {
    "棄用": "obsolete",
    "弃用": "obsolete",
    "比喻": "figuratively",
    "古": "archaic",
    "陽": "masculine",
    "陰": "feminine",
    "喻": "figuratively",
    "書": "literary",
    "口": "colloquial",
    "俚": "slang",
    "俗": "slang",
    "方": "dialectal",
    "废": "obsolete",
    "貶": "derogatory",
    "罕": "rare",
    "引": "broadly",
    "現已罕用": "archaic",
    # Module:Labels/data
    "back slang": "slang",
    "synecdochically": "synecdoche",
    "不再自由造詞": "idiomatic",
    "不及物": "intransitive",
    "不可數": "uncountable",
    "不定": "indefinite",
    "不常見": "uncommon",
    "不推薦使用": "proscribed",
    "中性": "neuter",
    "中間被動語態": "mediopassive",
    "中間語態": "middle",
    "主動語態": "active",
    "主要用於否定": ["usually", "with-negation"],
    "交互": "reciprocal",
    "以單數形式": "singular",
    "以複數形式": "in-plural",
    "作定語": "attributive",
    "作格": "ergative",
    "作表語": "predicative",
    "使役": "causative",
    "俗語": "idiomatic",
    "俚語": "slang",
    "俚语": "slang",
    "兒童用語": "childish",
    "公文": "bureaucratese",
    "冒犯": "offensive",
    "分詞": "participle",
    "前古典": "pre-Classical",
    "助動詞": "auxiliary",
    "助記符": "mnemonic",
    "及物": "transitive",
    "反問句": "rhetoric",
    "反身": "reflexive",
    "口語": "colloquial",
    "古舊": "archaic",
    "可數": "countable",
    "同性戀俚語": ["slang", "LGBT"],
    "名詞化": "noun-from-verb",
    "唯單": "singular-only",
    "唯複": "plural-only",
    "國際音標": "IPA",
    "基數詞": "cardinal",
    "大寫": "capitalized",
    "委婉": "euphemistic",
    "字面義": "literally",
    "完整": "perfect",
    "完整體": "perfective",
    "定語": "attributive",
    "實詞": "substantive",
    "尊敬": "honorific",
    "常用複數": "plural-normally",
    "幽默": "humorous",
    "序數詞": "ordinal",
    "廣義來說": "broadly",
    "引申": "broadly",
    "弱祈使式": "jussive",
    "強調": "emphatic",
    "後古典": "obsolete",
    "性別中立": "gender-neutral",
    "情態": "modal",
    "愛稱": "endearing",
    "所有格代詞": ["possessive", "pronoun", "without-noun"],
    "押韻俚語": "slang",
    "抽象名詞": "abstract-noun",
    "擬態詞": "ideophonic",
    "擬聲詞": "onomatopoeic",
    "新詞": "neologism",
    "方言": "dialectal",
    "書面": "literary",
    "有比較級": "comparable",
    "有生": "animate",
    "正式": "formal",
    "歷史": "historical",
    "比喻義": "figuratively",
    "無人稱": "impersonal",
    "無比較級": "not-comparable",
    "無生": "inanimate",
    "焦點": "focus",
    "狹義": "narrowly",
    "監獄俚語": "slang",
    "直陳語氣": "indicative",
    "短信": "Internet",
    "祈使語氣": "imperative",
    "禮貌": "polite",
    "種族歧視語": "slur",
    "粉絲用語": ["slang", "lifestyle"],
    "粗俗": "vulgar",
    "系動詞": "copulative",
    "網路用語": "Internet",
    "縮寫": "abbreviation",
    "罕用": "rare",
    "臨時語": "nonce-word",
    "虛擬語氣": "subjunctive",
    "表語": "predicative",
    "被動語態": "passive",
    "視覺方言": "pronunciation-spelling",
    "親切": "familiar",
    "詈語": "expletive",
    "詩歌": "poetic",
    "誇飾": "excessive",
    "語中音省略": "syncope",
    "諷刺": "sarcastic",
    "謙遜": "humble",
    "貶義": "derogatory",
    "轉喻義": "metonymically",
    "返璞詞": "retronym",
    "過時": "dated",
    "陰性": "feminine",
    "陽性": "masculine",
    "雙及物動詞": "ditransitive",
    "靜態動詞": "stative",
    "非完整": "imperfect",
    "非完整體": "imperfective",
    "非常罕用": "rare",
    "非標準": "nonstandard",
    "非標準形式": "nonstandard",
    "非正式": "informal",
    "首字母縮略詞": "initialism",
    "駭客語": ["Leet", "Internet"],
    "高語域": "honorific",
    "中醫": "Traditional-Chinese-Medicine",
    "修辭學": "rhetoric",
    "印度教": "Hinduism",
    "摩門教": "Mormonism",
    # NOTE(review): 物理 normally reads as "physics"; confirm that "particle"
    # is the intended tag here.
    "物理": "particle",
    "猶太教": "Judaism",
    "納粹主義": "Nazism",
    "網際網路": "Internet",
    "耆那教": "Jainism",
    "聖經": "Biblical",
    "解剖學": "anatomy",
    "貴格會": "Quakerism",
    "錫克教": "Sikhism",
    "馬克思主義": "Marxism",
    # also from Module:Labels/data, but translated manually
    "喃字": "Chu-Nom",
    "反身代詞": "reflexive",
    "字面意義": "literally",
    "成語": "Chengyu",
    "及物、不及物": ["transitive", "intransitive"],
    "集合名詞": "collective",
    "控制動詞": "control-verb",
    "省略": "ellipsis",
    "分數": "fractional",
    "以雙數形式": "dual",
    "主要用於否定複數": ["negative", "plural"],
    "數詞縮寫": ["numeral", "abbreviation"],
    "主要用於肯定": "positive",
    "古典": "Classical",
    "中國大陸": "Mainland-China",
    "書面語": "literary",
}
# example sentence template
# Script, romanization and language-variety labels mapped to English tags.
# https://zh.wiktionary.org/wiki/Template:Zh-x
# https://zh.wiktionary.org/wiki/Module:Zh-usex/data
ZH_X_TAGS: dict[str, str | list[str]] = {
    "繁體": "Traditional Chinese",
    "繁體和": "Traditional Chinese",
    "簡體": "Simplified Chinese",
    "繁體和簡體": ["Traditional Chinese", "Simplified Chinese"],
    "漢語拼音": "Pinyin",
    "粵拼": "Jyutping",
    "現代標準漢語": "Standard Chinese",
    "文言文": "Classical Chinese",
    "官話白話文": "Written vernacular Chinese",
    "粵語": "Cantonese",
    "吳語": "Wu",
    "廣州話": "Cantonese",
}
# classifier tags
# Chinese variety names mapped to their English names.
# https://zh.wiktionary.org/wiki/Template:zh-mw
# https://zh.wiktionary.org/wiki/Module:Zh/templates
ZH_TAGS: dict[str, str] = {
    "官話": "Mandarin",
    "贛語": "Gan",
    "客家話": "Hakka",
    "晉語": "Jin",
    "閩北語": "Northern Min",
    "閩東語": "Eastern Min",
    "閩南語": "Southern Min",
    "潮州話": "Teochew",
    "湘語": "Xiang",
}
# Pronunciation-section labels (romanization/transcription system names and
# one place name) mapped to English tags.
# https://zh.wiktionary.org/wiki/Template:Zh-pron
# https://zh.wiktionary.org/wiki/Module:Zh-pron
ZH_PRON_TAGS: dict[str, str | list[str]] = {
    "拼音": "Pinyin",
    "注音": "Bopomofo",
    "潮州話拼音": "Peng'im",
    "上海": "Shanghai",
    "吳語學堂拼音": "Wugniu",
    "通用拼音": "Tongyong-Pinyin",
    "威妥瑪拼音": "Wade–Giles",
    "耶魯官話拼音": "Yale",
    "國語羅馬字": "Gwoyeu-Romatsyh",
    "西里爾字母轉寫": "Palladius",
    "漢語國際音標": "Sinological-IPA",
    "耶魯粵拼": ["Yale", "Jyutping"],
    "廣州話拼音": ["Cantonese", "Pinyin"],
    "廣東拼音": "Guangdong-Romanization",
    "國際音標": "IPA",
    "模仿白話字": "POJ",
}
# Single lookup table used by translate_raw_tags(). Tables listed later
# override earlier ones when a key occurs in more than one of them.
ALL_TAGS: dict[str, str | list[str]] = {
    **GRAMMATICAL_TAGS,
    **LABEL_TAGS,
    **ZH_X_TAGS,
    **ZH_TAGS,
    **ZH_PRON_TAGS,
}
def translate_raw_tags(data: WordEntry) -> WordEntry:
    """Translate the raw labels stored on *data* into English tags/topics.

    Every entry of ``data.raw_tags`` is looked up in ``ALL_TAGS`` first and
    in ``LABEL_TOPICS`` second:

    * A match in ``ALL_TAGS`` adds the translated tag (or each tag of a
      list translation) to ``data.tags``, skipping tags already present.
    * A match in ``LABEL_TOPICS`` appends the topic to ``data.topics``, but
      only when *data* has a ``topics`` attribute.
    * Anything else is kept in ``data.raw_tags``.

    The object is modified in place and also returned.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in ALL_TAGS:
            tr_tag = ALL_TAGS[raw_tag]
            # Normalise to a list so single-tag and multi-tag translations
            # share the same de-duplication path. Previously list
            # translations were extended blindly, producing duplicate tags.
            new_tags = [tr_tag] if isinstance(tr_tag, str) else tr_tag
            for tag in new_tags:
                if tag not in data.tags:
                    data.tags.append(tag)
        elif raw_tag in LABEL_TOPICS and hasattr(data, "topics"):
            data.topics.append(LABEL_TOPICS[raw_tag])
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated
    return data
# Short gender/number/aspect codes used as template arguments, mapped to
# English tags. Values are a single tag or a list of tags.
# https://zh.wiktionary.org/wiki/Template:T
# https://zh.wiktionary.org/wiki/Template:Head
# https://zh.wiktionary.org/wiki/Module:Gender_and_number
TEMPLATE_TAG_ARGS: dict[str, str | list[str]] = {
    "f": "feminine",
    "m": "masculine",
    "n": "neuter",
    "c": "common",
    # Animacy
    "an": "animate",
    "in": "inanimate",
    # Animal (for Ukrainian, Belarusian, Polish)
    "anml": "animal",
    # Personal (for Ukrainian, Belarusian, Polish)
    "pr": "personal",
    # Nonpersonal not currently used
    "np": "nonpersonal",
    # Virility (for Polish)
    "vr": "virile",
    "nv": "nonvirile",
    # Numbers
    # NOTE(review): these values contain " number", whereas the other tables
    # in this file use bare "singular"/"dual"/"plural" - confirm intended.
    "s": "singular number",
    "d": "dual number",
    "p": "plural number",
    # Verb qualifiers
    "impf": "imperfective",
    "pf": "perfective",
    "mf": ["masculine", "feminine"],
}