Coverage for src/wiktextract/extractor/zh/tags.py: 98%
35 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-06 08:01 +0000
1from .models import WordEntry
2from .topics import LABEL_TOPICS
# Grammatical-gender labels mapped to English tags.  Traditional and
# Simplified spellings of the same label are listed as separate keys.
GENDER_TAGS: dict[str, str] = {
    "陰性": "feminine",
    "阴性": "feminine",
    "陰性形式": "feminine",
    "陰性等價詞": "feminine",
    "陽性": "masculine",
    # Simplified spelling of "陽性", mirroring the "陰性"/"阴性" pair above.
    "阳性": "masculine",
    "陽性形式": "masculine",
    "中性": "neuter",
    "中性形式": "neuter",
}
15NUMBER_TAGS: dict[str, str | list[str]] = {
16 "單數": "singular",
17 "单数": "singular",
18 "複數": "plural",
19 "复数": "plural",
20 "定單數": ["definite", "singular"],
21 "定单数": ["definite", "singular"],
22 "不定單數": ["indefinite", "singular"],
23 "不定单数": ["indefinite", "singular"],
24 "不定複數": ["indefinite", "plural"],
25 "不定复数": ["indefinite", "plural"],
26 "定複數": ["definite", "plural"],
27 "斜格複數": ["oblique", "plural"],
28 "主格單數": ["nominative", "singular"],
29 "主格複數": ["nominative", "plural"],
30 "屬格單數": ["genitive", "singular"],
31 "屬格複數": ["genitive", "plural"],
32 "陰性單數": ["feminine", "singular"],
33 "陽性單數": ["masculine", "singular"],
34 "陰性複數": ["feminine", "plural"],
35 "陽性複數": ["masculine", "plural"],
36 "中性複數": ["neuter", "plural"],
37 "中性單數": ["neuter", "singular"],
38 "賓格單數": ["accusative", "singular"],
39 "賓格複數": ["accusative", "plural"],
40 "無複數": "no-plural",
41}
# Countability labels mapped to English tags.
# https://en.wikipedia.org/wiki/Count_noun
COUNT_TAGS: dict[str, str] = {
    "可數": "countable",
    "不可數": "uncountable",
}
49OTHER_TAGS: dict[str, str] = {
50 "指小詞": "diminutive",
51 "指小": "diminutive",
52 "變格類型": "declension-pattern-of",
53 "屬格": "genitive",
54 "部分格": "partitive",
55 "個人": "person",
56 "無屈折": "indeclinable",
57 "諺文": "hangeul",
58 "漢字": "hanja",
59 # Template:cs-proper noun
60 "相關形容詞": ["relational", "adjective"],
61 "關係形容詞": ["relational", "adjective"],
62 "居民稱謂詞": "demonym",
63 "女性居民稱謂詞": ["feminine", "demonym"],
64 "定賓格": ["definite", "accusative"],
65 "定宾格": ["definite", "accusative"],
66 "拉丁字母拼寫": "romanization",
67 "定指賓格": ["definite", "accusative"],
68 "前元音和諧變體": "front-vowel-harmony",
69 # Template:zh-forms
70 "正體": "Standard-Chinese",
71 "繁體": "Traditional-Chinese",
72 "簡體": "Simplified-Chinese",
73 "異體": "alternative",
74 "仿譯詞": "calque",
75 "貶稱詞": "pejorative",
76}
# Verb-related labels (transitivity, aspect, conjugation class) mapped to
# English tags.
VERB_TAGS: dict[str, str] = {
    "及物": "transitive",
    "不及物": "intransitive",
    "动宾结构": "verb-object",
    "非完": "imperfective",
    "完": "perfective",
    "強變化": "strong",
    "動名詞": "supine",
    "命令式": "imperative",
    # Template:ar-verb
    # Arabic verb forms: "第<N>類" -> "form-<n>".  Forms VIII and XI follow
    # the same pattern as their neighbours and are listed so the series
    # I-XV has no gaps.
    "第I類": "form-i",
    "第II類": "form-ii",
    "第III類": "form-iii",
    "第IV類": "form-iv",
    "第V類": "form-v",
    "第VI類": "form-vi",
    "第VII類": "form-vii",
    "第VIII類": "form-viii",
    "第IX類": "form-ix",
    "第X類": "form-x",
    "第XI類": "form-xi",
    "第XII類": "form-xii",
    "第XIII類": "form-xiii",
    "第XIV類": "form-xiv",
    "第XV類": "form-xv",
    "第Iq類": "form-iq",
    "第IIq類": "form-iiq",
    "第IIIq類": "form-iiiq",
    "第IVq類": "form-ivq",
}
# Names of Japanese verb stem forms mapped to English tags.  Both the
# Traditional-Chinese and the Japanese shinjitai spellings are listed where
# they differ (連體形/連体形, 假定形/仮定形).
# https://en.wikipedia.org/wiki/Japanese_grammar#Stem_forms
JA_STEM_FORMS: dict[str, str] = {
    "未然形": "imperfective",
    "連用形": "continuative",
    "終止形": "terminal",
    "連體形": "attributive",
    "連体形": "attributive",
    "假定形": "hypothetical",
    "仮定形": "hypothetical",
    "命令形": "imperative",
}
# Labels of the form "…形" for derived verb forms, mapped to English tags.
# Besides voices proper (passive, causative) the table also holds other
# derived forms such as negative, volitional and conditional.
# https://en.wikipedia.org/wiki/Voice_(grammar)
VOICE_TAGS: dict[str, str | list[str]] = {
    "被動形": "passive",
    "使役形": "causative",
    "可能形": "potential",
    "意志形": "volitional",
    "否定形": "negative",
    "否定連用形": ["negative", "continuative"],
    "尊敬形": "formal",
    "完成形": "perfective",
    "接續形": "conjunctive",
    "條件形": ["hypothetical", "conditional"],
}
# Degrees of comparison mapped to English tags.
COMPARISON_TAGS: dict[str, str] = {
    # https://en.wikipedia.org/wiki/Comparison_(grammar)
    "原级": "positive",
    "比較級": "comparative",
    "最高級": "superlative",
}
# Tense, participle and other inflection labels taken from headword
# templates of several languages (see the per-template comments below).
# A list value means the label expands to several tags.
TENSE_TAGS: dict[str, str | list[str]] = {
    "過去時": "preterite",
    "過去式": "past",
    "過去分詞": ["past", "participle"],
    "現在時": "present",
    "第三人稱單數現在時": ["third-person", "singular", "present"],
    "助動詞": "auxiliary",
    # Template:de-verb
    "弱變化": "weak",
    "弱变化": "weak",
    "第三人稱單數簡單現在時": ["third-person", "singular", "present"],
    "現在分詞": ["present", "participle"],
    "一般過去時及過去分詞": ["past", "participle"],
    # Template:it-verb
    # NOTE: these two keys contain a literal space between the person/number
    # part and the tense part.
    "第一人稱單數 現在時": ["first-person", "singular", "present"],
    "第一人稱單數 先過去時": ["first-person", "singular", "past", "historic"],
    # Template:de-adj
    "強變化主格陽性單數": ["strong", "nominative", "masculine", "singular"],
    # Template:la-verb
    "现在时不定式": ["present", "infinitive"],
    "完成时主动式": ["perfect", "active"],
    "目的动名词": "supine",
    # Template:ar-verb
    "非過去時": "non-past",
    "動詞性名詞": "noun-from-verb",
    "主動分詞": ["active", "participle"],
    "被動分詞": ["passive", "participle"],
}
# Union of all grammatical tag tables.  Later tables win when a key occurs in
# more than one of them.  Several of the merged tables contain list values,
# so the value type is ``str | list[str]``.
GRAMMATICAL_TAGS: dict[str, str | list[str]] = {
    **GENDER_TAGS,
    **NUMBER_TAGS,
    **COUNT_TAGS,
    **OTHER_TAGS,
    **VERB_TAGS,
    **JA_STEM_FORMS,
    **VOICE_TAGS,
    **COMPARISON_TAGS,
    **TENSE_TAGS,
}
# Usage/register/grammar labels mapped to English tags.  A list value means
# the label expands to several tags.  Sources of the label texts:
# https://zh.wiktionary.org/wiki/Template:Label
# https://zh.wiktionary.org/wiki/Module:Labels/data
# https://zh.wiktionary.org/wiki/Template:Qualifier
# https://zh.wiktionary.org/wiki/Template:古
# https://zh.wiktionary.org/wiki/Template:注释
LABEL_TAGS: dict[str, str | list[str]] = {
    "棄用": "obsolete",
    "弃用": "obsolete",
    "比喻": "figuratively",
    "古": "archaic",
    "陽": "masculine",
    "陰": "feminine",
    "喻": "figuratively",
    "書": "literary",
    "口": "colloquial",
    "俚": "slang",
    "俗": "slang",
    "方": "dialectal",
    "废": "obsolete",
    "貶": "derogatory",
    "罕": "rare",
    "引": "broadly",
    "現已罕用": "archaic",
    # Module:Labels/data
    "back slang": "slang",
    "synecdochically": "synecdoche",
    "不再自由造詞": "idiomatic",
    "不及物": "intransitive",
    "不可數": "uncountable",
    "不定": "indefinite",
    "不常見": "uncommon",
    "不推薦使用": "proscribed",
    "中性": "neuter",
    "中間被動語態": "mediopassive",
    "中間語態": "middle",
    "主動語態": "active",
    "主要用於否定": ["usually", "with-negation"],
    "交互": "reciprocal",
    "以單數形式": "singular",
    "以複數形式": "in-plural",
    "作定語": "attributive",
    "作格": "ergative",
    "作表語": "predicative",
    "使役": "causative",
    "俗語": "idiomatic",
    "俚語": "slang",
    "俚语": "slang",
    "兒童用語": "childish",
    "公文": "bureaucratese",
    "冒犯": "offensive",
    "分詞": "participle",
    "前古典": "pre-Classical",
    "助動詞": "auxiliary",
    "助記符": "mnemonic",
    "及物": "transitive",
    "反問句": "rhetoric",
    "反身": "reflexive",
    "口語": "colloquial",
    "口语": "colloquial",
    "古舊": "archaic",
    "可數": "countable",
    "同性戀俚語": ["slang", "LGBT"],
    "名詞化": "noun-from-verb",
    "唯單": "singular-only",
    "唯複": "plural-only",
    "國際音標": "IPA",
    "基數詞": "cardinal",
    "大寫": "capitalized",
    "委婉": "euphemistic",
    "字面義": "literally",
    "完整": "perfect",
    "完整體": "perfective",
    "定語": "attributive",
    "實詞": "substantive",
    "尊敬": "honorific",
    "敬語": "honorific",
    "敬语": "honorific",
    "常用複數": "plural-normally",
    "幽默": "humorous",
    "序數詞": "ordinal",
    "廣義來說": "broadly",
    "引申": "broadly",
    "弱祈使式": "jussive",
    "強調": "emphatic",
    "後古典": "obsolete",
    "性別中立": "gender-neutral",
    "情態": "modal",
    "愛稱": "endearing",
    "所有格代詞": ["possessive", "pronoun", "without-noun"],
    "押韻俚語": "slang",
    "抽象名詞": "abstract-noun",
    "擬態詞": "ideophonic",
    "擬聲詞": "onomatopoeic",
    "新詞": "neologism",
    "方言": "dialectal",
    "書面": "literary",
    "书面": "literary",
    "有比較級": "comparable",
    "有生": "animate",
    "正式": "formal",
    "歷史": "historical",
    "比喻義": "figuratively",
    "無人稱": "impersonal",
    "無比較級": "not-comparable",
    "無生": "inanimate",
    "焦點": "focus",
    "狹義": "narrowly",
    "監獄俚語": "slang",
    "直陳語氣": "indicative",
    "短信": "Internet",
    "祈使語氣": "imperative",
    "禮貌": "polite",
    "種族歧視語": "slur",
    "粉絲用語": ["slang", "lifestyle"],
    "粗俗": "vulgar",
    "系動詞": "copulative",
    "網路用語": "Internet",
    "縮寫": "abbreviation",
    "罕用": "rare",
    "臨時語": "nonce-word",
    "虛擬語氣": "subjunctive",
    "表語": "predicative",
    "被動語態": "passive",
    "視覺方言": "pronunciation-spelling",
    "親切": "familiar",
    "詈語": "expletive",
    "詩歌": "poetic",
    "誇飾": "excessive",
    "語中音省略": "syncope",
    "諷刺": "sarcastic",
    "謙遜": "humble",
    "貶義": "derogatory",
    "轉喻義": "metonymically",
    "返璞詞": "retronym",
    "過時": "dated",
    "陰性": "feminine",
    "陽性": "masculine",
    "雙及物動詞": "ditransitive",
    "靜態動詞": "stative",
    "非完整": "imperfect",
    "非完整體": "imperfective",
    "非常罕用": "rare",
    "非標準": "nonstandard",
    "非标准": "nonstandard",
    "非標準形式": "nonstandard",
    "非正式": "informal",
    "首字母縮略詞": "initialism",
    "駭客語": ["Leet", "Internet"],
    "高語域": "honorific",
    "中醫": "Traditional-Chinese-Medicine",
    "修辭學": "rhetoric",
    "印度教": "Hinduism",
    "摩門教": "Mormonism",
    # NOTE(review): "物理" ordinarily means "physics"; confirm that the tag
    # "particle" is intended here, since a key listed in this table is
    # translated as a tag before LABEL_TOPICS is consulted.
    "物理": "particle",
    "猶太教": "Judaism",
    "納粹主義": "Nazism",
    "網際網路": "Internet",
    "耆那教": "Jainism",
    "聖經": "Biblical",
    "解剖學": "anatomy",
    "貴格會": "Quakerism",
    "錫克教": "Sikhism",
    "馬克思主義": "Marxism",
    # also from Module:Labels/data, but translated manually
    "喃字": "Chu-Nom",
    "反身代詞": "reflexive",
    "字面意義": "literally",
    "成語": "Chengyu",
    "及物、不及物": ["transitive", "intransitive"],
    "集合名詞": "collective",
    "控制動詞": "control-verb",
    "省略": "ellipsis",
    "分數": "fractional",
    "以雙數形式": "dual",
    "主要用於否定複數": ["negative", "plural"],
    "數詞縮寫": ["numeral", "abbreviation"],
    "主要用於肯定": "positive",
    "古典": "Classical",
    "中國大陸": "Mainland-China",
    "書面語": "literary",
}
# Labels that appear on Chinese example sentences (script, romanization
# system, language variety), mapped to English tags.
# example sentence template
# https://zh.wiktionary.org/wiki/Template:Zh-x
# https://zh.wiktionary.org/wiki/Module:Zh-usex/data
ZH_X_TAGS: dict[str, str | list[str]] = {
    "繁體": "Traditional-Chinese",
    "繁體和": "Traditional-Chinese",
    "簡體": "Simplified-Chinese",
    "繁體和簡體": ["Traditional-Chinese", "Simplified-Chinese"],
    "漢語拼音": "Pinyin",
    "粵拼": "Jyutping",
    "現代標準漢語": "Standard-Chinese",
    "文言文": "Classical-Chinese",
    "官話白話文": "Written-vernacular-Chinese",
    "粵語": "Cantonese",
    "吳語": "Wu",
    "廣州話": "Cantonese",
    "臺灣華語": "Taiwanese-Mandarin",
}
# Names of Chinese language varieties mapped to English tags.
# classifier tags
# https://zh.wiktionary.org/wiki/Template:zh-mw
# https://zh.wiktionary.org/wiki/Module:Zh/templates
ZH_TAGS: dict[str, str] = {
    "官話": "Mandarin",
    "贛語": "Gan",
    "客家話": "Hakka",
    "晉語": "Jin",
    "閩北語": "Min-Bei",
    "閩東語": "Min-Dong",
    "閩南語": "Min-Nan",
    "潮州話": "Teochew",
    "湘語": "Xiang",
}
397# https://zh.wiktionary.org/wiki/Template:Zh-pron
398# https://zh.wiktionary.org/wiki/Module:Zh-pron
399ZH_PRON_TAGS = {
400 "拼音": "Pinyin",
401 "注音": "Bopomofo",
402 "潮州話拼音": "Peng'im",
403 "上海": "Shanghai",
404 "吳語學堂拼音": "Wugniu",
405 "通用拼音": "Tongyong-Pinyin",
406 "威妥瑪拼音": "Wade–Giles",
407 "耶魯官話拼音": "Yale",
408 "國語羅馬字": "Gwoyeu-Romatsyh",
409 "西里爾字母轉寫": "Cyrillic",
410 "西里爾字母": "Cyrillic",
411 "漢語國際音標": "Sinological-IPA",
412 "耶魯粵拼": ["Yale", "Jyutping"],
413 "廣州話拼音": ["Cantonese", "Pinyin"],
414 "廣東拼音": "Guangdong-Romanization",
415 "國際音標": "IPA",
416 "模仿白話字": "POJ",
417 "標準粵語": "Standard-Cantonese",
418 "廣州–香港話": ["Guangzhou", "Hong Kong"],
419 "福州話": "Fuzhou",
420 "平話字": "Foochow-Romanized",
421 "客家語": "Hakka",
422 "白話字": "Phak-fa-su",
423 "泉漳話": "Hokkien",
424 "泉州": "Quanzhou",
425 "廈門": "Xiamen",
426 "輕尾聲異讀": "toneless-final-syllable-variant",
427 "維基詞典": "Wiktionary-specific",
428 "維基詞典拼音": ["Wiktionary-specific", "Pinyin"],
429 "維基詞典轉寫": "Wiktionary-specific",
430 "成都話": "Chengdu",
431 "四川話拼音": ["Sichuanese", "Pinyin"],
432 "東干語": "Dongan",
433 "台山話": "Taishanese",
434 "四縣": "Sixian",
435 "長沙話": "Changsha",
436 "四川話拉丁化新文字": "Latinxua-Sin-Wenz",
437 "台城": "Taicheng",
438 "南昌話": "Nanchang",
439 "四縣話": "Sixian",
440 "苗栗": "Miaoli",
441 "美濃": "Neipu",
442 "客家語拼音": "Hakka-Romanization-System",
443 "客家話拼音方案": "Hagfa-Pinyim",
444 "太原話": "Taiyuan",
445 "老派": "dated",
446 "新加坡": "Singapore",
447 "臺羅": "Tâi-lô",
448 "普實台文": "Phofsit-Daibuun",
449 "太湖片": "Northern",
450 "吳音小字典": "MiniDict",
451 "維基詞典羅馬化": ["Wiktionary-specific", "romanization"],
452 "上海話": "Shanghai",
453 "中古漢語": "Middle-Chinese",
454 "莆仙語": "Puxian-Min",
455 "莆仙話拼音": "Pouseng-Ping'ing",
456 "莆田": "Putian",
457 "仙遊": "Xianyou",
458 "漳州": "Zhangzhou",
459 "臺北": "Taibei",
460 "高雄": "Kaohsiung",
461 "實際讀音": "phonetic",
462 "臺灣話": "Taiwanese",
463 "常用": "general",
464 "檳城": "Penang",
465 "兒化": "Erhua",
466 "文讀": "literary",
467 "中國大陸標準讀法": ["Mainland-China", "standard"],
468 "臺灣異讀法": ["Taiwan", "variant"],
469 "中國大陸與臺灣標準讀法": ["Mainland-China", "Taiwan", "standard"],
470}
# Chinese dialect-group and location names mapped to English tags.
ZH_DIAL_TAGS: dict[str, str] = {
    "白話文": "Written-vernacular-Chinese",
    "北京": "Beijing",
    "燕京官話": "Northeastern-Mandarin",
    "冀魯官話": "Jilu-Mandarin",
    "膠遼官話": "Jiaoliao-Mandarin",
    "中原官話": "Central-Plains-Mandarin",
    "蘭銀官話": "Lanyin-Mandarin",
    "西南官話": "Southwestern-Mandarin",
    "江淮官話": "Jianghuai-Mandarin",
    "徽語": "Huizhou",
    "南部平話": "Southern-Pinghua",
    "濟南": "Jinan",
    "臺灣": "Taiwan",
}
# Thai pronunciation labels mapped to English tags.
TH_PRON_TAGS: dict[str, str] = {
    "寫法": "orthographic",
    "音素": "phoneme",
    "泰語羅馬化": "romanization",
    "派汶拼音": "Paiboon",
    "皇家轉寫": "Royal-Institute",
    "非正字法": "unorthographical",
    "短音": "short",
}
# Single lookup table used when translating raw tags: the union of the tag
# tables listed below.  Operands are merged left to right, so for a key that
# occurs in more than one table the right-most table's value is kept.
ALL_TAGS: dict[str, str | list[str]] = (
    GRAMMATICAL_TAGS
    | LABEL_TAGS
    | ZH_X_TAGS
    | ZH_TAGS
    | ZH_PRON_TAGS
    | ZH_DIAL_TAGS
    | TH_PRON_TAGS
)
def translate_raw_tags(data: WordEntry) -> WordEntry:
    """Translate the labels in ``data.raw_tags`` into tags and topics.

    Each raw tag is handled by the first rule that matches:

    1. It is a key of ``ALL_TAGS``: the mapped tag (or every tag of a list
       value) is added to ``data.tags``.
    2. It is a key of ``LABEL_TOPICS`` and ``data`` has a ``topics``
       attribute: the mapped topic is added to ``data.topics``.
    3. Otherwise it is kept in ``data.raw_tags``.

    Nothing is added twice: tags, topics and the remaining raw tags are all
    de-duplicated while their first-seen order is preserved.

    Args:
        data: Object with ``raw_tags`` and ``tags`` lists and, optionally, a
            ``topics`` list.  It is modified in place.

    Returns:
        The same ``data`` object, for call chaining.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in ALL_TAGS:
            translated = ALL_TAGS[raw_tag]
            # Treat a single tag like a one-element list so that string and
            # list values go through the same duplicate check.
            if isinstance(translated, str):
                translated = [translated]
            for tag in translated:
                if tag not in data.tags:
                    data.tags.append(tag)
        elif raw_tag in LABEL_TOPICS and hasattr(data, "topics"):
            topic = LABEL_TOPICS[raw_tag]
            if topic not in data.topics:
                data.topics.append(topic)
        elif raw_tag not in untranslated:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated
    return data
527# https://zh.wiktionary.org/wiki/Template:T
528# https://zh.wiktionary.org/wiki/Template:Head
529# https://zh.wiktionary.org/wiki/Module:Gender_and_number
530TEMPLATE_TAG_ARGS = {
531 "f": "feminine",
532 "m": "masculine",
533 "n": "neuter",
534 "c": "common",
535 # Animacy
536 "an": "animate",
537 "in": "inanimate",
538 # Animal (for Ukrainian, Belarusian, Polish)
539 "anml": "animal",
540 # Personal (for Ukrainian, Belarusian, Polish)
541 "pr": "personal",
542 # Nonpersonal not currently used
543 "np": "nonpersonal",
544 # Virility (for Polish)
545 "vr": "virile",
546 "nv": "nonvirile",
547 # Numbers
548 "s": "singular number",
549 "d": "dual number",
550 "p": "plural number",
551 # Verb qualifiers
552 "impf": "imperfective",
553 "pf": "perfective",
554 "mf": ["masculine", "feminine"],
555}