Coverage for src/wiktextract/extractor/zh/tags.py: 100%

32 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1from .models import WordEntry 

2from .topics import LABEL_TOPICS 

3 

# Chinese grammatical-gender labels mapped to English tag names.
# Both traditional and simplified spellings are listed where they differ.
GENDER_TAGS: dict[str, str] = {
    "陰性": "feminine",
    "阴性": "feminine",
    "陰性形式": "feminine",
    "陰性等價詞": "feminine",
    "陽性": "masculine",
    # Simplified spelling of "陽性", mirroring the "陰性"/"阴性" pair above.
    "阳性": "masculine",
    "陽性形式": "masculine",
    "中性": "neuter",
    "中性形式": "neuter",
}

14 

15NUMBER_TAGS: dict[str, str | list[str]] = { 

16 "單數": "singular", 

17 "单数": "singular", 

18 "複數": "plural", 

19 "复数": "plural", 

20 "定單數": ["definite", "singular"], 

21 "定单数": ["definite", "singular"], 

22 "不定單數": ["indefinite", "singular"], 

23 "不定单数": ["indefinite", "singular"], 

24 "不定複數": ["indefinite", "plural"], 

25 "不定复数": ["indefinite", "plural"], 

26 "定複數": ["definite", "plural"], 

27 "斜格複數": ["oblique", "plural"], 

28 "主格單數": ["nominative", "singular"], 

29 "主格複數": ["nominative", "plural"], 

30 "屬格單數": ["genitive", "singular"], 

31 "屬格複數": ["genitive", "plural"], 

32 "陰性單數": ["feminine", "singular"], 

33 "陽性單數": ["masculine", "singular"], 

34 "陰性複數": ["feminine", "plural"], 

35 "陽性複數": ["masculine", "plural"], 

36 "中性複數": ["neuter", "plural"], 

37 "中性單數": ["neuter", "singular"], 

38 "賓格單數": ["accusative", "singular"], 

39 "賓格複數": ["accusative", "plural"], 

40 "無複數": "no-plural", 

41} 

42 

# Countability labels mapped to English tag names.
# https://en.wikipedia.org/wiki/Count_noun
COUNT_TAGS: dict[str, str] = {
    "可數": "countable",
    "不可數": "uncountable",
}

48 

49OTHER_TAGS: dict[str, str] = { 

50 "指小詞": "diminutive", 

51 "指小": "diminutive", 

52 "變格類型": "declension-pattern-of", 

53 "屬格": "genitive", 

54 "部分格": "partitive", 

55 "個人": "person", 

56 "無屈折": "indeclinable", 

57 "諺文": "hangeul", 

58 "漢字": "hanja", 

59 # Template:cs-proper noun 

60 "相關形容詞": ["relational", "adjective"], 

61 "關係形容詞": ["relational", "adjective"], 

62 "居民稱謂詞": "demonym", 

63 "女性居民稱謂詞": ["feminine", "demonym"], 

64 "定賓格": ["definite", "accusative"], 

65 "定宾格": ["definite", "accusative"], 

66 "拉丁字母拼寫": "romanization", 

67 "定指賓格": ["definite", "accusative"], 

68 "前元音和諧變體": "front-vowel-harmony", 

69} 

70 

# Verb-related labels (transitivity, aspect, conjugation class, mood)
# mapped to English tag names.
VERB_TAGS: dict[str, str] = {
    "及物": "transitive",
    "不及物": "intransitive",
    "动宾结构": "verb-object",
    "非完": "imperfective",
    "完": "perfective",
    "強變化": "strong",
    "動名詞": "supine",
    "命令式": "imperative",
}

81 

# Japanese verb stem-form names mapped to English tag names.
# Some forms are listed under two spellings of the same name.
# https://en.wikipedia.org/wiki/Japanese_grammar#Stem_forms
JA_STEM_FORMS: dict[str, str] = {
    "未然形": "imperfective",
    "連用形": "continuative",
    "終止形": "terminal",
    "連體形": "attributive",
    "連体形": "attributive",
    "假定形": "hypothetical",
    "仮定形": "hypothetical",
    "命令形": "imperative",
}

93 

94# https://en.wikipedia.org/wiki/Voice_(grammar) 

95VOICE_TAGS: dict[str, str | list[str]] = { 

96 "被動形": "passive", 

97 "使役形": "causative", 

98 "可能形": "potential", 

99 "意志形": "volitional", 

100 "否定形": "negative", 

101 "否定連用形": ["negative", "continuative"], 

102 "尊敬形": "formal", 

103 "完成形": "perfective", 

104 "接續形": "conjunctive", 

105 "條件形": ["hypothetical", "conditional"], 

106} 

107 

# Degree-of-comparison labels mapped to English tag names.
# https://en.wikipedia.org/wiki/Comparison_(grammar)
COMPARISON_TAGS: dict[str, str] = {
    "原级": "positive",
    "比較級": "comparative",
    "最高級": "superlative",
}

114 

115TENSE_TAGS = { 

116 "過去時": "preterite", 

117 "過去式": "past", 

118 "過去分詞": ["past", "participle"], 

119 "現在時": "present", 

120 "第三人稱單數現在時": ["third-person", "singular", "present"], 

121 "助動詞": "auxiliary", 

122 # Template:de-verb 

123 "弱變化": "weak", 

124 "弱变化": "weak", 

125 "第三人稱單數簡單現在時": ["third-person", "singular", "present"], 

126 "現在分詞": ["present", "participle"], 

127 "一般過去時及過去分詞": ["past", "participle"], 

128 # Template:it-verb 

129 "第一人稱單數 現在時": ["first-person", "singular", "present"], 

130 "第一人稱單數 先過去時": ["first-person", "singular", "past", "historic"], 

131 # Template:de-adj 

132 "強變化主格陽性單數": ["strong", "nominative", "masculine", "singular"], 

133 # Template:la-verb 

134 "现在时不定式": ["present", "infinitive"], 

135 "完成时主动式": ["perfect", "active"], 

136 "目的动名词": "supine", 

137} 

138 

# Union of all grammatical tag tables defined above.
# When a key appears in more than one table, the table unpacked later wins.
# Several source tables hold list values, so the value type is a union.
GRAMMATICAL_TAGS: dict[str, str | list[str]] = {
    **GENDER_TAGS,
    **NUMBER_TAGS,
    **COUNT_TAGS,
    **OTHER_TAGS,
    **VERB_TAGS,
    **JA_STEM_FORMS,
    **VOICE_TAGS,
    **COMPARISON_TAGS,
    **TENSE_TAGS,
}

150 

151# https://zh.wiktionary.org/wiki/Template:Label 

152# https://zh.wiktionary.org/wiki/Module:Labels/data 

153# https://zh.wiktionary.org/wiki/Template:Qualifier 

154# https://zh.wiktionary.org/wiki/Template:古 

155# https://zh.wiktionary.org/wiki/Template:注释 

156LABEL_TAGS = { 

157 "棄用": "obsolete", 

158 "弃用": "obsolete", 

159 "比喻": "figuratively", 

160 "古": "archaic", 

161 "陽": "masculine", 

162 "陰": "feminine", 

163 "喻": "figuratively", 

164 "書": "literary", 

165 "口": "colloquial", 

166 "俚": "slang", 

167 "俗": "slang", 

168 "方": "dialectal", 

169 "废": "obsolete", 

170 "貶": "derogatory", 

171 "罕": "rare", 

172 "引": "broadly", 

173 "現已罕用": "archaic", 

174 # Module:Labels/data 

175 "back slang": "slang", 

176 "synecdochically": "synecdoche", 

177 "不再自由造詞": "idiomatic", 

178 "不及物": "intransitive", 

179 "不可數": "uncountable", 

180 "不定": "indefinite", 

181 "不常見": "uncommon", 

182 "不推薦使用": "proscribed", 

183 "中性": "neuter", 

184 "中間被動語態": "mediopassive", 

185 "中間語態": "middle", 

186 "主動語態": "active", 

187 "主要用於否定": ["usually", "with-negation"], 

188 "交互": "reciprocal", 

189 "以單數形式": "singular", 

190 "以複數形式": "in-plural", 

191 "作定語": "attributive", 

192 "作格": "ergative", 

193 "作表語": "predicative", 

194 "使役": "causative", 

195 "俗語": "idiomatic", 

196 "俚語": "slang", 

197 "俚语": "slang", 

198 "兒童用語": "childish", 

199 "公文": "bureaucratese", 

200 "冒犯": "offensive", 

201 "分詞": "participle", 

202 "前古典": "pre-Classical", 

203 "助動詞": "auxiliary", 

204 "助記符": "mnemonic", 

205 "及物": "transitive", 

206 "反問句": "rhetoric", 

207 "反身": "reflexive", 

208 "口語": "colloquial", 

209 "古舊": "archaic", 

210 "可數": "countable", 

211 "同性戀俚語": ["slang", "LGBT"], 

212 "名詞化": "noun-from-verb", 

213 "唯單": "singular-only", 

214 "唯複": "plural-only", 

215 "國際音標": "IPA", 

216 "基數詞": "cardinal", 

217 "大寫": "capitalized", 

218 "委婉": "euphemistic", 

219 "字面義": "literally", 

220 "完整": "perfect", 

221 "完整體": "perfective", 

222 "定語": "attributive", 

223 "實詞": "substantive", 

224 "尊敬": "honorific", 

225 "常用複數": "plural-normally", 

226 "幽默": "humorous", 

227 "序數詞": "ordinal", 

228 "廣義來說": "broadly", 

229 "引申": "broadly", 

230 "弱祈使式": "jussive", 

231 "強調": "emphatic", 

232 "後古典": "obsolete", 

233 "性別中立": "gender-neutral", 

234 "情態": "modal", 

235 "愛稱": "endearing", 

236 "所有格代詞": ["possessive", "pronoun", "without-noun"], 

237 "押韻俚語": "slang", 

238 "抽象名詞": "abstract-noun", 

239 "擬態詞": "ideophonic", 

240 "擬聲詞": "onomatopoeic", 

241 "新詞": "neologism", 

242 "方言": "dialectal", 

243 "書面": "literary", 

244 "有比較級": "comparable", 

245 "有生": "animate", 

246 "正式": "formal", 

247 "歷史": "historical", 

248 "比喻義": "figuratively", 

249 "無人稱": "impersonal", 

250 "無比較級": "not-comparable", 

251 "無生": "inanimate", 

252 "焦點": "focus", 

253 "狹義": "narrowly", 

254 "監獄俚語": "slang", 

255 "直陳語氣": "indicative", 

256 "短信": "Internet", 

257 "祈使語氣": "imperative", 

258 "禮貌": "polite", 

259 "種族歧視語": "slur", 

260 "粉絲用語": ["slang", "lifestyle"], 

261 "粗俗": "vulgar", 

262 "系動詞": "copulative", 

263 "網路用語": "Internet", 

264 "縮寫": "abbreviation", 

265 "罕用": "rare", 

266 "臨時語": "nonce-word", 

267 "虛擬語氣": "subjunctive", 

268 "表語": "predicative", 

269 "被動語態": "passive", 

270 "視覺方言": "pronunciation-spelling", 

271 "親切": "familiar", 

272 "詈語": "expletive", 

273 "詩歌": "poetic", 

274 "誇飾": "excessive", 

275 "語中音省略": "syncope", 

276 "諷刺": "sarcastic", 

277 "謙遜": "humble", 

278 "貶義": "derogatory", 

279 "轉喻義": "metonymically", 

280 "返璞詞": "retronym", 

281 "過時": "dated", 

282 "陰性": "feminine", 

283 "陽性": "masculine", 

284 "雙及物動詞": "ditransitive", 

285 "靜態動詞": "stative", 

286 "非完整": "imperfect", 

287 "非完整體": "imperfective", 

288 "非常罕用": "rare", 

289 "非標準": "nonstandard", 

290 "非標準形式": "nonstandard", 

291 "非正式": "informal", 

292 "首字母縮略詞": "initialism", 

293 "駭客語": ["Leet", "Internet"], 

294 "高語域": "honorific", 

295 "中醫": "Traditional-Chinese-Medicine", 

296 "修辭學": "rhetoric", 

297 "印度教": "Hinduism", 

298 "摩門教": "Mormonism", 

299 "物理": "particle", 

300 "猶太教": "Judaism", 

301 "納粹主義": "Nazism", 

302 "網際網路": "Internet", 

303 "耆那教": "Jainism", 

304 "聖經": "Biblical", 

305 "解剖學": "anatomy", 

306 "貴格會": "Quakerism", 

307 "錫克教": "Sikhism", 

308 "馬克思主義": "Marxism", 

309 # also from Module:Labels/data, but translated manually 

310 "喃字": "Chu-Nom", 

311 "反身代詞": "reflexive", 

312 "字面意義": "literally", 

313 "成語": "Chengyu", 

314 "及物、不及物": ["transitive", "intransitive"], 

315 "集合名詞": "collective", 

316 "控制動詞": "control-verb", 

317 "省略": "ellipsis", 

318 "分數": "fractional", 

319 "以雙數形式": "dual", 

320 "主要用於否定複數": ["negative", "plural"], 

321 "數詞縮寫": ["numeral", "abbreviation"], 

322 "主要用於肯定": "positive", 

323 "古典": "Classical", 

324 "中國大陸": "Mainland-China", 

325 "書面語": "literary", 

326} 

327 

328# example sentence template 

329# https://zh.wiktionary.org/wiki/Template:Zh-x 

330# https://zh.wiktionary.org/wiki/Module:Zh-usex/data 

331ZH_X_TAGS = { 

332 "繁體": "Traditional Chinese", 

333 "繁體和": "Traditional Chinese", 

334 "簡體": "Simplified Chinese", 

335 "繁體和簡體": ["Traditional Chinese", "Simplified Chinese"], 

336 "漢語拼音": "Pinyin", 

337 "粵拼": "Jyutping", 

338 "現代標準漢語": "Standard Chinese", 

339 "文言文": "Classical Chinese", 

340 "官話白話文": "Written vernacular Chinese", 

341 "粵語": "Cantonese", 

342 "吳語": "Wu", 

343 "廣州話": "Cantonese", 

344} 

345 

# Chinese language-variety names used as classifier tags, mapped to their
# English names.
# https://zh.wiktionary.org/wiki/Template:zh-mw
# https://zh.wiktionary.org/wiki/Module:Zh/templates
ZH_TAGS: dict[str, str] = {
    "官話": "Mandarin",
    "贛語": "Gan",
    "客家話": "Hakka",
    "晉語": "Jin",
    "閩北語": "Northern Min",
    "閩東語": "Eastern Min",
    "閩南語": "Southern Min",
    "潮州話": "Teochew",
    "湘語": "Xiang",
}

360 

361# https://zh.wiktionary.org/wiki/Template:Zh-pron 

362# https://zh.wiktionary.org/wiki/Module:Zh-pron 

363ZH_PRON_TAGS = { 

364 "拼音": "Pinyin", 

365 "注音": "Bopomofo", 

366 "潮州話拼音": "Peng'im", 

367 "上海": "Shanghai", 

368 "吳語學堂拼音": "Wugniu", 

369 "通用拼音": "Tongyong-Pinyin", 

370 "威妥瑪拼音": "Wade–Giles", 

371 "耶魯官話拼音": "Yale", 

372 "國語羅馬字": "Gwoyeu-Romatsyh", 

373 "西里爾字母轉寫": "Palladius", 

374 "漢語國際音標": "Sinological-IPA", 

375 "耶魯粵拼": ["Yale", "Jyutping"], 

376 "廣州話拼音": ["Cantonese", "Pinyin"], 

377 "廣東拼音": "Guangdong-Romanization", 

378 "國際音標": "IPA", 

379 "模仿白話字": "POJ", 

380} 

381 

382 

# Single lookup table used by translate_raw_tags(): every raw-tag table
# merged into one dict. When a key appears in more than one table, the
# table unpacked later wins.
ALL_TAGS: dict[str, str | list[str]] = {
    **GRAMMATICAL_TAGS,
    **LABEL_TAGS,
    **ZH_X_TAGS,
    **ZH_TAGS,
    **ZH_PRON_TAGS,
}

390 

391 

def translate_raw_tags(data: WordEntry) -> WordEntry:
    """Convert the recognised entries of ``data.raw_tags`` in place.

    Each raw tag is handled by the first rule that matches:

    1. If it is a key of ``ALL_TAGS``, its translation (one tag or a list
       of tags) is added to ``data.tags``, skipping tags already present.
    2. Otherwise, if it is a key of ``LABEL_TOPICS`` and ``data`` has a
       ``topics`` attribute, the mapped value is appended to
       ``data.topics``.
    3. Otherwise it is kept in ``data.raw_tags``.

    Args:
        data: Object exposing ``raw_tags`` and ``tags`` lists and,
            optionally, a ``topics`` list. It is mutated in place.

    Returns:
        The same ``data`` object, for call chaining.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in ALL_TAGS:
            translated = ALL_TAGS[raw_tag]
            # Normalise a single tag to a one-element list so both shapes
            # go through the same de-duplicating path. Previously list
            # values were added with extend() and could repeat tags.
            if isinstance(translated, str):
                translated = [translated]
            if isinstance(translated, list):
                for tag in translated:
                    if tag not in data.tags:
                        data.tags.append(tag)
        elif raw_tag in LABEL_TOPICS and hasattr(data, "topics"):
            # The hasattr() guard lets the function accept objects that
            # have no ``topics`` attribute.
            data.topics.append(LABEL_TOPICS[raw_tag])
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated
    return data

407 

408 

409# https://zh.wiktionary.org/wiki/Template:T 

410# https://zh.wiktionary.org/wiki/Template:Head 

411# https://zh.wiktionary.org/wiki/Module:Gender_and_number 

412TEMPLATE_TAG_ARGS = { 

413 "f": "feminine", 

414 "m": "masculine", 

415 "n": "neuter", 

416 "c": "common", 

417 # Animacy 

418 "an": "animate", 

419 "in": "inanimate", 

420 # Animal (for Ukrainian, Belarusian, Polish) 

421 "anml": "animal", 

422 # Personal (for Ukrainian, Belarusian, Polish) 

423 "pr": "personal", 

424 # Nonpersonal not currently used 

425 "np": "nonpersonal", 

426 # Virility (for Polish) 

427 "vr": "virile", 

428 "nv": "nonvirile", 

429 # Numbers 

430 "s": "singular number", 

431 "d": "dual number", 

432 "p": "plural number", 

433 # Verb qualifiers 

434 "impf": "imperfective", 

435 "pf": "perfective", 

436 "mf": ["masculine", "feminine"], 

437}