Coverage for src/wiktextract/extractor/zh/tags.py: 100%

31 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from .models import WordEntry 

2from .topics import LABEL_TOPICS 

3 

# Grammatical gender labels (Chinese label -> English tag).
GENDER_TAGS: dict[str, str] = {
    "陰性": "feminine",
    "陽性": "masculine",
    "中性": "neuter",
}

9 

# Grammatical number labels, including the combined
# definiteness/case/gender + number labels (Chinese label -> English tag).
NUMBER_TAGS: dict[str, str] = {
    # plain number
    "單數": "singular",
    "複數": "plural",
    # definiteness + number
    "定單數": "definite singular",
    "不定複數": "indefinite plural",
    "定複數": "definite plural",
    # case + number
    "斜格複數": "oblique plural",
    "主格單數": "nominative singular",
    "主格複數": "nominative plural",
    "屬格單數": "genitive singular",
    "屬格複數": "genitive plural",
    # gender + number
    "陰性單數": "feminine singular",
    "陽性單數": "masculine singular",
    "陰性複數": "feminine plural",
    "陽性複數": "masculine plural",
    "中性複數": "neuter plural",
    "中性單數": "neuter singular",
}

28 

# Countability labels (Chinese label -> English tag).
# Background: https://en.wikipedia.org/wiki/Count_noun
COUNT_TAGS: dict[str, str] = {
    "可數": "countable",
    "不可數": "uncountable",
}

34 

# Grammatical labels that do not belong to any of the other tables.
OTHER_TAGS: dict[str, str] = {
    "指小詞": "diminutive",
    "變格類型": "declension pattern",
}

39 

# Verb labels: transitivity and verb-object structure
# (Chinese label -> English tag).
VERB_TAGS: dict[str, str] = {
    "及物": "transitive",
    "不及物": "intransitive",
    "动宾结构": "verb-object",
}

45 

# Japanese verb stem forms (label -> English tag).
# Two of the forms are listed under two spellings ("連體形"/"連体形" and
# "假定形"/"仮定形"); both spellings map to the same tag.
# Background: https://en.wikipedia.org/wiki/Japanese_grammar#Stem_forms
JA_STEM_FORMS: dict[str, str] = {
    "未然形": "imperfective",
    "連用形": "continuative",
    "終止形": "terminal",
    "連體形": "attributive",
    "連体形": "attributive",
    "假定形": "hypothetical",
    "仮定形": "hypothetical",
    "命令形": "imperative",
}

57 

# Labels for derived verb forms ending in "形" (label -> English tag).
# Background: https://en.wikipedia.org/wiki/Voice_(grammar)
VOICE_TAGS: dict[str, str] = {
    "被動形": "passive",
    "使役形": "causative",
    "可能形": "potential",
    "意志形": "volitional",
    "否定形": "negative",
    "否定連用形": "negative continuative",
    "尊敬形": "formal",
    "完成形": "perfective",
    "接續形": "conjunctive",
    "條件形": "hypothetical conditional",
}

71 

# Degrees of comparison (Chinese label -> English tag).
# Background: https://en.wikipedia.org/wiki/Comparison_(grammar)
COMPARISON_TAGS: dict[str, str] = {
    "原级": "positive",
    "比較級": "comparative",
    "最高級": "superlative",
}

78 

# Union of all grammatical tag tables above.  The tables are merged left to
# right, so if a key appeared in more than one table the right-most table
# would supply its value.
GRAMMATICAL_TAGS: dict[str, str] = (
    GENDER_TAGS
    | NUMBER_TAGS
    | COUNT_TAGS
    | OTHER_TAGS
    | VERB_TAGS
    | JA_STEM_FORMS
    | VOICE_TAGS
    | COMPARISON_TAGS
)

89 

# Sense/usage label translations (Chinese label -> English tag or tags).
# A value is either a single tag string or a list of tag strings.
#
# Sources:
#   https://zh.wiktionary.org/wiki/Template:Label
#   https://zh.wiktionary.org/wiki/Module:Labels/data
#   https://zh.wiktionary.org/wiki/Template:Qualifier
#   https://zh.wiktionary.org/wiki/Template:古
#   https://zh.wiktionary.org/wiki/Template:注释
LABEL_TAGS = {
    # --- short labels ---
    "棄用": "obsolete",
    "比喻": "figuratively",
    "古": "archaic",
    "陽": "masculine",
    "陰": "feminine",
    "喻": "figuratively",
    "書": "literary",
    "口": "colloquial",
    "俚": "slang",
    "俗": "slang",
    "方": "dialectal",
    "废": "obsolete",
    "貶": "derogatory",
    "罕": "rare",
    "引": "broadly",
    "現已罕用": "archaic",
    # --- taken from Module:Labels/data ---
    "back slang": "slang",
    "synecdochically": "synecdoche",
    "不再自由造詞": "idiomatic",
    "不及物": "intransitive",
    "不可數": "uncountable",
    "不定": "indefinite",
    "不常見": "uncommon",
    "不推薦使用": "proscribed",
    "中性": "neuter",
    "中間被動語態": "mediopassive",
    "中間語態": "middle",
    "主動語態": "active",
    "主要用於否定": "usually with-negation",
    "交互": "reciprocal",
    "以單數形式": "singular",
    "以複數形式": "in-plural",
    "作定語": "attributive",
    "作格": "ergative",
    "作表語": "predicative",
    "使役": "causative",
    "俗語": "idiomatic",
    "俚語": "slang",
    "兒童用語": "childish",
    "公文": "bureaucratese",
    "冒犯": "offensive",
    "分詞": "participle",
    "前古典": "pre-Classical",
    "助動詞": "auxiliary",
    "助記符": "mnemonic",
    "及物": "transitive",
    "反問句": "rhetoric",
    "反身": "reflexive",
    "口語": "colloquial",
    "古舊": "archaic",
    "可數": "countable",
    "同性戀俚語": "slang LGBT",
    "名詞化": "noun-from-verb",
    "唯單": "singular singular-only singular",
    "唯複": "plural plural-only",
    "國際音標": "IPA",
    "基數詞": "cardinal",
    "大寫": "capitalized",
    "委婉": "euphemistic",
    "字面義": "literally",
    "完整": "perfect",
    "完整體": "perfective",
    "定語": "attributive",
    "實詞": "substantive",
    "尊敬": "honorific",
    "常用複數": "plural-normally",
    "幽默": "humorous",
    "序數詞": "ordinal",
    "廣義來說": "broadly",
    "引申": "broadly",
    "弱祈使式": "jussive",
    "強調": "emphatic",
    "後古典": "obsolete",
    "性別中立": "gender-neutral",
    "情態": "modal",
    "愛稱": "endearing",
    "所有格代詞": "possessive pronoun without-noun",
    "押韻俚語": "slang",
    "抽象名詞": "abstract-noun",
    "擬態詞": "ideophonic",
    "擬聲詞": "onomatopoeic",
    "新詞": "neologism",
    "方言": "dialectal",
    "書面": "literary",
    "有比較級": "comparable",
    "有生": "animate",
    "正式": "formal",
    "歷史": "historical",
    "比喻義": "figuratively",
    "無人稱": "impersonal",
    "無比較級": "not-comparable",
    "無生": "inanimate",
    "焦點": "focus",
    "狹義": "narrowly",
    "監獄俚語": "slang",
    "直陳語氣": "indicative",
    "短信": "Internet",
    "祈使語氣": "imperative",
    "禮貌": "polite",
    "種族歧視語": "slur",
    "粉絲用語": "slang lifestyle",
    "粗俗": "vulgar",
    "系動詞": "copulative",
    "網路用語": "Internet",
    "縮寫": "abbreviation",
    "罕用": "rare",
    "臨時語": "nonce-word",
    "虛擬語氣": "subjunctive",
    "表語": "predicative",
    "被動語態": "passive",
    "視覺方言": "pronunciation-spelling",
    "親切": "familiar",
    "詈語": "expletive",
    "詩歌": "poetic",
    "誇飾": "excessive",
    "語中音省略": "syncope",
    "諷刺": "sarcastic",
    "謙遜": "humble",
    "貶義": "derogatory",
    "轉喻義": "metonymically",
    "返璞詞": "retronym",
    "過時": "dated",
    "陰性": "feminine",
    "陽性": "masculine",
    "雙及物動詞": "ditransitive",
    "靜態動詞": "stative",
    "非完整": "imperfect",
    "非完整體": "imperfective",
    "非常罕用": "rare",
    "非標準": "nonstandard",
    "非標準形式": "nonstandard",
    "非正式": "informal",
    "首字母縮略詞": "initialism",
    "駭客語": "Leet Internet",
    "高語域": "honorific",
    "中醫": "Traditional-Chinese-Medicine",
    "修辭學": "rhetoric",
    "印度教": "Hinduism",
    "摩門教": "Mormonism",
    "物理": "particle",
    "猶太教": "Judaism",
    "納粹主義": "Nazism",
    "網際網路": "Internet",
    "耆那教": "Jainism",
    "聖經": "Biblical",
    "解剖學": "anatomy",
    "貴格會": "Quakerism",
    "錫克教": "Sikhism",
    "馬克思主義": "Marxism",
    # --- also from Module:Labels/data, but translated manually ---
    "喃字": "Chu-Nom",
    "反身代詞": "reflexive",
    "字面意義": "literally",
    "成語": "Chengyu",
    "及物、不及物": ["transitive", "intransitive"],
    "集合名詞": "collective",
    "控制動詞": "control-verb",
    "省略": "ellipsis",
    "分數": "fractional",
    "以雙數形式": "dual",
    "主要用於否定複數": ["negative", "plural"],
    "數詞縮寫": ["numeral", "abbreviation"],
    "主要用於肯定": "positive",
}

261 

# Labels that appear in example sentences (label -> English tag or tags).
# A value is either a single tag string or a list of tag strings.
#
# Sources:
#   https://zh.wiktionary.org/wiki/Template:Zh-x
#   https://zh.wiktionary.org/wiki/Module:Zh-usex/data
ZH_X_TAGS = {
    # script
    "繁體": "Traditional Chinese",
    "簡體": "Simplified Chinese",
    "繁體和簡體": ["Traditional Chinese", "Simplified Chinese"],
    # romanization
    "漢語拼音": "Pinyin",
    "粵拼": "Jyutping",
    # language variety
    "現代標準漢語": "Standard Chinese",
    "文言文": "Classical Chinese",
    "官話白話文": "Written vernacular Chinese",
    "粵語": "Cantonese",
    "吳語": "Wu",
    "廣州話": "Cantonese",
}

278 

# Chinese variety names used as classifier tags (label -> English tag).
#
# Sources:
#   https://zh.wiktionary.org/wiki/Template:zh-mw
#   https://zh.wiktionary.org/wiki/Module:Zh/templates
ZH_TAGS: dict[str, str] = {
    "官話": "Mandarin",
    "贛語": "Gan",
    "客家話": "Hakka",
    "晉語": "Jin",
    "閩北語": "Northern Min",
    "閩東語": "Eastern Min",
    "閩南語": "Southern Min",
    "潮州話": "Teochew",
    "湘語": "Xiang",
}

293 

# Labels used in pronunciation sections (label -> English tag or tags).
# A value is either a single tag string or a list of tag strings.
#
# Sources:
#   https://zh.wiktionary.org/wiki/Template:Zh-pron
#   https://zh.wiktionary.org/wiki/Module:Zh-pron
ZH_PRON_TAGS = {
    "拼音": "Pinyin",
    "注音": "Bopomofo",
    "潮州話拼音": "Peng'im",
    "上海": "Shanghai",
    "吳語學堂拼音": "Wugniu",
    "通用拼音": "Tongyong-Pinyin",
    "威妥瑪拼音": "Wade–Giles",
    "耶魯官話拼音": "Yale",
    "國語羅馬字": "Gwoyeu-Romatsyh",
    "西里爾字母轉寫": "Palladius",
    "漢語國際音標": "Sinological-IPA",
    "耶魯粵拼": ["Yale", "Jyutping"],
    "廣州話拼音": ["Cantonese", "Pinyin"],
    "廣東拼音": "Guangdong-Romanization",
    "國際音標": "IPA",
    "模仿白話字": "POJ",
}

314 

315 

# Single lookup table consulted by translate_raw_tags().  The tables are
# merged left to right, so for a key present in several tables the
# right-most table supplies the value.
ALL_TAGS = (
    GRAMMATICAL_TAGS
    | LABEL_TAGS
    | ZH_X_TAGS
    | ZH_TAGS
    | ZH_PRON_TAGS
)

323 

324 

def translate_raw_tags(data: WordEntry) -> WordEntry:
    """Translate the recognised entries of ``data.raw_tags`` in place.

    Each raw tag is looked up in ``ALL_TAGS`` first and, if absent there,
    in ``LABEL_TOPICS`` (the latter only when ``data`` has a ``topics``
    attribute):

    * An ``ALL_TAGS`` hit adds the translation to ``data.tags``.  The table
      value may be a single string or a list of strings; in both cases a
      tag that is already present in ``data.tags`` is not added again.
    * A ``LABEL_TOPICS`` hit appends the table value to ``data.topics``.
    * Anything else is kept, in its original order, in ``data.raw_tags``.

    Raw tags found in either table are removed from ``data.raw_tags``.

    Args:
        data: Object whose ``raw_tags`` and ``tags`` lists (and ``topics``
            list, when it has one) are updated in place.

    Returns:
        The same ``data`` object that was passed in.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in ALL_TAGS:
            tr_tag = ALL_TAGS[raw_tag]
            # Treat a single string as a one-element list so that both value
            # shapes go through the same duplicate check below.
            if isinstance(tr_tag, str):
                tr_tag = [tr_tag]
            if isinstance(tr_tag, list):
                for tag in tr_tag:
                    # Previously list values were added with extend(), which
                    # could duplicate tags, e.g. "及物" followed by
                    # "及物、不及物" produced "transitive" twice.
                    if tag not in data.tags:
                        data.tags.append(tag)
        elif raw_tag in LABEL_TOPICS and hasattr(data, "topics"):
            data.topics.append(LABEL_TOPICS[raw_tag])
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated
    return data

340 

341 

# Gender/number/aspect codes passed as template arguments
# (argument code -> English tag).
#
# Sources:
#   https://zh.wiktionary.org/wiki/Template:T
#   https://zh.wiktionary.org/wiki/Template:Head
#   https://zh.wiktionary.org/wiki/Module:Gender_and_number
TEMPLATE_TAG_ARGS: dict[str, str] = {
    # Gender
    "f": "feminine",
    "m": "masculine",
    "n": "neuter",
    "c": "common",
    # Animacy
    "an": "animate",
    "in": "inanimate",
    # Animal (for Ukrainian, Belarusian, Polish)
    "anml": "animal",
    # Personal (for Ukrainian, Belarusian, Polish)
    "pr": "personal",
    # Nonpersonal not currently used
    "np": "nonpersonal",
    # Virility (for Polish)
    "vr": "virile",
    "nv": "nonvirile",
    # Numbers
    "s": "singular number",
    "d": "dual number",
    "p": "plural number",
    # Verb qualifiers
    "impf": "imperfective aspect",
    "pf": "perfective aspect",
    # Combined gender code
    "mf": "masculine feminine",
}