Coverage for src/wiktextract/extractor/zh/tags.py: 98%

1from .models import WordEntry

2from .topics import LABEL_TOPICS

# Grammatical gender labels (Chinese label text -> English tag).
# Merged into GRAMMATICAL_TAGS below.
GENDER_TAGS: dict[str, str] = {
    # feminine
    "陰性": "feminine",
    "阴性": "feminine",
    "陰性形式": "feminine",
    "陰性等價詞": "feminine",
    # masculine
    "陽性": "masculine",
    "陽性形式": "masculine",
    # neuter
    "中性": "neuter",
    "中性形式": "neuter",
}

15NUMBER_TAGS: dict[str, str | list[str]] = {

16 "單數": "singular",

17 "单数": "singular",

18 "複數": "plural",

19 "复数": "plural",

20 "定單數": ["definite", "singular"],

21 "定单数": ["definite", "singular"],

22 "不定單數": ["indefinite", "singular"],

23 "不定单数": ["indefinite", "singular"],

24 "不定複數": ["indefinite", "plural"],

25 "不定复数": ["indefinite", "plural"],

26 "定複數": ["definite", "plural"],

27 "斜格複數": ["oblique", "plural"],

28 "主格單數": ["nominative", "singular"],

29 "主格複數": ["nominative", "plural"],

30 "屬格單數": ["genitive", "singular"],

31 "屬格複數": ["genitive", "plural"],

32 "陰性單數": ["feminine", "singular"],

33 "陽性單數": ["masculine", "singular"],

34 "陰性複數": ["feminine", "plural"],

35 "陽性複數": ["masculine", "plural"],

36 "中性複數": ["neuter", "plural"],

37 "中性單數": ["neuter", "singular"],

38 "賓格單數": ["accusative", "singular"],

39 "賓格複數": ["accusative", "plural"],

40 "無複數": "no-plural",

41}

# Countability labels.
# https://en.wikipedia.org/wiki/Count_noun
COUNT_TAGS: dict[str, str] = {
    "可數": "countable",
    "不可數": "uncountable",
}

49OTHER_TAGS: dict[str, str] = {

50 "指小詞": "diminutive",

51 "指小": "diminutive",

52 "變格類型": "declension-pattern-of",

53 "屬格": "genitive",

54 "部分格": "partitive",

55 "個人": "person",

56 "無屈折": "indeclinable",

57 "諺文": "hangeul",

58 "漢字": "hanja",

59 # Template:cs-proper noun

60 "相關形容詞": ["relational", "adjective"],

61 "關係形容詞": ["relational", "adjective"],

62 "居民稱謂詞": "demonym",

63 "女性居民稱謂詞": ["feminine", "demonym"],

64 "定賓格": ["definite", "accusative"],

65 "定宾格": ["definite", "accusative"],

66 "拉丁字母拼寫": "romanization",

67 "定指賓格": ["definite", "accusative"],

68 "前元音和諧變體": "front-vowel-harmony",

69 # Template:zh-forms

70 "正體": "Standard-Chinese",

71 "繁體": "Traditional-Chinese",

72 "簡體": "Simplified-Chinese",

73 "異體": "alternative",

74}

# Verb-related labels: transitivity, aspect, conjugation class, mood.
VERB_TAGS: dict[str, str] = {
    # transitivity / structure
    "及物": "transitive",
    "不及物": "intransitive",
    "动宾结构": "verb-object",
    # aspect
    "非完": "imperfective",
    "完": "perfective",
    # other
    "強變化": "strong",
    "動名詞": "supine",
    "命令式": "imperative",
}

# Japanese stem-form labels; some forms are listed under two character
# variants that map to the same tag.
# https://en.wikipedia.org/wiki/Japanese_grammar#Stem_forms
JA_STEM_FORMS: dict[str, str] = {
    "未然形": "imperfective",
    "連用形": "continuative",
    "終止形": "terminal",
    # attributive
    "連體形": "attributive",
    "連体形": "attributive",
    # hypothetical
    "假定形": "hypothetical",
    "仮定形": "hypothetical",
    "命令形": "imperative",
}

99# https://en.wikipedia.org/wiki/Voice_(grammar)

100VOICE_TAGS: dict[str, str | list[str]] = {

101 "被動形": "passive",

102 "使役形": "causative",

103 "可能形": "potential",

104 "意志形": "volitional",

105 "否定形": "negative",

106 "否定連用形": ["negative", "continuative"],

107 "尊敬形": "formal",

108 "完成形": "perfective",

109 "接續形": "conjunctive",

110 "條件形": ["hypothetical", "conditional"],

111}

112

# Degrees of comparison.
# https://en.wikipedia.org/wiki/Comparison_(grammar)
COMPARISON_TAGS: dict[str, str] = {
    "原级": "positive",
    "比較級": "comparative",
    "最高級": "superlative",
}

119

120TENSE_TAGS = {

121 "過去時": "preterite",

122 "過去式": "past",

123 "過去分詞": ["past", "participle"],

124 "現在時": "present",

125 "第三人稱單數現在時": ["third-person", "singular", "present"],

126 "助動詞": "auxiliary",

127 # Template:de-verb

128 "弱變化": "weak",

129 "弱变化": "weak",

130 "第三人稱單數簡單現在時": ["third-person", "singular", "present"],

131 "現在分詞": ["present", "participle"],

132 "一般過去時及過去分詞": ["past", "participle"],

133 # Template:it-verb

134 "第一人稱單數現在時": ["first-person", "singular", "present"],

135 "第一人稱單數先過去時": ["first-person", "singular", "past", "historic"],

136 # Template:de-adj

137 "強變化主格陽性單數": ["strong", "nominative", "masculine", "singular"],

138 # Template:la-verb

139 "现在时不定式": ["present", "infinitive"],

140 "完成时主动式": ["perfect", "active"],

141 "目的动名词": "supine",

142}

143

# Union of all grammatical tables defined above.  Tables are merged in the
# order listed, so for a key present in several tables the value from the
# later table is the one kept.
# Several source tables hold list values, so the merged value type is
# ``str | list[str]`` (the annotation used to claim plain ``str`` only).
GRAMMATICAL_TAGS: dict[str, str | list[str]] = {
    **GENDER_TAGS,
    **NUMBER_TAGS,
    **COUNT_TAGS,
    **OTHER_TAGS,
    **VERB_TAGS,
    **JA_STEM_FORMS,
    **VOICE_TAGS,
    **COMPARISON_TAGS,
    **TENSE_TAGS,
}

155

156# https://zh.wiktionary.org/wiki/Template:Label

157# https://zh.wiktionary.org/wiki/Module:Labels/data

158# https://zh.wiktionary.org/wiki/Template:Qualifier

159# https://zh.wiktionary.org/wiki/Template:古

160# https://zh.wiktionary.org/wiki/Template:注释

161LABEL_TAGS = {

162 "棄用": "obsolete",

163 "弃用": "obsolete",

164 "比喻": "figuratively",

165 "古": "archaic",

166 "陽": "masculine",

167 "陰": "feminine",

168 "喻": "figuratively",

169 "書": "literary",

170 "口": "colloquial",

171 "俚": "slang",

172 "俗": "slang",

173 "方": "dialectal",

174 "废": "obsolete",

175 "貶": "derogatory",

176 "罕": "rare",

177 "引": "broadly",

178 "現已罕用": "archaic",

179 # Module:Labels/data

180 "back slang": "slang",

181 "synecdochically": "synecdoche",

182 "不再自由造詞": "idiomatic",

183 "不及物": "intransitive",

184 "不可數": "uncountable",

185 "不定": "indefinite",

186 "不常見": "uncommon",

187 "不推薦使用": "proscribed",

188 "中性": "neuter",

189 "中間被動語態": "mediopassive",

190 "中間語態": "middle",

191 "主動語態": "active",

192 "主要用於否定": ["usually", "with-negation"],

193 "交互": "reciprocal",

194 "以單數形式": "singular",

195 "以複數形式": "in-plural",

196 "作定語": "attributive",

197 "作格": "ergative",

198 "作表語": "predicative",

199 "使役": "causative",

200 "俗語": "idiomatic",

201 "俚語": "slang",

202 "俚语": "slang",

203 "兒童用語": "childish",

204 "公文": "bureaucratese",

205 "冒犯": "offensive",

206 "分詞": "participle",

207 "前古典": "pre-Classical",

208 "助動詞": "auxiliary",

209 "助記符": "mnemonic",

210 "及物": "transitive",

211 "反問句": "rhetoric",

212 "反身": "reflexive",

213 "口語": "colloquial",

214 "口语": "colloquial",

215 "古舊": "archaic",

216 "可數": "countable",

217 "同性戀俚語": ["slang", "LGBT"],

218 "名詞化": "noun-from-verb",

219 "唯單": "singular-only",

220 "唯複": "plural-only",

221 "國際音標": "IPA",

222 "基數詞": "cardinal",

223 "大寫": "capitalized",

224 "委婉": "euphemistic",

225 "字面義": "literally",

226 "完整": "perfect",

227 "完整體": "perfective",

228 "定語": "attributive",

229 "實詞": "substantive",

230 "尊敬": "honorific",

231 "敬語": "honorific",

232 "敬语": "honorific",

233 "常用複數": "plural-normally",

234 "幽默": "humorous",

235 "序數詞": "ordinal",

236 "廣義來說": "broadly",

237 "引申": "broadly",

238 "弱祈使式": "jussive",

239 "強調": "emphatic",

240 "後古典": "obsolete",

241 "性別中立": "gender-neutral",

242 "情態": "modal",

243 "愛稱": "endearing",

244 "所有格代詞": ["possessive", "pronoun", "without-noun"],

245 "押韻俚語": "slang",

246 "抽象名詞": "abstract-noun",

247 "擬態詞": "ideophonic",

248 "擬聲詞": "onomatopoeic",

249 "新詞": "neologism",

250 "方言": "dialectal",

251 "書面": "literary",

252 "书面": "literary",

253 "有比較級": "comparable",

254 "有生": "animate",

255 "正式": "formal",

256 "歷史": "historical",

257 "比喻義": "figuratively",

258 "無人稱": "impersonal",

259 "無比較級": "not-comparable",

260 "無生": "inanimate",

261 "焦點": "focus",

262 "狹義": "narrowly",

263 "監獄俚語": "slang",

264 "直陳語氣": "indicative",

265 "短信": "Internet",

266 "祈使語氣": "imperative",

267 "禮貌": "polite",

268 "種族歧視語": "slur",

269 "粉絲用語": ["slang", "lifestyle"],

270 "粗俗": "vulgar",

271 "系動詞": "copulative",

272 "網路用語": "Internet",

273 "縮寫": "abbreviation",

274 "罕用": "rare",

275 "臨時語": "nonce-word",

276 "虛擬語氣": "subjunctive",

277 "表語": "predicative",

278 "被動語態": "passive",

279 "視覺方言": "pronunciation-spelling",

280 "親切": "familiar",

281 "詈語": "expletive",

282 "詩歌": "poetic",

283 "誇飾": "excessive",

284 "語中音省略": "syncope",

285 "諷刺": "sarcastic",

286 "謙遜": "humble",

287 "貶義": "derogatory",

288 "轉喻義": "metonymically",

289 "返璞詞": "retronym",

290 "過時": "dated",

291 "陰性": "feminine",

292 "陽性": "masculine",

293 "雙及物動詞": "ditransitive",

294 "靜態動詞": "stative",

295 "非完整": "imperfect",

296 "非完整體": "imperfective",

297 "非常罕用": "rare",

298 "非標準": "nonstandard",

299 "非标准": "nonstandard",

300 "非標準形式": "nonstandard",

301 "非正式": "informal",

302 "首字母縮略詞": "initialism",

303 "駭客語": ["Leet", "Internet"],

304 "高語域": "honorific",

305 "中醫": "Traditional-Chinese-Medicine",

306 "修辭學": "rhetoric",

307 "印度教": "Hinduism",

308 "摩門教": "Mormonism",

309 "物理": "particle",

310 "猶太教": "Judaism",

311 "納粹主義": "Nazism",

312 "網際網路": "Internet",

313 "耆那教": "Jainism",

314 "聖經": "Biblical",

315 "解剖學": "anatomy",

316 "貴格會": "Quakerism",

317 "錫克教": "Sikhism",

318 "馬克思主義": "Marxism",

319 # also from Module:Labels/data, but translated manually

320 "喃字": "Chu-Nom",

321 "反身代詞": "reflexive",

322 "字面意義": "literally",

323 "成語": "Chengyu",

324 "及物、不及物": ["transitive", "intransitive"],

325 "集合名詞": "collective",

326 "控制動詞": "control-verb",

327 "省略": "ellipsis",

328 "分數": "fractional",

329 "以雙數形式": "dual",

330 "主要用於否定複數": ["negative", "plural"],

331 "數詞縮寫": ["numeral", "abbreviation"],

332 "主要用於肯定": "positive",

333 "古典": "Classical",

334 "中國大陸": "Mainland-China",

335 "書面語": "literary",

336}

337

338# example sentence template

339# https://zh.wiktionary.org/wiki/Template:Zh-x

340# https://zh.wiktionary.org/wiki/Module:Zh-usex/data

341ZH_X_TAGS = {

342 "繁體": "Traditional-Chinese",

343 "繁體和": "Traditional-Chinese",

344 "簡體": "Simplified-Chinese",

345 "繁體和簡體": ["Traditional-Chinese", "Simplified-Chinese"],

346 "漢語拼音": "Pinyin",

347 "粵拼": "Jyutping",

348 "現代標準漢語": "Standard-Chinese",

349 "文言文": "Classical-Chinese",

350 "官話白話文": "Written-vernacular-Chinese",

351 "粵語": "Cantonese",

352 "吳語": "Wu",

353 "廣州話": "Cantonese",

354}

355

# Variety names used as classifier tags.
# https://zh.wiktionary.org/wiki/Template:zh-mw
# https://zh.wiktionary.org/wiki/Module:Zh/templates
ZH_TAGS: dict[str, str] = {
    "官話": "Mandarin",
    "贛語": "Gan",
    "客家話": "Hakka",
    "晉語": "Jin",
    "閩北語": "Min-Bei",
    "閩東語": "Min-Dong",
    "閩南語": "Min-Nan",
    "潮州話": "Teochew",
    "湘語": "Xiang",
}

370

371# https://zh.wiktionary.org/wiki/Template:Zh-pron

372# https://zh.wiktionary.org/wiki/Module:Zh-pron

373ZH_PRON_TAGS = {

374 "拼音": "Pinyin",

375 "注音": "Bopomofo",

376 "潮州話拼音": "Peng'im",

377 "上海": "Shanghai",

378 "吳語學堂拼音": "Wugniu",

379 "通用拼音": "Tongyong-Pinyin",

380 "威妥瑪拼音": "Wade–Giles",

381 "耶魯官話拼音": "Yale",

382 "國語羅馬字": "Gwoyeu-Romatsyh",

383 "西里爾字母轉寫": "Cyrillic",

384 "西里爾字母": "Cyrillic",

385 "漢語國際音標": "Sinological-IPA",

386 "耶魯粵拼": ["Yale", "Jyutping"],

387 "廣州話拼音": ["Cantonese", "Pinyin"],

388 "廣東拼音": "Guangdong-Romanization",

389 "國際音標": "IPA",

390 "模仿白話字": "POJ",

391 "標準粵語": "Standard-Cantonese",

392 "廣州–香港話": ["Guangzhou", "Hong Kong"],

393 "福州話": "Fuzhou",

394 "平話字": "Foochow-Romanized",

395 "客家語": "Hakka",

396 "白話字": "Phak-fa-su",

397 "泉漳話": "Hokkien",

398 "泉州": "Quanzhou",

399 "廈門": "Xiamen",

400 "輕尾聲異讀": "toneless-final-syllable-variant",

401 "維基詞典": "Wiktionary-specific",

402 "維基詞典拼音": ["Wiktionary-specific", "Pinyin"],

403 "維基詞典轉寫": "Wiktionary-specific",

404 "成都話": "Chengdu",

405 "四川話拼音": ["Sichuanese", "Pinyin"],

406 "東干語": "Dongan",

407 "台山話": "Taishanese",

408 "四縣": "Sixian",

409 "長沙話": "Changsha",

410 "四川話拉丁化新文字": "Latinxua-Sin-Wenz",

411 "台城": "Taicheng",

412 "南昌話": "Nanchang",

413 "四縣話": "Sixian",

414 "苗栗": "Miaoli",

415 "美濃": "Neipu",

416 "客家語拼音": "Hakka-Romanization-System",

417 "客家話拼音方案": "Hagfa-Pinyim",

418 "太原話": "Taiyuan",

419 "老派": "dated",

420 "新加坡": "Singapore",

421 "臺羅": "Tâi-lô",

422 "普實台文": "Phofsit-Daibuun",

423 "太湖片": "Northern",

424 "吳音小字典": "MiniDict",

425 "維基詞典羅馬化": ["Wiktionary-specific", "romanization"],

426 "上海話": "Shanghai",

427 "中古漢語": "Middle-Chinese",

428 "莆仙語": "Puxian-Min",

429 "莆仙話拼音": "Pouseng-Ping'ing",

430 "莆田": "Putian",

431 "仙遊": "Xianyou",

432 "漳州": "Zhangzhou",

433 "臺北": "Taibei",

434 "高雄": "Kaohsiung",

435 "實際讀音": "phonetic",

436 "臺灣話": "Taiwanese",

437 "常用": "general",

438 "檳城": "Penang",

439 "兒化": "Erhua",

440}

441

# Variety and place labels (Mandarin sub-groups and other regional names).
ZH_DIAL_TAGS: dict[str, str] = {
    "白話文": "Written-vernacular-Chinese",
    "北京": "Beijing",
    # Mandarin groups
    "燕京官話": "Northeastern-Mandarin",
    "冀魯官話": "Jilu-Mandarin",
    "膠遼官話": "Jiaoliao-Mandarin",
    "中原官話": "Central-Plains-Mandarin",
    "蘭銀官話": "Lanyin-Mandarin",
    "西南官話": "Southwestern-Mandarin",
    "江淮官話": "Jianghuai-Mandarin",
    # others
    "徽語": "Huizhou",
    "南部平話": "Southern-Pinghua",
    "濟南": "Jinan",
}

456

457

# Single lookup table used by translate_raw_tags(): every tag table of this
# module merged left to right, so for a key present in several tables the
# right-most table's value is the one kept.
ALL_TAGS: dict[str, str | list[str]] = (
    GRAMMATICAL_TAGS
    | LABEL_TAGS
    | ZH_X_TAGS
    | ZH_TAGS
    | ZH_PRON_TAGS
    | ZH_DIAL_TAGS
)

466

467

def translate_raw_tags(data: WordEntry) -> WordEntry:
    """Translate the raw tags of ``data`` into tags and topics, in place.

    Every entry of ``data.raw_tags`` is handled by the first rule that
    matches:

    * it is a key of ``ALL_TAGS``: its translation (one tag, or each tag of
      a list) is added to ``data.tags``;
    * it is a key of ``LABEL_TOPICS`` and ``data`` has a ``topics``
      attribute: the mapped topic is added to ``data.topics``;
    * otherwise it is kept, once, in ``data.raw_tags``.

    Values already present in ``data.tags`` / ``data.topics`` are not added
    again.  This also holds for list translations, which previously were
    appended unconditionally and could produce repeated tags.

    Args:
        data: Object exposing ``raw_tags`` and ``tags`` lists and,
            optionally, a ``topics`` list.

    Returns:
        The same ``data`` object, after modification.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in ALL_TAGS:
            translated = ALL_TAGS[raw_tag]
            # A table value is either one tag or a list of tags; handle both
            # uniformly so the duplicate check applies to every tag.
            if isinstance(translated, str):
                candidates = [translated]
            elif isinstance(translated, list):
                candidates = translated
            else:
                candidates = []
            for tag in candidates:
                if tag not in data.tags:
                    data.tags.append(tag)
        elif raw_tag in LABEL_TOPICS and hasattr(data, "topics"):
            topic = LABEL_TOPICS[raw_tag]
            if topic not in data.topics:
                data.topics.append(topic)
        elif raw_tag not in untranslated:
            untranslated.append(raw_tag)
    # Only the tags that could not be translated remain as raw tags.
    data.raw_tags = untranslated
    return data

483

484

485# https://zh.wiktionary.org/wiki/Template:T

486# https://zh.wiktionary.org/wiki/Template:Head

487# https://zh.wiktionary.org/wiki/Module:Gender_and_number

488TEMPLATE_TAG_ARGS = {

489 "f": "feminine",

490 "m": "masculine",

491 "n": "neuter",

492 "c": "common",

493 # Animacy

494 "an": "animate",

495 "in": "inanimate",

496 # Animal (for Ukrainian, Belarusian, Polish)

497 "anml": "animal",

498 # Personal (for Ukrainian, Belarusian, Polish)

499 "pr": "personal",

500 # Nonpersonal not currently used

501 "np": "nonpersonal",

502 # Virility (for Polish)

503 "vr": "virile",

504 "nv": "nonvirile",

505 # Numbers

506 "s": "singular number",

507 "d": "dual number",

508 "p": "plural number",

509 # Verb qualifiers

510 "impf": "imperfective",

511 "pf": "perfective",

512 "mf": ["masculine", "feminine"],

513}