Coverage for src/wiktextract/extractor/ko/tags.py: 90%

20 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from .models import WordEntry 

2 

3# https://ko.wiktionary.org/wiki/모듈:labels/data/topical 

4# https://ko.wiktionary.org/wiki/모듈:labels/data 

# Korean labels attached to glosses, mapped to their English tag.
# Entries that are commented out have an empty value in the original table
# (presumably no English tag has been chosen for them yet).
GLOSS_TAGS = {
    "인명": "name",
    "고어": "archaic",
    "구식": "archaic",
    # "대명동사": "",
    # "말고름": "",
    "비유": "metaphoric",
    "사어": "obsolete",  # dead language
    "유아어": "baby-talk",
    "자동사": "intransitive",
    "직역": "literally",
    "타동사": "transitive",
    "드물게": "rare",
    "원래의 의미": "naturally",
    "문학적": "literary",
    "해학적": "humorous",
    "완곡적": "euphemistic",
}

23 

# Labels seen in pronunciation sections, mapped to one English tag (str) or
# to several English tags (list of str).
SOUND_TAGS = {
    # Template 틀:ko-IPA
    "Revised Romanization": ["revised", "romanization"],
    "Revised Romanization (translit.)": [
        "revised",
        "romanization",
        "transliteration",
    ],
    "McCune-Reischauer": "McCune-Reischauer",
    "Yale Romanization": ["Yale", "romanization"],
    "표준어/서울": ["SK-Standard", "Seoul"],
    # Template 틀:ja-pron
    "도쿄": "Tokyo",
    # Templates 틀:발음 듣기 and 틀:IPA
    "영국": "UK",
    "미국": "US",
    "영": "UK",
    "미": "US",
    "표준": "standard",
    "남부": "South",
    "북부": "North",
    "고대": "archaic",
    "동부": "East",
    "서부": "West",
    "포르투갈": "Portugal",
    "이집트": "Egypt",
    "시리아": "Syria",
    "브라질": "Brazil",
    "독일": "Germany",
    "현대": "modern",
    "캐나다": "Canada",
    "하노이": "Hanoi",
    "브라질 남부": "Southern-Brazil",
    "벨기에": "Belgium",
    "이란": "Iran",
    "파리": "Paris",
    "모로코": "Morocco",
    "베를린": "Berlin",
    "비격식체": "informal",
    "민난어 장저우": ["Min-Nan", "Zhangzhou"],
}

65 

# Form labels mapped to their English tag.
HEADER_TAGS = {
    # Template 틀:한국어_동사
    "부정사형": "infinitive",
    "연결어미형": "sequential",
    "명사형": "noun",
    "사동사": "causative",
}

73 

74# also in linkage lists 

# Labels mapped to one English tag (str) or to several English tags
# (list of str).
TRANSLATION_TAGS = {
    "남성": "masculine",
    "여성": "feminine",
    "라틴": "Latin",
    "중성": "neuter",
    "간체": "Simplified Chinese",
    "번체": "Traditional Chinese",
    "번체자": "Traditional Chinese",
    "오스트리아": "Austria",
    "표준어": "standard",
    "히브리 문자": ["Hebrew", "letter"],
    "아랍 문자": ["Arabic", "letter"],
    "복수형": "plural",
    "단수": "singular",
    "복수": "plural",
    "불완료체": "imperfect",
    "완료체": "completive",
    "양성": "masculine",
    "바이에른 방언": ["Bavarian", "dialectal"],
    "광둥어": "Cantonese",
    "오스트레일리아": "Australia",
    "글라골 문자": ["Glagolitic", "letter"],
    "속어": "slang",
    "멕시코 속어": ["Mexico", "slang"],
    "에스파냐": "Spain",
    "가타카나": "katakana",
    "고어": "archaic",
    "쯔놈": "Chu-Nom",
    "형용사": "adjective",
    "사투리": "dialectal",
    "약자": "abbreviation",
    "동사": "verb",
    "드문 단어": "rare",
}

109 

# Single lookup table built from the four tag tables.  The tables are
# unpacked in this order, so for a key present in more than one table the
# value from the table listed last is the one kept.
TAGS = {
    **GLOSS_TAGS,
    **SOUND_TAGS,
    **HEADER_TAGS,
    **TRANSLATION_TAGS,
}

111 

# Korean subject labels mapped to their English topic name.
# The commented-out entries are kept as they appear in the original table.
TOPICS = {
    "금융": "finance",
    "광고": "advertising",
    "군사": "military",
    "어류": "fish",
    "물리": "physics",
    "법률": "law",
    "식물": "botany",
    "역사": "history",
    "의류": "clothing",
    "의학": "medicine",
    "전기": "electricity",
    # "조류": "birds",
    "지리": "geography",
    "프로그래밍": "programming",
    "컴퓨터": "computer",
    "해부학": "anatomy",
    "정치": "politics",
    "종교": "religion",
    "가톨릭": "Catholicism",
    "축구": "football",
    # "체육": "physical-education",
}

135 

136 

def translate_raw_tags(data: WordEntry) -> None:
    """Move recognised entries of ``data.raw_tags`` into tags and topics.

    For every raw tag, in order:

    * if it is a key of ``TAGS``, its translation is added to ``data.tags``
      (appended when it is a string, extended when it is a list);
    * otherwise, if ``data`` has a ``topics`` attribute and the raw tag is a
      key of ``TOPICS``, the topic is appended to ``data.topics``;
    * otherwise the raw tag is kept.

    ``data.raw_tags`` is finally replaced by a new list holding only the
    kept raw tags, in their original order.  Nothing is returned.
    """
    unresolved = []
    for label in data.raw_tags:
        if label not in TAGS:
            # Not a known tag: try the topic table, else keep it untouched.
            if hasattr(data, "topics") and label in TOPICS:
                data.topics.append(TOPICS[label])
            else:
                unresolved.append(label)
            continue

        translation = TAGS[label]
        if isinstance(translation, str):
            data.tags.append(translation)
        elif isinstance(translation, list):
            data.tags.extend(translation)
    data.raw_tags = unresolved