Coverage for src/wiktextract/extractor/ja/tags.py: 91%
25 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from .models import WordEntry
# Maps a raw Japanese label to its English tag.  A value is either a single
# tag (str) or several tags (list of str); add_tag() handles both shapes.
# Multi-word tags are hyphenated and a value never packs several tags into
# one space-separated string, because add_tag() appends strings verbatim.
TAGS = {
    "男性": "masculine",
    "女性": "feminine",
    "通性": "common",
    "中性": "neuter",
    "単数": "singular",
    "複数": "plural",
    "不変": "invariable",
    "男性複数": ["masculine", "plural"],
    # Labels from the Japanese Wiktionary template "テンプレート:context/data"
    "くだけた表現": "informal",
    "しばしば": "often",
    "まれ": "rare",
    "アイルランド": "Ireland",
    "アフリカ": "Africa",
    "アメカメカ": "Amecameca",
    "アメリカ合衆国": "US",
    "アルスター": "Ulster",
    "アルゼンチン": "Argentina",
    "アングロ・ノルマン": "Anglo-Norman",
    "アンダルシア": "Andalusia",
    "イェ方言": "Ijekavian",
    "イギリス": "UK",
    "イラン": "Iran",
    "イロン方言": "Iron",
    "インターネット": "Internet",
    "インターネットスラング": "Internet",
    "ウクライナ": "Ukraine",
    "ウルグアイ": "Uruguay",
    "ウーリ語": "Uri",
    "エクアドル": "Ecuador",
    "エ方言": "Ekavian",
    "オノマトペ": "onomatopoeic",
    "オルムルム": "Ormulum",
    "オークニー": "Orkney",
    "オーストリア": "Austrian",
    "カイピラ方言": "Caipira",
    "カイ方言": "Kajkavian",
    "カサレヴサ": "Katharevousa",
    "カナダ": "Canada",
    "カルコーフォロ語": "Carcoforo",
    "ガスコーニュ": "Gascony",
    "ガーンジー": "Guernsey",
    "キューバ": "Cuba",
    "キリスト教": "Christian",
    "クロアチア": "Croatian",
    "グアテマラ": "Guatemala",
    "グレッソネイ語": "Gressoney",
    "ケニア": "Kenya",
    "ケベック": "Quebec",
    "ケルン語": "Kölsch",
    "ケント": "Kentish",
    "コイネー": "Koine",
    "コノート": "Connacht",
    "コロンビア": "Colombia",
    "コンゴ": "Congo",
    "サカティアングイス": "Zacatianguis",
    "サーク": "Sark",
    "シェットランド": "Shetland",
    "シク教": "Sikhism",
    "シンガポール": "Singapore",
    "ジャイナ教": "Jainism",
    "ジャマイカ": "Jamaica",
    "ジャージー": "Jersey",
    "ジンバブエ": "Zimbabwe",
    "スイス": "Switzerland",
    "スコットランド": "Scotland",
    "ストゥシルヴァン": "Sutsilvan",
    "スペイン": "Spain",
    "スルシルヴァン": "Sursilvan",
    "スルミラン": "Surmiran",
    "セルビア": "Serbian",
    "セルリング": "Sylt",
    "タイ英語": "Thailand",
    "タラシケヴィツァ": "Taraškievica",
    "タントユカ": "Tantoyuca",
    "ダリー語": "Dari",
    "チコナメル": "Chiconamel",
    "チコンテペク": "Chicontepec",
    "チャ方言": "Chakavian",
    "チリ": "Chile",
    "ティマウ": "Timau",
    "テペツィントラ": "Tepetzintla",
    "テマパチェ": "Temapache",
    "ディゴル方言": "Digor",
    "デモティキ": "Demotic",
    "トスカナ語": "Tuscany",
    "ドイツ": "Germany",
    "ドイツ南部": "Southern-Germany",
    "ナチズム": "Nazism",
    "ナミビア": "Namibia",
    "ニカラグア": "Nicaragua",
    "ニューカッスル": "Tyneside",
    "ニュージーランド": "New-Zealand",
    "ヌオロ": "Nuorese",
    "バレンシア": "Valencia",
    "パナマ": "Panama",
    "ヒンズー教": "Hinduism",
    "ビザンツ": "Byzantine",
    "ビバロ・アルピーネ語": "Vivaro-Alpine",
    "フィリピン": "Philippines",
    "フェリング・エームラング": "Föhr-Amrum",
    "フォルマッツァ語": "Formazza",
    "フランス": "French",
    "ブラジル": "Brazil",
    "プロテスタント": "Protestant",
    "プロヴァンス": "Provençal",
    "プーター": "Puter",
    "ヘルゴラント": "Helgoland",
    "ベネズエラ": "Venezuela",
    "ベルギー": "Belgium",
    "ペルー": "Peru",
    "ホンジュラス": "Honduras",
    "ボスニア": "Bosnian",
    "ボリビア": "Bolivia",
    "ポルトガル": "Portugal",
    "マンスター": "Munster",
    "マーシア": "Mercian",
    "ミストラル式綴り": "Mistralian",
    "メキシコ": "Mexico",
    "モンテネグロ": "Montenegro",
    "モーリング": "Mooring",
    "ユダヤ教": "Judaism",
    "ヨーロッパ": "Europe",
    "ラングドック": "Languedoc",
    "リヒテンシュタイン": "Liechtenstein",
    "リプアーリ語": "Ripuarian",
    "リムーザン": "Limousin",
    "リメッラ語": "Rimella",
    "ルイジアナ": "Louisiana",
    "ルゼルナ": "Luserna",
    "ログドーロ": "Logudorese",
    "ロシア": "Russia",
    "ヴァラダール": "Vallander",
    "ヴィーディングハルデ": "Wiedingharde",
    "不可算": "uncountable",
    "不変化名詞": "indeclinable",
    "不活動体": "inanimate",
    "他動詞": "transitive",
    "代名詞的用法": "pronominal",
    "俗語": "slang",
    "修辞学": "rhetoric",
    "倒語": "slang",
    "再帰動詞": "reflexive",
    "初期中英語": "Early-Middle-English",
    "助動詞": "auxiliary",
    "卑語": "vulgar",
    "単数形で": "singular",
    "単数形のみ": ["singular-only", "singular"],
    "印": "India",
    "叙法": "modal",
    "叙述用法のみ": "predicative",
    "口語": "informal",
    "古用法": "dated",
    "古語・廃語": "archaic",
    "可算": "countable",
    "地名": "place",
    "地域": "regional",
    "基数": "cardinal",
    "多文化的ロンドン英語": "Multicultural-London-English",
    "婉曲表現": "euphemistic",
    "幼児語": "childish",
    "序数": "ordinal",
    "廃語": "obsolete",
    "強い": "strong",
    "形容詞的": "attributive",
    "後期中英語": "Late-Middle-English",
    "恐らく": "possibly",
    "慣用的表現": "idiomatic",
    "排斥された語": "proscribed",
    "控えめに": "mildly",
    "換喩的に": "metonymically",
    "文章語": "literary",
    "方言": "dialectal",
    "時々": "sometimes",
    "欠如動詞": "defective",
    "正式・堅": "formal",
    "歴史": "historical",
    "比喩": "figuratively",
    "比喩的に": "figuratively",
    "比較形有り": "comparable",
    "比較形無し": "not-comparable",
    "活動体": "animate",
    "滑稽": "humorous",
    "特に": "especially",
    "状態動詞": "stative",
    "略語": "abbreviation",
    "疑問詞": "interrogative",
    "皮肉": "ironic",
    "破格": "nonstandard",
    "筋肉": "anatomy",
    "米語": "US",
    "絶対単数": ["singular-only", "singular"],
    "絶対複数": ["plural-only", "plural"],
    "能格動詞": "ergative",
    "自他動詞": "ambitransitive",
    "自動詞": "intransitive",
    "英連邦": "Commonwealth",
    "蔑称": "offensive",
    "複合語で": "in-compounds",
    "複数形で": "plural",
    "西部": "Western",
    "視覚方言": "pronunciation-spelling",
    "詩的表現": "poetic",
    "豪": "Australian",
    "転じて": "broadly",
    "軽侮語": "pejorative",
    "近代ラテン語": "New-Latin",
    "逐語的に": "literally",
    "通常": "usually",
    "通常複数形で": "plural-normally",
    "造語": "neologism",
    "関係詞": "relative",
    "限定": "definite",
    "集合名詞": "collective",
    "集合的に": "collective",
    "非人称": "impersonal",
    "非標準": "nonstandard",
    "頭字語": "initialism",
    # NOTE(review): 首都 literally means "capital city"; confirm that
    # "uppercase" is really the intended tag for this label.
    "首都": "uppercase",
    # "en-verb" template
    "三単現": ["third-person", "singular", "present"],
    "現在分詞": ["present", "participle"],
    "過去形": "past",
    "過去分詞": ["past", "participle"],
    # Chinese script variants (繁 = traditional, 簡 = simplified)
    "繁": "Traditional-Chinese",
    "簡": "Simplified-Chinese",
    # zh sound
    "ピンイン": "Pinyin",
    "注音符号": "Bopomofo",
    "広東語": "Cantonese",
    "閩南語": "Min-Nan",
    "客家語": "Hakka",
}
# Maps a raw Japanese subject label to its English topic name.  Unlike TAGS,
# every value is a single string and may contain spaces.
TOPICS = {
    # Labels from the Japanese Wiktionary template "テンプレート:context/data"
    "BDSM": "BDSM",
    "LGBT": "LGBT",
    "SF": "science fiction",
    "アイスホッケー": "ice hockey",
    "アメリカンフットボール": "American football",
    "アーチェリー": "archery",
    "イスラム教": "Islam",
    "イデオロギー": "ideology",
    "ウイルス学": "virology",
    "エネルギー": "energy",
    "カトリック": "Catholicism",
    "カードゲーム": "cards",
    "カーリング": "curling",
    "キリスト教": "Christianity",
    "クリケット": "cricket",
    "グラフィカルユーザインタフェース": "graphical user interface",
    "グラフ理論": "graph theory",
    "コンピュータグラフィックス": "computer graphics",
    "ゴルフ": "golf",
    "サイクリング": "cycling",
    "サッカー": "soccer",
    "サーフィン": "surfing",
    "シャンチー": "xiangqi",
    "スカッシュ": "squash",
    "スキー": "skiing",
    "スケート": "skating",
    "スケートボード": "skateboarding",
    "スヌーカー": "snooker",
    "スノーボード": "snowboarding",
    "スポーツ": "sports",
    "ソフトウェア": "software",
    "ソフトボール": "softball",
    "ゾロアスター教": "Zoroastrianism",
    "ダンス": "dance",
    "ダーツ": "darts",
    "チアリーディング": "cheerleading",
    "チェス": "chess",
    "テニス": "tennis",
    "テレビ": "television",
    "ハンドボール": "handball",
    "ハードウェア": "computer hardware",
    "バスケットボール": "basketball",
    "バレーボール": "volleyball",
    "ビジネス": "business",
    "ビリヤード": "billiards",
    "ファシズム": "fascism",
    "ファッション": "fashion",
    "フェミニズム": "feminism",
    "フェンシング": "fencing",
    "フットボール": "football",
    "ブリッジ": "bridge",
    "プログラミング": "programming",
    "ボウリング": "bowling",
    "ボクシング": "boxing",
    "ボディビル": "bodybuilding",
    "ボート競技": "rowing",
    "ポーカー": "poker",
    "モータースポーツ": "motor racing",
    "ラクロス": "lacrosse",
    "ラグビー": "rugby",
    "レスリング": "wrestling",
    "交通": "transport",
    "人口学": "demography",
    "仏教": "Buddhism",
    "代数学": "algebra",
    "代数幾何学": "algebraic geometry",
    "会計": "accounting",
    "体操": "gymnastics",
    "保険": "insurance",
    "倫理学": "ethics",
    "光学": "optics",
    "免疫学": "immunology",
    "共産主義": "communism",
    "写真": "photography",
    "分類学": "taxonomy",
    "力学": "mechanics",
    "動物学": "zoology",
    "化学": "chemistry",
    "化粧品": "cosmetics",
    "医学": "medicine",
    "医療": "healthcare",
    "単位": "units of measure",
    "占星術": "astrology",
    "印刷": "printing",
    "古生物学": "paleontology",
    "哲学": "philosophy",
    "哺乳類学": "mammalogy",
    "商取引": "trading",
    "園芸": "horticulture",
    "地理": "geography",
    "地質学": "geology",
    "地震学": "seismology",
    "外科学": "surgery",
    "大工仕事": "carpentry",
    "天体物理学": "astrophysics",
    "天文学": "astronomy",
    "娯楽": "entertainment",
    "季節": "seasons",
    "宗教": "religion",
    "宝飾": "jewelry",
    "家具": "furniture",
    "寄生虫学": "parasitology",
    "将棋": "shogi",
    "岩石学": "petrology",
    "工学": "engineering",
    "幾何学": "geometry",
    "建築": "architecture",
    "微生物学": "microbiology",
    "心理学": "psychology",
    "性": "sexuality",
    "性行為": "sex",
    "情報学": "information science",
    "情報技術": "computing",
    "戦争": "war",
    "技術": "technology",
    "政府": "government",
    "政治": "politics",
    "教育": "education",
    "数学": "mathematics",
    "数論": "number theory",
    "文学": "literature",
    "文法": "grammar",
    "文献学": "philology",
    "料理": "cuisine",
    "旅行": "travel",
    "昆虫学": "entomology",
    "時間": "time",
    "有機化学": "organic chemistry",
    "林業": "forestry",
    "柔道": "judo",
    "植物学": "botany",
    "武器": "weapon",
    "歯学": "dentistry",
    "歴史": "history",
    "歴史学": "historiography",
    "気候": "climatology",
    "気象": "weather",
    "水泳": "swimming",
    "泌尿器科学": "urology",
    "法律": "legal",
    "活版印刷": "typography",
    "流体力学": "fluid dynamics",
    "海事": "nautical",
    "海洋学": "oceanography",
    "消防": "firefighting",
    "火器": "firearms",
    "火山学": "volcanology",
    "無機化学": "inorganic chemistry",
    "熱力学": "thermodynamics",
    "物理学": "physics",
    "犯罪学": "criminology",
    "狩猟": "hunting",
    "生化学": "biochemistry",
    "生態学": "ecology",
    "生物学": "biology",
    "生理学": "physiology",
    "疑似科学": "pseudoscience",
    "疫学": "epidemiology",
    "病理学": "pathology",
    "発生学": "embryology",
    "相撲": "sumo",
    "眼科学": "ophthalmology",
    "社会学": "sociology",
    "社会科学": "social science",
    "社会言語学": "sociolinguistics",
    "神学": "theology",
    "神経学": "neurology",
    "神経解剖学": "neuroanatomy",
    "神話": "mythology",
    "神道": "Shinto",
    "科学": "sciences",
    "競馬": "horse racing",
    "精神医学": "psychiatry",
    "紋章学": "heraldry",
    "紡績": "weaving",
    "細胞学": "cytology",
    "細菌学": "bacteriology",
    "経営学": "management",
    "経済": "economics",
    "統計学": "statistics",
    "線型代数学": "linear algebra",
    "翻訳研究": "translation studies",
    "老年学": "gerontology",
    "考古学": "archaeology",
    "肉": "meat",
    "腫瘍学": "oncology",
    "自動車": "automobile",
    "自動車機器": "automotive",
    "航空": "aviation",
    "航空工学": "aeronautics",
    "色": "color",
    "花粉学": "palynology",
    "芸術": "arts",
    "著作権": "copyright",
    "薬理学": "pharmacology",
    "藻類学": "phycology",
    # 蠍 means "scorpion"; it previously carried the value of the 酒 entry.
    "蠍": "scorpions",
    "血液学": "hematology",
    "衣類": "clothing",
    "製造": "manufacturing",
    "解剖学": "anatomy",
    "解析学": "mathematical analysis",
    "言語": "language",
    "言語学": "linguistics",
    "詩": "poetry",
    "語彙論": "lexicology",
    "語用論": "pragmatics",
    "調理": "cooking",
    "論理学": "logic",
    "資本主義": "capitalism",
    "超心理学": "parapsychology",
    "軍事": "military",
    "辞書学": "lexicography",
    "農業": "agriculture",
    "通貨": "numismatics",
    "運動": "exercise",
    "道路": "road",
    "遺伝学": "genetics",
    "都市": "city",
    "都道府県": "prefectures of Japan",
    "酒": "beer",
    "重量挙げ": "weightlifting",
    "野球": "baseball",
    "野菜": "vegetable",
    "金融": "finance",
    "釣り": "fishing",
    "鉄道": "rail transport",
    "鉱物学": "mineralogy",
    "陸上競技": "athletics",
    "集合論": "set theory",
    "電子工学": "electronics",
    "電気": "electricity",
    "電磁気学": "electromagnetism",
    "電話": "telephone",
    "音声学": "phonetics",
    "音楽": "music",
    "音韻論": "phonology",
    "韻律": "prosody",
    "食品": "food",
    "馬術": "equestrianism",
    "魚": "fish",
    "魚類学": "ichthyology",
    "鳥類学": "ornithology",
    "麻雀": "mahjong",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Translate the entries of ``data.raw_tags`` in place.

    Each raw tag is handled by the first rule that matches:

    1. It is a key of ``TAGS``: its translation is added to ``data.tags``
       through ``add_tag``.
    2. It contains ``/``: it is split on ``/`` and every stripped part is
       looked up in ``TAGS``.  Parts without a translation are kept as raw
       tags instead of being discarded.
    3. It is a key of ``TOPICS`` and ``data`` has a ``topics`` attribute:
       the topic is added to ``data.topics`` unless already present.
    4. Otherwise it is kept unchanged.

    Afterwards ``data.raw_tags`` holds only the tags that were kept.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if raw_tag in TAGS:
            add_tag(raw_tag, data)
        elif "/" in raw_tag:
            for part in raw_tag.split("/"):
                part = part.strip()
                if part in TAGS:
                    add_tag(part, data)
                elif part and part not in untranslated:
                    # Keep the unknown piece so no information is lost;
                    # empty pieces (e.g. from "a//b") carry nothing.
                    untranslated.append(part)
        elif raw_tag in TOPICS and hasattr(data, "topics"):
            topic = TOPICS[raw_tag]
            # Mirror add_tag(): never record the same value twice.
            if topic not in data.topics:
                data.topics.append(topic)
        else:
            untranslated.append(raw_tag)
    data.raw_tags = untranslated
def add_tag(raw_tag: str, data: WordEntry) -> None:
    """Append the translation(s) of ``raw_tag`` to ``data.tags``.

    ``raw_tag`` must be a key of ``TAGS`` (a missing key raises
    ``KeyError``).  The mapped value may be one tag or a list of tags;
    each is appended only if ``data.tags`` does not already contain it.
    """
    translated = TAGS[raw_tag]
    if isinstance(translated, str):
        candidates = [translated]
    elif isinstance(translated, list):
        candidates = translated
    else:
        # Any other value shape is ignored.
        return
    for tag in candidates:
        if tag not in data.tags:
            data.tags.append(tag)