Coverage for src/wiktextract/extractor/ja/tags.py: 96%
25 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1from .models import WordEntry
# Translation table from raw labels found on Japanese Wiktionary pages to
# the tag(s) stored in an entry's ``tags`` list.
#
# A value is either a single tag string or a list of tag strings.
# ``add_tag`` appends a string value as ONE tag, so a label that expands
# to several tags must use a list, never a space-separated string.
TAGS = {
    "男性": "masculine",
    "女性": "feminine",
    "通性": "common",
    "中性": "neuter",
    "単数": "singular",
    "複数": "plural",
    "不変": "invariable",
    "男性複数": ["masculine", "plural"],
    "女性複数": ["feminine", "plural"],
    # Template:context/data
    "くだけた表現": "informal",
    "しばしば": "often",
    "まれ": "rare",
    "アイルランド": "Ireland",
    "アフリカ": "Africa",
    "アメカメカ": "Amecameca",
    "アメリカ合衆国": "US",
    "アルスター": "Ulster",
    "アルゼンチン": "Argentina",
    "アングロ・ノルマン": "Anglo-Norman",
    "アンダルシア": "Andalusia",
    "イェ方言": "Ijekavian",
    "イギリス": "UK",
    "イラン": "Iran",
    "イロン方言": "Iron",
    "インターネット": "Internet",
    "インターネットスラング": "Internet",
    "ウクライナ": "Ukraine",
    "ウルグアイ": "Uruguay",
    "ウーリ語": "Uri",
    "エクアドル": "Ecuador",
    "エ方言": "Ekavian",
    "オノマトペ": "onomatopoeic",
    "オルムルム": "Ormulum",
    "オークニー": "Orkney",
    "オーストリア": "Austrian",
    "カイピラ方言": "Caipira",
    "カイ方言": "Kajkavian",
    "カサレヴサ": "Katharevousa",
    "カナダ": "Canada",
    "カルコーフォロ語": "Carcoforo",
    "ガスコーニュ": "Gascony",
    "ガーンジー": "Guernsey",
    "キューバ": "Cuba",
    "キリスト教": "Christian",
    "クロアチア": "Croatian",
    "グアテマラ": "Guatemala",
    "グレッソネイ語": "Gressoney",
    "ケニア": "Kenya",
    "ケベック": "Quebec",
    "ケルン語": "Kölsch",
    "ケント": "Kentish",
    "コイネー": "Koine",
    "コノート": "Connacht",
    "コロンビア": "Colombia",
    "コンゴ": "Congo",
    "サカティアングイス": "Zacatianguis",
    "サーク": "Sark",
    "シェットランド": "Shetland",
    "シク教": "Sikhism",
    "シンガポール": "Singapore",
    "ジャイナ教": "Jainism",
    "ジャマイカ": "Jamaica",
    "ジャージー": "Jersey",
    "ジンバブエ": "Zimbabwe",
    "スイス": "Switzerland",
    "スコットランド": "Scotland",
    "ストゥシルヴァン": "Sutsilvan",
    "スペイン": "Spain",
    "スルシルヴァン": "Sursilvan",
    "スルミラン": "Surmiran",
    "セルビア": "Serbian",
    "セルリング": "Sylt",
    "タイ英語": "Thailand",
    "タラシケヴィツァ": "Taraškievica",
    "タントユカ": "Tantoyuca",
    "ダリー語": "Dari",
    "チコナメル": "Chiconamel",
    "チコンテペク": "Chicontepec",
    "チャ方言": "Chakavian",
    "チリ": "Chile",
    "ティマウ": "Timau",
    "テペツィントラ": "Tepetzintla",
    "テマパチェ": "Temapache",
    "ディゴル方言": "Digor",
    "デモティキ": "Demotic",
    "トスカナ語": "Tuscany",
    "ドイツ": "Germany",
    "ドイツ南部": "Southern-Germany",
    "ナチズム": "Nazism",
    "ナミビア": "Namibia",
    "ニカラグア": "Nicaragua",
    "ニューカッスル": "Tyneside",
    "ニュージーランド": "New-Zealand",
    "ヌオロ": "Nuorese",
    "バレンシア": "Valencia",
    "パナマ": "Panama",
    "ヒンズー教": "Hinduism",
    "ビザンツ": "Byzantine",
    "ビバロ・アルピーネ語": "Vivaro-Alpine",
    "フィリピン": "Philippines",
    "フェリング・エームラング": "Föhr-Amrum",
    "フォルマッツァ語": "Formazza",
    "フランス": "French",
    "ブラジル": "Brazil",
    "プロテスタント": "Protestant",
    "プロヴァンス": "Provençal",
    "プーター": "Puter",
    "ヘルゴラント": "Helgoland",
    "ベネズエラ": "Venezuela",
    "ベルギー": "Belgium",
    "ペルー": "Peru",
    "ホンジュラス": "Honduras",
    "ボスニア": "Bosnian",
    "ボリビア": "Bolivia",
    "ポルトガル": "Portugal",
    "マンスター": "Munster",
    "マーシア": "Mercian",
    "ミストラル式綴り": "Mistralian",
    "メキシコ": "Mexico",
    "モンテネグロ": "Montenegro",
    "モーリング": "Mooring",
    "ユダヤ教": "Judaism",
    "ヨーロッパ": "Europe",
    "ラングドック": "Languedoc",
    "リヒテンシュタイン": "Liechtenstein",
    "リプアーリ語": "Ripuarian",
    "リムーザン": "Limousin",
    "リメッラ語": "Rimella",
    "ルイジアナ": "Louisiana",
    "ルゼルナ": "Luserna",
    "ログドーロ": "Logudorese",
    "ロシア": "Russia",
    "ヴァラダール": "Vallander",
    "ヴィーディングハルデ": "Wiedingharde",
    "不可算": "uncountable",
    "不変化名詞": "indeclinable",
    "不活動体": "inanimate",
    "他動詞": "transitive",
    "代名詞的用法": "pronominal",
    "俗語": "slang",
    "修辞学": "rhetoric",
    "倒語": "slang",
    "再帰動詞": "reflexive",
    "初期中英語": "Early-Middle-English",
    "助動詞": "auxiliary",
    "卑語": "vulgar",
    "単数形で": "singular",
    # Was one space-separated string; split into a list (duplicate
    # "singular" dropped) so each tag is stored on its own.
    "単数形のみ": ["singular", "singular-only"],
    "印": "India",
    "叙法": "modal",
    "叙述用法のみ": "predicative",
    "口語": "informal",
    "古用法": "dated",
    "古語・廃語": "archaic",
    "可算": "countable",
    "地名": "place",
    "地域": "regional",
    "基数": "cardinal",
    "多文化的ロンドン英語": "Multicultural-London-English",
    "婉曲表現": "euphemistic",
    "幼児語": "childish",
    "序数": "ordinal",
    "廃語": "obsolete",
    "強い": "strong",
    "形容詞的": "attributive",
    "後期中英語": "Late-Middle-English",
    "恐らく": "possibly",
    "慣用的表現": "idiomatic",
    "排斥された語": "proscribed",
    "控えめに": "mildly",
    "換喩的に": "metonymically",
    "文章語": "literary",
    "方言": "dialectal",
    "時々": "sometimes",
    "欠如動詞": "defective",
    "正式・堅": "formal",
    "歴史": "historical",
    "比喩": "figuratively",
    "比喩的に": "figuratively",
    "比較形有り": "comparable",
    "比較形無し": "not-comparable",
    "活動体": "animate",
    "滑稽": "humorous",
    "特に": "especially",
    "状態動詞": "stative",
    "略語": "abbreviation",
    "疑問詞": "interrogative",
    "皮肉": "ironic",
    "破格": "nonstandard",
    "筋肉": "anatomy",
    "米語": "US",
    # Were space-separated strings; split into lists so each tag is
    # stored on its own.
    "絶対単数": ["singular-only", "singular"],
    "絶対複数": ["plural-only", "plural"],
    "能格動詞": "ergative",
    "自他動詞": "ambitransitive",
    "自動詞": "intransitive",
    "英連邦": "Commonwealth",
    "蔑称": "offensive",
    "複合語で": "in-compounds",
    "複数形で": "plural",
    "西部": "Western",
    "視覚方言": "pronunciation-spelling",
    "詩的表現": "poetic",
    "豪": "Australian",
    "転じて": "broadly",
    "軽侮語": "pejorative",
    # NOTE(review): 近代ラテン語 literally reads "New Latin"; confirm that
    # "Netherlands" is the intended tag.
    "近代ラテン語": "Netherlands",
    "逐語的に": "literally",
    "通常": "usually",
    "通常複数形で": "plural-normally",
    "造語": "neologism",
    "関係詞": "relative",
    "限定": "definite",
    "集合名詞": "collective",
    "集合的に": "collective",
    "非人称": "impersonal",
    "人称": "personal",
    "非標準": "uncommon",
    "頭字語": "initialism",
    # NOTE(review): 首都 literally reads "capital city"; confirm that
    # "uppercase" is the intended tag.
    "首都": "uppercase",
    # "en-verb" template
    "三単現": ["third-person", "singular", "present"],
    "現在分詞": ["present", "participle"],
    "過去形": "past",
    "過去分詞": ["past", "participle"],
    "繁": "Traditional-Chinese",
    "簡": "Simplified-Chinese",
    # zh sound
    "標準中国語": "Standard-Chinese",
    "ピンイン": "Pinyin",
    "注音符号": "Bopomofo",
    "ウェード式": "Wade–Giles",
    "IPA": "IPA",
    "広東語": "Cantonese",
    "改イェール式": ["Yale", "romanization", "Cantonese"],
    "イェール式": "Yale",
    "粤拼": "Jyutping",
    "教院式": ["ILE", "romanization", "Cantonese"],
    "広東拼音方案": "Guangdong-Romanization",
    "台山語": "Taishanese",
    "閩南語": "Min-Nan",
    "漳州": "Zhangzhou",
    "漳浦": "Zhangpu",
    "高雄": "Kaohsiung",
    "ペナン州": "Penang",
    "白話字": "POJ",
    "台湾ローマ字": "Tai-lo",
    "普実台文": "Phofsit-Daibuun",
    "廈門": "Xiamen",
    "泉州": "Quanzhou",
    "台北": "Taipei",
    "潮州語": "Teochew",
    "莆仙語": "Puxian-Min",
    "客家語": "Hakka",
    "呉語": "Wu",
    "晋語": "Jin",
    "ドンガン語": "Dungan",
    # Module:gender and number
    "非有生": "inanimate",
    "有生": "animate",
    "男性人間": "virile",
    "非男性人間": "nonvirile",
    # Template:ru-noun+
    "生格": "genitive",
    "複数主格": ["nominative", "plural"],
    "複数生格": ["genitive", "plural"],
    "形容詞": ["relational", "adjective"],
    "指小形": "diminutive",
    "不完了体": "imperfective",
    "完了体": "perfective",
    # Japanese conjugation-table templates (page titles):
    # 日本語ダ活用, 日本語サ変活用
    "未然形": "imperfective",
    "連用形": "continuative",
    "終止形": "terminal",
    "連体形": "attributive",
    "仮定形": "hypothetical",
    "命令形": "imperative",
    "命令": "imperative",
    "過去・完了": ["past", "completive"],
    "過去・完了・状態": ["past", "completive"],
    "否定形": "negative",
    "否定": "negative",
    "否定(古風)": ["negative", "archaic"],
    "自動詞化": "intransitive",
    "言い切り": "definitive",
    "名詞化": "noun-from-verb",
    "自発・受身\n可能・尊敬": ["active", "passive", "possibly", "honorific"],
    "丁寧": "polite",
    "東京式": "Tokyo",
    "京阪式": ["Kyoto", "Osaka"],
    # Template:hu-noun
    "複数・主格": "plural",
    "・主格": "nominative",
    "単数対格": ["singular", "accusative"],
    "複数対格": ["plural", "accusative"],
    "属格": "genitive",
    "ラテン文字": "romanization",
    "キリル文字": "Cyrillic",
    # Template:de-noun
    "複数形無し": "no-plural",
    "複数形": "plural",
    "指小辞無し": "no-diminutive",
    "指小辞": "diminutive",
    "廃用": "obsolete",
}
# Translation table from raw labels to topic names appended to an entry's
# ``topics`` list. Every value is a single topic string.
#
# ``translate_raw_tags`` consults ``TAGS`` before this table, so a label
# present in both tables is handled as a tag, not as a topic.
TOPICS = {
    # Template:context/data
    "BDSM": "BDSM",
    "LGBT": "LGBT",
    "SF": "science-fiction",
    "アイスホッケー": "ice-hockey",
    "アメリカンフットボール": "American-football",
    "アーチェリー": "archery",
    "イスラム教": "Islam",
    "イデオロギー": "ideology",
    "ウイルス学": "virology",
    "エネルギー": "energy",
    "カトリック": "Catholicism",
    "カードゲーム": "cards",
    "カーリング": "curling",
    "キリスト教": "Christianity",
    "クリケット": "cricket",
    "グラフィカルユーザインタフェース": "graphical-user-interface",
    "グラフ理論": "graph-theory",
    "コンピュータグラフィックス": "computer-graphics",
    "ゴルフ": "golf",
    "サイクリング": "cycling",
    "サッカー": "soccer",
    "サーフィン": "surfing",
    "シャンチー": "xiangqi",
    "スカッシュ": "squash",
    "スキー": "skiing",
    "スケート": "skating",
    "スケートボード": "skateboarding",
    "スヌーカー": "snooker",
    "スノーボード": "snowboarding",
    "スポーツ": "sports",
    "ソフトウェア": "software",
    "ソフトボール": "softball",
    "ゾロアスター教": "Zoroastrianism",
    "ダンス": "dance",
    "ダーツ": "darts",
    "チアリーディング": "cheerleading",
    "チェス": "chess",
    "テニス": "tennis",
    "テレビ": "television",
    "ハンドボール": "handball",
    "ハードウェア": "computer-hardware",
    "バスケットボール": "basketball",
    "バレーボール": "volleyball",
    "ビジネス": "business",
    "ビリヤード": "billiards",
    "ファシズム": "fascism",
    "ファッション": "fashion",
    "フェミニズム": "feminism",
    "フェンシング": "fencing",
    "フットボール": "football",
    "ブリッジ": "bridge",
    "プログラミング": "programming",
    "ボウリング": "bowling",
    "ボクシング": "boxing",
    "ボディビル": "bodybuilding",
    "ボート競技": "rowing",
    "ポーカー": "poker",
    "モータースポーツ": "motor-racing",
    "ラクロス": "lacrosse",
    "ラグビー": "rugby",
    "レスリング": "wrestling",
    "交通": "transport",
    "人口学": "demography",
    "仏教": "Buddhism",
    "代数学": "algebra",
    "代数幾何学": "algebraic-geometry",
    "会計": "accounting",
    "体操": "gymnastics",
    "保険": "insurance",
    "倫理学": "ethics",
    "光学": "optics",
    "免疫学": "immunology",
    "共産主義": "communism",
    "写真": "photography",
    "分類学": "taxonomy",
    "力学": "mechanics",
    "動物学": "zoology",
    "化学": "chemistry",
    "化粧品": "cosmetics",
    "医学": "medicine",
    "医療": "healthcare",
    "単位": "units-of-measure",
    "占星術": "astrology",
    "印刷": "printing",
    "古生物学": "paleontology",
    "哲学": "philosophy",
    "哺乳類学": "mammalogy",
    "商取引": "trading",
    "園芸": "horticulture",
    "地理": "geography",
    "地質学": "geology",
    "地震学": "seismology",
    "外科学": "surgery",
    "大工仕事": "carpentry",
    "天体物理学": "astrophysics",
    "天文学": "astronomy",
    "娯楽": "entertainment",
    "季節": "seasons",
    "宗教": "religion",
    "宝飾": "jewelry",
    "家具": "furniture",
    "寄生虫学": "parasitology",
    "将棋": "shogi",
    "岩石学": "petrology",
    "工学": "engineering",
    "幾何学": "geometry",
    "建築": "architecture",
    "微生物学": "microbiology",
    "心理学": "psychology",
    "性": "sexuality",
    "性行為": "sex",
    "情報学": "information-science",
    "情報技術": "computing",
    "戦争": "war",
    "技術": "technology",
    "政府": "government",
    "政治": "politics",
    "教育": "education",
    "数学": "mathematics",
    "数論": "number-theory",
    "文学": "literature",
    "文法": "grammar",
    "文献学": "philology",
    "料理": "cuisine",
    "旅行": "travel",
    "昆虫学": "entomology",
    "時間": "time",
    "有機化学": "organic-chemistry",
    "林業": "forestry",
    "柔道": "judo",
    "植物学": "botany",
    "武器": "weapon",
    "歯学": "dentistry",
    "歴史": "history",
    "歴史学": "historiography",
    "気候": "climatology",
    "気象": "weather",
    "水泳": "swimming",
    "泌尿器科学": "urology",
    "法律": "legal",
    "活版印刷": "typography",
    "流体力学": "fluid-dynamics",
    "海事": "nautical",
    "海洋学": "oceanography",
    "消防": "firefighting",
    "火器": "firearms",
    "火山学": "volcanology",
    "無機化学": "inorganic-chemistry",
    "熱力学": "thermodynamics",
    "物理学": "physics",
    "犯罪学": "criminology",
    "狩猟": "hunting",
    "生化学": "biochemistry",
    "生態学": "ecology",
    "生物学": "biology",
    "生理学": "physiology",
    "疑似科学": "pseudoscience",
    "疫学": "epidemiology",
    "病理学": "pathology",
    "発生学": "embryology",
    "相撲": "sumo",
    "眼科学": "ophthalmology",
    "社会学": "sociology",
    "社会科学": "social-science",
    "社会言語学": "sociolinguistics",
    "神学": "theology",
    "神経学": "neurology",
    "神経解剖学": "neuroanatomy",
    "神話": "mythology",
    "神道": "Shinto",
    "科学": "sciences",
    "競馬": "horse-racing",
    "精神医学": "psychiatry",
    "紋章学": "heraldry",
    "紡績": "weaving",
    "細胞学": "cytology",
    "細菌学": "bacteriology",
    "経営学": "management",
    "経済": "economics",
    "統計学": "statistics",
    "線型代数学": "linear-algebra",
    "翻訳研究": "translation-studies",
    "老年学": "gerontology",
    "考古学": "archaeology",
    "肉": "meat",
    "腫瘍学": "oncology",
    "自動車": "automobile",
    "自動車機器": "automotive",
    "航空": "aviation",
    "航空工学": "aeronautics",
    "色": "color",
    "花粉学": "palynology",
    "芸術": "arts",
    "著作権": "copyright",
    "薬理学": "pharmacology",
    "藻類学": "phycology",
    # NOTE(review): 蠍 literally reads "scorpion"; confirm that "beer" is
    # the intended topic.
    "蠍": "beer",
    "血液学": "hematology",
    "衣類": "clothing",
    "製造": "manufacturing",
    "解剖学": "anatomy",
    "解析学": "mathematical-analysis",
    "言語": "language",
    "言語学": "linguistics",
    "詩": "poetry",
    "語彙論": "lexicology",
    "語用論": "pragmatics",
    "調理": "cooking",
    "論理学": "logic",
    "資本主義": "capitalism",
    "超心理学": "parapsychology",
    "軍事": "military",
    "辞書学": "lexicography",
    "農業": "agriculture",
    "通貨": "numismatics",
    "運動": "exercise",
    "道路": "road",
    "遺伝学": "genetics",
    "都市": "city",
    "都道府県": "prefectures-of-Japan",
    "酒": "beer",
    "重量挙げ": "weightlifting",
    "野球": "baseball",
    "野菜": "vegetable",
    "金融": "finance",
    "釣り": "fishing",
    "鉄道": "rail-transport",
    "鉱物学": "mineralogy",
    "陸上競技": "athletics",
    "集合論": "set-theory",
    "電子工学": "electronics",
    "電気": "electricity",
    "電磁気学": "electromagnetism",
    "電話": "telephone",
    "音声学": "phonetics",
    "音楽": "music",
    "音韻論": "phonology",
    "韻律": "prosody",
    "食品": "food",
    "馬術": "equestrianism",
    "魚": "fish",
    "魚類学": "ichthyology",
    "鳥類学": "ornithology",
    "麻雀": "mahjong",
    "演算": "arithmetic",
    "ゲーム": "games",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Translate ``data.raw_tags`` into tags and topics, in place.

    Each raw tag is resolved as follows:

    * If it is a key of ``TAGS``, its translation is added to
      ``data.tags`` through ``add_tag``.
    * Otherwise, if it contains ``"/"``, it is split on ``"/"`` and each
      whitespace-stripped, non-empty part is resolved on its own.
    * If it is a key of ``TOPICS`` and ``data`` has a ``topics``
      attribute, the topic is appended to ``data.topics``.
    * Anything that cannot be resolved is kept in ``data.raw_tags``.

    Previously, parts of a ``"/"``-separated label that were not in
    ``TAGS`` were dropped silently; they are now either resolved as
    topics or preserved in ``data.raw_tags`` so no label text is lost.

    Args:
        data: Object with ``raw_tags`` and ``tags`` lists and, optionally,
            a ``topics`` list. ``data.raw_tags`` is replaced by a new list
            holding only the unresolved labels.
    """
    has_topics = hasattr(data, "topics")
    unresolved = []
    for raw_tag in data.raw_tags:
        # Prefer the label as written; only fall back to its "/"-separated
        # parts when the whole label is not a known tag.
        if raw_tag in TAGS or "/" not in raw_tag:
            candidates = [raw_tag]
        else:
            stripped_parts = (part.strip() for part in raw_tag.split("/"))
            candidates = [part for part in stripped_parts if part]

        for candidate in candidates:
            if candidate in TAGS:
                add_tag(candidate, data)
            elif has_topics and candidate in TOPICS:
                data.topics.append(TOPICS[candidate])
            else:
                unresolved.append(candidate)
    data.raw_tags = unresolved
def add_tag(raw_tag: str, data: WordEntry) -> None:
    """Append the translation of ``raw_tag`` to ``data.tags`` without duplicates.

    ``TAGS[raw_tag]`` may be a single tag string or a list of tag strings;
    every tag not already present in ``data.tags`` is appended, in order.

    Args:
        raw_tag: A key of ``TAGS``.
        data: Object whose ``tags`` list is extended in place.

    Raises:
        KeyError: If ``raw_tag`` is not a key of ``TAGS``.
    """
    translated = TAGS[raw_tag]
    # Normalise both value shapes to one list so a single loop handles them.
    if isinstance(translated, str):
        candidates = [translated]
    elif isinstance(translated, list):
        candidates = translated
    else:
        candidates = []

    for tag in candidates:
        if tag not in data.tags:
            data.tags.append(tag)