Coverage for src/wiktextract/extractor/ja/tags.py: 96%
25 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from .models import WordEntry
# Maps a Japanese label (as it appears in ``raw_tags``) to its English tag.
# A value is either a single tag string or a list of tags when one label
# implies several tags; ``add_tag`` handles both shapes.  Individual tags
# never contain spaces: multi-word tags are hyphenated, and multiple tags
# are expressed as a list.
TAGS = {
    "男性": "masculine",
    "女性": "feminine",
    "通性": "common",
    "中性": "neuter",
    "単数": "singular",
    "複数": "plural",
    "不変": "invariable",
    "男性複数": ["masculine", "plural"],
    "女性複数": ["feminine", "plural"],
    # Labels from Template:context/data (テンプレート:context/data)
    "くだけた表現": "informal",
    "しばしば": "often",
    "まれ": "rare",
    "アイルランド": "Ireland",
    "アフリカ": "Africa",
    "アメカメカ": "Amecameca",
    "アメリカ合衆国": "US",
    "アルスター": "Ulster",
    "アルゼンチン": "Argentina",
    "アングロ・ノルマン": "Anglo-Norman",
    "アンダルシア": "Andalusia",
    "イェ方言": "Ijekavian",
    "イギリス": "UK",
    "イラン": "Iran",
    "イロン方言": "Iron",
    "インターネット": "Internet",
    "インターネットスラング": "Internet",
    "ウクライナ": "Ukraine",
    "ウルグアイ": "Uruguay",
    "ウーリ語": "Uri",
    "エクアドル": "Ecuador",
    "エ方言": "Ekavian",
    "オノマトペ": "onomatopoeic",
    "オルムルム": "Ormulum",
    "オークニー": "Orkney",
    "オーストリア": "Austrian",
    "カイピラ方言": "Caipira",
    "カイ方言": "Kajkavian",
    "カサレヴサ": "Katharevousa",
    "カナダ": "Canada",
    "カルコーフォロ語": "Carcoforo",
    "ガスコーニュ": "Gascony",
    "ガーンジー": "Guernsey",
    "キューバ": "Cuba",
    "キリスト教": "Christian",
    "クロアチア": "Croatian",
    "グアテマラ": "Guatemala",
    "グレッソネイ語": "Gressoney",
    "ケニア": "Kenya",
    "ケベック": "Quebec",
    "ケルン語": "Kölsch",
    "ケント": "Kentish",
    "コイネー": "Koine",
    "コノート": "Connacht",
    "コロンビア": "Colombia",
    "コンゴ": "Congo",
    "サカティアングイス": "Zacatianguis",
    "サーク": "Sark",
    "シェットランド": "Shetland",
    "シク教": "Sikhism",
    "シンガポール": "Singapore",
    "ジャイナ教": "Jainism",
    "ジャマイカ": "Jamaica",
    "ジャージー": "Jersey",
    "ジンバブエ": "Zimbabwe",
    "スイス": "Switzerland",
    "スコットランド": "Scotland",
    "ストゥシルヴァン": "Sutsilvan",
    "スペイン": "Spain",
    "スルシルヴァン": "Sursilvan",
    "スルミラン": "Surmiran",
    "セルビア": "Serbian",
    "セルリング": "Sylt",
    "タイ英語": "Thailand",
    "タラシケヴィツァ": "Taraškievica",
    "タントユカ": "Tantoyuca",
    "ダリー語": "Dari",
    "チコナメル": "Chiconamel",
    "チコンテペク": "Chicontepec",
    "チャ方言": "Chakavian",
    "チリ": "Chile",
    "ティマウ": "Timau",
    "テペツィントラ": "Tepetzintla",
    "テマパチェ": "Temapache",
    "ディゴル方言": "Digor",
    "デモティキ": "Demotic",
    "トスカナ語": "Tuscany",
    "ドイツ": "Germany",
    "ドイツ南部": "Southern-Germany",
    "ナチズム": "Nazism",
    "ナミビア": "Namibia",
    "ニカラグア": "Nicaragua",
    "ニューカッスル": "Tyneside",
    "ニュージーランド": "New-Zealand",
    "ヌオロ": "Nuorese",
    "バレンシア": "Valencia",
    "パナマ": "Panama",
    "ヒンズー教": "Hinduism",
    "ビザンツ": "Byzantine",
    "ビバロ・アルピーネ語": "Vivaro-Alpine",
    "フィリピン": "Philippines",
    "フェリング・エームラング": "Föhr-Amrum",
    "フォルマッツァ語": "Formazza",
    "フランス": "French",
    "ブラジル": "Brazil",
    "プロテスタント": "Protestant",
    "プロヴァンス": "Provençal",
    "プーター": "Puter",
    "ヘルゴラント": "Helgoland",
    "ベネズエラ": "Venezuela",
    "ベルギー": "Belgium",
    "ペルー": "Peru",
    "ホンジュラス": "Honduras",
    "ボスニア": "Bosnian",
    "ボリビア": "Bolivia",
    "ポルトガル": "Portugal",
    "マンスター": "Munster",
    "マーシア": "Mercian",
    "ミストラル式綴り": "Mistralian",
    "メキシコ": "Mexico",
    "モンテネグロ": "Montenegro",
    "モーリング": "Mooring",
    "ユダヤ教": "Judaism",
    "ヨーロッパ": "Europe",
    "ラングドック": "Languedoc",
    "リヒテンシュタイン": "Liechtenstein",
    "リプアーリ語": "Ripuarian",
    "リムーザン": "Limousin",
    "リメッラ語": "Rimella",
    "ルイジアナ": "Louisiana",
    "ルゼルナ": "Luserna",
    "ログドーロ": "Logudorese",
    "ロシア": "Russia",
    "ヴァラダール": "Vallander",
    "ヴィーディングハルデ": "Wiedingharde",
    "不可算": "uncountable",
    "不変化名詞": "indeclinable",
    "不活動体": "inanimate",
    "他動詞": "transitive",
    "代名詞的用法": "pronominal",
    "俗語": "slang",
    "修辞学": "rhetoric",
    "倒語": "slang",
    "再帰動詞": "reflexive",
    "初期中英語": "Early-Middle-English",
    "助動詞": "auxiliary",
    "卑語": "vulgar",
    "単数形で": "singular",
    # Several tags are given as a list so each one is stored separately.
    "単数形のみ": ["singular", "singular-only"],
    "印": "India",
    "叙法": "modal",
    "叙述用法のみ": "predicative",
    "口語": "informal",
    "古用法": "dated",
    "古語・廃語": "archaic",
    "可算": "countable",
    "地名": "place",
    "地域": "regional",
    "基数": "cardinal",
    "多文化的ロンドン英語": "Multicultural-London-English",
    "婉曲表現": "euphemistic",
    "幼児語": "childish",
    "序数": "ordinal",
    "廃語": "obsolete",
    "強い": "strong",
    "形容詞的": "attributive",
    "後期中英語": "Late-Middle-English",
    "恐らく": "possibly",
    "慣用的表現": "idiomatic",
    "排斥された語": "proscribed",
    "控えめに": "mildly",
    "換喩的に": "metonymically",
    "文章語": "literary",
    "方言": "dialectal",
    "時々": "sometimes",
    "欠如動詞": "defective",
    "正式・堅": "formal",
    "歴史": "historical",
    "比喩": "figuratively",
    "比喩的に": "figuratively",
    "比較形有り": "comparable",
    "比較形無し": "not-comparable",
    "活動体": "animate",
    "滑稽": "humorous",
    "特に": "especially",
    "状態動詞": "stative",
    "略語": "abbreviation",
    "疑問詞": "interrogative",
    "皮肉": "ironic",
    "破格": "nonstandard",
    "筋肉": "anatomy",
    "米語": "US",
    "絶対単数": ["singular-only", "singular"],
    "絶対複数": ["plural-only", "plural"],
    "能格動詞": "ergative",
    "自他動詞": "ambitransitive",
    "自動詞": "intransitive",
    "英連邦": "Commonwealth",
    "蔑称": "offensive",
    "複合語で": "in-compounds",
    "複数形で": "plural",
    "西部": "Western",
    "視覚方言": "pronunciation-spelling",
    "詩的表現": "poetic",
    "豪": "Australian",
    "転じて": "broadly",
    "軽侮語": "pejorative",
    # "近代ラテン語" means New Latin, not the Netherlands.
    "近代ラテン語": "New-Latin",
    "逐語的に": "literally",
    "通常": "usually",
    "通常複数形で": "plural-normally",
    "造語": "neologism",
    "関係詞": "relative",
    "限定": "definite",
    "集合名詞": "collective",
    "集合的に": "collective",
    "非人称": "impersonal",
    "人称": "personal",
    "非標準": "nonstandard",
    "頭字語": "initialism",
    # NOTE(review): "首都" means "capital city"; "uppercase" looks like a
    # mistranslation of "capital" - confirm the intended tag.
    "首都": "uppercase",
    # "en-verb" template
    "三単現": ["third-person", "singular", "present"],
    "現在分詞": ["present", "participle"],
    "過去形": "past",
    "過去分詞": ["past", "participle"],
    "繁": "Traditional-Chinese",
    "簡": "Simplified-Chinese",
    # zh sound
    "ピンイン": "Pinyin",
    "注音符号": "Bopomofo",
    "広東語": "Cantonese",
    "閩南語": "Min-Nan",
    "客家語": "Hakka",
    # Module:gender and number (モジュール:gender and number)
    "非有生": "inanimate",
    "有生": "animate",
    "男性人間": "virile",
    "非男性人間": "nonvirile",
    # Template:ru-noun+
    "生格": "genitive",
    "複数主格": ["nominative", "plural"],
    "複数生格": ["genitive", "plural"],
    "形容詞": ["relational", "adjective"],
    "指小形": "diminutive",
    "不完了体": "imperfective",
    "完了体": "perfective",
    # Japanese conjugation templates: Template:日本語ダ活用 (da-type) and
    # Template:日本語サ変活用 (sa-irregular)
    "未然形": "imperfective",
    "連用形": "continuative",
    "終止形": "terminal",
    "連体形": "attributive",
    "仮定形": "hypothetical",
    "命令形": "imperative",
    "命令": "imperative",
    "過去・完了": ["past", "completive"],
    "過去・完了・状態": ["past", "completive"],
    "否定形": "negative",
    "否定": "negative",
    "否定(古風)": ["negative", "archaic"],
    "自動詞化": "intransitive",
    "言い切り": "definitive",
    "名詞化": "noun-from-verb",
    # The four uses named in the key are spontaneous (自発), passive (受身),
    # potential (可能) and honorific (尊敬).
    "自発・受身\n可能・尊敬": [
        "spontaneous",
        "passive",
        "potential",
        "honorific",
    ],
    "丁寧": "polite",
    "東京式": "Tokyo",
    "京阪式": ["Kyoto", "Osaka"],
    # Template:hu-noun
    "複数・主格": "plural",
    "・主格": "nominative",
    "単数対格": ["singular", "accusative"],
    "複数対格": ["plural", "accusative"],
    "属格": "genitive",
    "ラテン文字": "romanization",
    "キリル文字": "Cyrillic",
    # Template:de-noun
    "複数形無し": "no-plural",
    "複数形": "plural",
    "指小辞無し": "no-diminutive",
    "指小辞": "diminutive",
}
# Maps a Japanese subject-area label (as it appears in ``raw_tags``) to the
# English topic name that ``translate_raw_tags`` appends to ``data.topics``.
TOPICS = {
    # Labels from Template:context/data (テンプレート:context/data)
    "BDSM": "BDSM",
    "LGBT": "LGBT",
    "SF": "science fiction",
    "アイスホッケー": "ice hockey",
    "アメリカンフットボール": "American football",
    "アーチェリー": "archery",
    "イスラム教": "Islam",
    "イデオロギー": "ideology",
    "ウイルス学": "virology",
    "エネルギー": "energy",
    "カトリック": "Catholicism",
    "カードゲーム": "cards",
    "カーリング": "curling",
    "キリスト教": "Christianity",
    "クリケット": "cricket",
    "グラフィカルユーザインタフェース": "graphical user interface",
    "グラフ理論": "graph theory",
    "コンピュータグラフィックス": "computer graphics",
    "ゴルフ": "golf",
    "サイクリング": "cycling",
    "サッカー": "soccer",
    "サーフィン": "surfing",
    "シャンチー": "xiangqi",
    "スカッシュ": "squash",
    "スキー": "skiing",
    "スケート": "skating",
    "スケートボード": "skateboarding",
    "スヌーカー": "snooker",
    "スノーボード": "snowboarding",
    "スポーツ": "sports",
    "ソフトウェア": "software",
    "ソフトボール": "softball",
    "ゾロアスター教": "Zoroastrianism",
    "ダンス": "dance",
    "ダーツ": "darts",
    "チアリーディング": "cheerleading",
    "チェス": "chess",
    "テニス": "tennis",
    "テレビ": "television",
    "ハンドボール": "handball",
    "ハードウェア": "computer hardware",
    "バスケットボール": "basketball",
    "バレーボール": "volleyball",
    "ビジネス": "business",
    "ビリヤード": "billiards",
    "ファシズム": "fascism",
    "ファッション": "fashion",
    "フェミニズム": "feminism",
    "フェンシング": "fencing",
    "フットボール": "football",
    "ブリッジ": "bridge",
    "プログラミング": "programming",
    "ボウリング": "bowling",
    "ボクシング": "boxing",
    "ボディビル": "bodybuilding",
    "ボート競技": "rowing",
    "ポーカー": "poker",
    "モータースポーツ": "motor racing",
    "ラクロス": "lacrosse",
    "ラグビー": "rugby",
    "レスリング": "wrestling",
    "交通": "transport",
    "人口学": "demography",
    "仏教": "Buddhism",
    "代数学": "algebra",
    "代数幾何学": "algebraic geometry",
    "会計": "accounting",
    "体操": "gymnastics",
    "保険": "insurance",
    "倫理学": "ethics",
    "光学": "optics",
    "免疫学": "immunology",
    "共産主義": "communism",
    "写真": "photography",
    "分類学": "taxonomy",
    "力学": "mechanics",
    "動物学": "zoology",
    "化学": "chemistry",
    "化粧品": "cosmetics",
    "医学": "medicine",
    "医療": "healthcare",
    "単位": "units of measure",
    "占星術": "astrology",
    "印刷": "printing",
    "古生物学": "paleontology",
    "哲学": "philosophy",
    "哺乳類学": "mammalogy",
    "商取引": "trading",
    "園芸": "horticulture",
    "地理": "geography",
    "地質学": "geology",
    "地震学": "seismology",
    "外科学": "surgery",
    "大工仕事": "carpentry",
    "天体物理学": "astrophysics",
    "天文学": "astronomy",
    "娯楽": "entertainment",
    "季節": "seasons",
    "宗教": "religion",
    "宝飾": "jewelry",
    "家具": "furniture",
    "寄生虫学": "parasitology",
    "将棋": "shogi",
    "岩石学": "petrology",
    "工学": "engineering",
    "幾何学": "geometry",
    "建築": "architecture",
    "微生物学": "microbiology",
    "心理学": "psychology",
    "性": "sexuality",
    "性行為": "sex",
    "情報学": "information science",
    "情報技術": "computing",
    "戦争": "war",
    "技術": "technology",
    "政府": "government",
    "政治": "politics",
    "教育": "education",
    "数学": "mathematics",
    "数論": "number theory",
    "文学": "literature",
    "文法": "grammar",
    "文献学": "philology",
    "料理": "cuisine",
    "旅行": "travel",
    "昆虫学": "entomology",
    "時間": "time",
    "有機化学": "organic chemistry",
    "林業": "forestry",
    "柔道": "judo",
    "植物学": "botany",
    "武器": "weapon",
    "歯学": "dentistry",
    "歴史": "history",
    "歴史学": "historiography",
    "気候": "climatology",
    "気象": "weather",
    "水泳": "swimming",
    "泌尿器科学": "urology",
    "法律": "legal",
    "活版印刷": "typography",
    "流体力学": "fluid dynamics",
    "海事": "nautical",
    "海洋学": "oceanography",
    "消防": "firefighting",
    "火器": "firearms",
    "火山学": "volcanology",
    "無機化学": "inorganic chemistry",
    "熱力学": "thermodynamics",
    "物理学": "physics",
    "犯罪学": "criminology",
    "狩猟": "hunting",
    "生化学": "biochemistry",
    "生態学": "ecology",
    "生物学": "biology",
    "生理学": "physiology",
    "疑似科学": "pseudoscience",
    "疫学": "epidemiology",
    "病理学": "pathology",
    "発生学": "embryology",
    "相撲": "sumo",
    "眼科学": "ophthalmology",
    "社会学": "sociology",
    "社会科学": "social science",
    "社会言語学": "sociolinguistics",
    "神学": "theology",
    "神経学": "neurology",
    "神経解剖学": "neuroanatomy",
    "神話": "mythology",
    "神道": "Shinto",
    "科学": "sciences",
    "競馬": "horse racing",
    "精神医学": "psychiatry",
    "紋章学": "heraldry",
    "紡績": "weaving",
    "細胞学": "cytology",
    "細菌学": "bacteriology",
    "経営学": "management",
    "経済": "economics",
    "統計学": "statistics",
    "線型代数学": "linear algebra",
    "翻訳研究": "translation studies",
    "老年学": "gerontology",
    "考古学": "archaeology",
    "肉": "meat",
    "腫瘍学": "oncology",
    "自動車": "automobile",
    "自動車機器": "automotive",
    "航空": "aviation",
    "航空工学": "aeronautics",
    "色": "color",
    "花粉学": "palynology",
    "芸術": "arts",
    "著作権": "copyright",
    "薬理学": "pharmacology",
    "藻類学": "phycology",
    # "蠍" means scorpion; it was previously mapped to "beer".
    "蠍": "scorpions",
    "血液学": "hematology",
    "衣類": "clothing",
    "製造": "manufacturing",
    "解剖学": "anatomy",
    "解析学": "mathematical analysis",
    "言語": "language",
    "言語学": "linguistics",
    "詩": "poetry",
    "語彙論": "lexicology",
    "語用論": "pragmatics",
    "調理": "cooking",
    "論理学": "logic",
    "資本主義": "capitalism",
    "超心理学": "parapsychology",
    "軍事": "military",
    "辞書学": "lexicography",
    "農業": "agriculture",
    "通貨": "numismatics",
    "運動": "exercise",
    "道路": "road",
    "遺伝学": "genetics",
    "都市": "city",
    "都道府県": "prefectures of Japan",
    # NOTE(review): "酒" covers alcoholic drinks in general; confirm that
    # the narrower topic "beer" is intended.
    "酒": "beer",
    "重量挙げ": "weightlifting",
    "野球": "baseball",
    "野菜": "vegetable",
    "金融": "finance",
    "釣り": "fishing",
    "鉄道": "rail transport",
    "鉱物学": "mineralogy",
    "陸上競技": "athletics",
    "集合論": "set theory",
    "電子工学": "electronics",
    "電気": "electricity",
    "電磁気学": "electromagnetism",
    "電話": "telephone",
    "音声学": "phonetics",
    "音楽": "music",
    "音韻論": "phonology",
    "韻律": "prosody",
    "食品": "food",
    "馬術": "equestrianism",
    "魚": "fish",
    "魚類学": "ichthyology",
    "鳥類学": "ornithology",
    "麻雀": "mahjong",
    "演算": "arithmetic",
}
def translate_raw_tags(data: WordEntry) -> None:
    """Translate the labels in ``data.raw_tags`` into tags and topics.

    Each raw label is looked up in ``TAGS`` first and then in ``TOPICS``
    (topics are only recorded when ``data`` has a ``topics`` attribute).
    A label containing ``/`` that is not itself a known label is split on
    ``/`` and every stripped part is looked up the same way.

    ``data`` is modified in place: translated values are appended to
    ``data.tags`` / ``data.topics`` without duplicates, and
    ``data.raw_tags`` is replaced by the labels that could not be
    translated.  Unrecognised parts of a slash-separated label are kept
    instead of being silently dropped; if no part was recognised the
    original label is kept verbatim.
    """
    untranslated = []
    for raw_tag in data.raw_tags:
        if _translate_label(raw_tag, data):
            continue
        if "/" not in raw_tag:
            untranslated.append(raw_tag)
            continue

        # Compound label such as "a / b": translate the parts one by one.
        parts = [part.strip() for part in raw_tag.split("/")]
        parts = [part for part in parts if part]
        leftovers = []
        for part in parts:
            if not _translate_label(part, data):
                leftovers.append(part)

        if len(leftovers) == len(parts):
            # Nothing was recognised, so the "/" was probably not a
            # separator; keep the label exactly as it was.
            untranslated.append(raw_tag)
        else:
            untranslated.extend(leftovers)
    data.raw_tags = untranslated


def _translate_label(label: str, data: WordEntry) -> bool:
    """Record ``label`` as a tag or topic on ``data``; return True if known."""
    if label in TAGS:
        add_tag(label, data)
        return True
    if label in TOPICS and hasattr(data, "topics"):
        topic = TOPICS[label]
        if topic not in data.topics:
            data.topics.append(topic)
        return True
    return False
def add_tag(raw_tag: str, data: WordEntry) -> None:
    """Append the translation(s) of ``raw_tag`` to ``data.tags``.

    ``raw_tag`` must be a key of ``TAGS`` (a ``KeyError`` is raised
    otherwise).  The mapped value may be one tag string or a list of tag
    strings; each tag is appended only if ``data.tags`` does not already
    contain it.
    """
    translation = TAGS[raw_tag]
    if isinstance(translation, str):
        new_tags = [translation]
    elif isinstance(translation, list):
        new_tags = translation
    else:
        # Any other value type is ignored.
        return
    for tag in new_tags:
        if tag not in data.tags:
            data.tags.append(tag)