Coverage for src/wiktextract/extractor/ja/kanji.py: 92%
40 statements
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
1import re
3from wikitextprocessor.parser import TemplateNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
JA_KANJI_TEMPLATE_PARAMS: dict[str, str] = {
    # On'yomi (音読み): Chinese-derived readings, written in katakana.
    "呉音": "go-on",  # historical reading
    "漢音": "kan-on",  # historical reading
    "唐音": "to-on",  # historical reading
    "慣用音": "kan-yo-on",  # customary reading, often corrupted/non-standard
    "音": "on",  # generic on'yomi when the specific type is unknown
    # Kun'yomi (訓読み): native Japanese readings, written in hiragana.
    "訓": "kun",  # standard kun'yomi
    "古訓": "ko-kun",  # archaic kun'yomi
    "名乗り": "nanori",  # readings used exclusively in personal names
}
"""Reading-bearing parameters of the ja-kanji template.

Maps each parameter name (without any trailing number) to the tag attached
to the readings extracted from it.
"""
35def _parse_readings(raw: str) -> list[str]:
36 return [
37 r
38 for raw_r in raw.split(",")
39 if (r := re.sub(r"[<;/].*$", "", raw_r).strip())
40 ]
def extract_ja_kanji(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> None:
    """Append the readings of a ja-kanji template to ``base_data.forms``.

    Every reading found in a parameter listed in JA_KANJI_TEMPLATE_PARAMS
    becomes a ``Form`` tagged ``"transliteration"`` plus the reading type,
    and additionally ``"joyo"`` when the same reading also appears in the
    常用 parameter.

    Template reference: https://ja.wiktionary.org/wiki/テンプレート:ja-kanji
    """
    # 常用 (joyo) lists the readings in active use; on'yomi (katakana) and
    # kun'yomi (hiragana) are mixed together in that single parameter.
    joyo_readings: set[str] = set()
    joyo_arg = t_node.template_parameters.get("常用", "")
    if joyo_arg:
        joyo_text = clean_node(wxr, base_data, joyo_arg)
        joyo_readings.update(_parse_readings(joyo_text))

    # 呉音=* means "same as the kan-on (漢音) reading", so the kan-on
    # readings are gathered up front.  Parameters may be numbered
    # (漢音1=, 漢音2=), hence the scan over every parameter name.
    kan_on_readings: list[str] = []
    for name, arg in t_node.template_parameters.items():
        if (
            isinstance(name, str)
            and JA_KANJI_TEMPLATE_PARAMS.get(re.sub(r"\d+$", "", name))
            == "kan-on"
        ):
            kan_on_text = clean_node(wxr, base_data, arg)
            kan_on_readings.extend(_parse_readings(kan_on_text))

    # Second pass: emit a form for each reading of each known parameter.
    for name, arg in t_node.template_parameters.items():
        if not isinstance(name, str):
            continue
        reading_type = JA_KANJI_TEMPLATE_PARAMS.get(
            re.sub(r"\d+$", "", name)
        )
        if reading_type is None:
            continue

        text = clean_node(wxr, base_data, arg)
        if reading_type == "go-on" and text.strip() == "*":
            # Placeholder value: reuse the kan-on readings as go-on.
            readings = kan_on_readings
        else:
            readings = _parse_readings(text)

        for reading in readings:
            tags = ["transliteration", reading_type]
            if reading in joyo_readings:
                tags.append("joyo")
            base_data.forms.append(Form(form=reading, tags=tags))