Coverage for src / wiktextract / extractor / ja / kanji.py: 88%
32 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-17 07:22 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-17 07:22 +0000
1import re
3from wikitextprocessor.parser import TemplateNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
JA_KANJI_TEMPLATE_PARAMS = {
    # On'yomi (音読み)
    "呉音": "go-on",
    "漢音": "kan-on",
    "唐音": "to-on",
    "慣用音": "kan-yo-on",
    "音": "on",
    # Kun'yomi (訓読み)
    "訓": "kun",
    "古訓": "ko-kun",
    "名乗り": "nanori",
}
"""Map each reading parameter of the ja-kanji template to its output tag.

On'yomi (音読み) are Chinese-derived readings, written in katakana:

- 呉音 -> "go-on": historical reading
- 漢音 -> "kan-on": historical reading
- 唐音 -> "to-on": historical reading
- 慣用音 -> "kan-yo-on": customary reading, often corrupted or non-standard
- 音 -> "on": generic on'yomi, used when the specific type is unknown

Kun'yomi (訓読み) are native Japanese readings, written in hiragana:

- 訓 -> "kun": standard kun'yomi
- 古訓 -> "ko-kun": archaic kun'yomi
- 名乗り -> "nanori": readings used exclusively in personal names
"""
# Matches everything from the first "<", ";" or "/" to the end of a reading.
_JA_KANJI_READING_TAIL_RE = re.compile(r"[<;/].*$")


def _split_ja_kanji_readings(text: str) -> list[str]:
    """Split a comma-separated reading list into cleaned, non-empty readings.

    Each piece has everything from its first ``<``, ``;`` or ``/`` onwards
    removed and is then stripped of surrounding whitespace. Pieces that end
    up empty (doubled or trailing commas, annotation-only pieces) are
    dropped.
    """
    readings = []
    for piece in text.split(","):
        cleaned = _JA_KANJI_READING_TAIL_RE.sub("", piece).strip()
        if cleaned:
            readings.append(cleaned)
    return readings


def extract_ja_kanji(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> None:
    """Append the readings found in a ja-kanji template to ``base_data.forms``.

    Every template parameter whose name, after removing a trailing run of
    digits, is a key of ``JA_KANJI_TEMPLATE_PARAMS`` is expanded with
    ``clean_node`` and split on commas. Each non-empty reading becomes a
    ``Form`` tagged ``"transliteration"`` plus the mapped reading type, and
    additionally ``"joyo"`` when the same reading appears in the ``常用``
    parameter.

    Args:
        wxr: Extraction context, passed through to ``clean_node``.
        base_data: Word entry that receives the new forms (mutated in place).
        t_node: The ja-kanji template node whose parameters are read.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-kanji
    # First collect 常用 (joyo) readings (readings in active use);
    # this parameter mixes on'yomi (katakana) and kun'yomi (hiragana).
    joyo: set[str] = set()
    joyo_value = t_node.template_parameters.get("常用", "")
    if joyo_value:
        joyo.update(
            _split_ja_kanji_readings(clean_node(wxr, base_data, joyo_value))
        )

    # Then extract all readings, tagging joyo ones accordingly.
    for param, value in t_node.template_parameters.items():
        if not isinstance(param, str):
            # Positional (non-string) parameters never name a reading type.
            continue
        # Numbered variants such as "訓2" share the tag of their base name.
        base_param = re.sub(r"\d+$", "", param)
        reading_type = JA_KANJI_TEMPLATE_PARAMS.get(base_param)
        if reading_type is None:
            continue
        reading = clean_node(wxr, base_data, value)
        if not reading:
            continue
        # Empty pieces are skipped here exactly as for the joyo set, so that
        # stray commas do not produce forms with an empty string.
        for r in _split_ja_kanji_readings(reading):
            tags = [reading_type]
            if r in joyo:
                tags.append("joyo")
            base_data.forms.append(
                Form(form=r, tags=["transliteration", *tags])
            )