Coverage for src/wiktextract/extractor/ja/kanji.py: 92%

40 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-06-23 09:14 +0000

1import re 

2 

3from wikitextprocessor.parser import TemplateNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8 

# Maps the Japanese-language reading parameter names of the ja-kanji template
# to the tag string attached to readings found under that parameter.
# Lookups in this module strip any trailing digits from the parameter name
# first, so numbered variants (e.g. 漢音1, 漢音2) resolve to the same entry.
JA_KANJI_TEMPLATE_PARAMS: dict[str, str] = {
    "呉音": "go-on",
    "漢音": "kan-on",
    "唐音": "to-on",
    "慣用音": "kan-yo-on",
    "音": "on",
    "訓": "kun",
    "古訓": "ko-kun",
    "名乗り": "nanori",
}
"""Parameters of the ja-kanji template that contain readings.

On'yomi (音読み); Chinese-derived readings (katakana):
- 呉音 (go-on): Historical reading
- 漢音 (kan-on): Historical reading
- 唐音 (to-on): Historical reading
- 慣用音 (kan-yo-on): Customary reading, often corrupted or non-standard
- 音 (on): Generic on'yomi when the specific type is unknown

Kun'yomi (訓読み); native Japanese readings (hiragana):
- 訓 (kun): Standard kun'yomi
- 古訓 (ko-kun): Archaic kun'yomi
- 名乗り (nanori): Readings used exclusively in personal names
"""

33 

34 

35def _parse_readings(raw: str) -> list[str]: 

36 return [ 

37 r 

38 for raw_r in raw.split(",") 

39 if (r := re.sub(r"[<;/].*$", "", raw_r).strip()) 

40 ] 

41 

42 

def extract_ja_kanji(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> None:
    """Append the readings listed in a ja-kanji template to ``base_data.forms``.

    Template documentation:
    https://ja.wiktionary.org/wiki/テンプレート:ja-kanji

    Every reading found under a parameter known to
    ``JA_KANJI_TEMPLATE_PARAMS`` (optionally suffixed with digits, e.g.
    漢音1=, 漢音2=) becomes a ``Form`` tagged ``"transliteration"`` plus the
    reading type. Readings that also appear in the 常用 parameter receive an
    extra ``"joyo"`` tag. A 呉音 value of ``*`` is replaced by the collected
    漢音 readings.

    Args:
        wxr: Context forwarded to ``clean_node``.
        base_data: Entry whose ``forms`` list is extended in place; also
            forwarded to ``clean_node``.
        t_node: The ja-kanji template node to read parameters from.
    """
    params = t_node.template_parameters

    def reading_type_of(name):
        # Non-string parameter names never name a reading; numbered
        # variants share the entry of their un-numbered base name.
        if not isinstance(name, str):
            return None
        return JA_KANJI_TEMPLATE_PARAMS.get(re.sub(r"\d+$", "", name))

    # 常用 mixes on'yomi (katakana) and kun'yomi (hiragana); it is only used
    # to decide which of the readings below get the "joyo" tag.
    joyo_readings = set()
    joyo_arg = params.get("常用", "")
    if joyo_arg:
        joyo_readings.update(
            _parse_readings(clean_node(wxr, base_data, joyo_arg))
        )

    # Gather every 漢音 reading up front so that 呉音=* can be resolved no
    # matter where the 呉音 parameter sits relative to 漢音/漢音1/漢音2.
    kan_on_readings: list[str] = []
    for name, arg in params.items():
        if reading_type_of(name) == "kan-on":
            kan_on_readings.extend(
                _parse_readings(clean_node(wxr, base_data, arg))
            )

    # Emit one form per reading, in template parameter order.
    for name, arg in params.items():
        kind = reading_type_of(name)
        if kind is None:
            continue
        text = clean_node(wxr, base_data, arg)
        if kind == "go-on" and text.strip() == "*":
            resolved = kan_on_readings
        else:
            resolved = _parse_readings(text)
        for reading in resolved:
            form_tags = ["transliteration", kind]
            if reading in joyo_readings:
                form_tags.append("joyo")
            base_data.forms.append(Form(form=reading, tags=form_tags))