Coverage for src / wiktextract / extractor / ja / kanji.py: 88%

32 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-17 07:22 +0000

1import re 

2 

3from wikitextprocessor.parser import TemplateNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Form, WordEntry 

8 

# On'yomi (音読み) parameters: Chinese-derived readings, written in katakana.
_JA_KANJI_ON_PARAMS = {
    "呉音": "go-on",  # historical reading
    "漢音": "kan-on",  # historical reading
    "唐音": "to-on",  # historical reading
    "慣用音": "kan-yo-on",  # customary reading, often corrupted/non-standard
    "音": "on",  # generic on'yomi, specific type unknown
}

# Kun'yomi (訓読み) parameters: native Japanese readings, written in hiragana.
_JA_KANJI_KUN_PARAMS = {
    "訓": "kun",  # standard kun'yomi
    "古訓": "ko-kun",  # archaic kun'yomi
    "名乗り": "nanori",  # readings used exclusively in personal names
}

JA_KANJI_TEMPLATE_PARAMS = {**_JA_KANJI_ON_PARAMS, **_JA_KANJI_KUN_PARAMS}
"""Map reading-bearing parameters of the ja-kanji template to their tag.

Keys are the (Japanese) template parameter names, values are the English
tag attached to every reading found under that parameter.

On'yomi (音読み), Chinese-derived readings (katakana):
- 呉音 -> "go-on": historical reading
- 漢音 -> "kan-on": historical reading
- 唐音 -> "to-on": historical reading
- 慣用音 -> "kan-yo-on": customary reading, often corrupted or non-standard
- 音 -> "on": generic on'yomi when the specific type is unknown

Kun'yomi (訓読み), native Japanese readings (hiragana):
- 訓 -> "kun": standard kun'yomi
- 古訓 -> "ko-kun": archaic kun'yomi
- 名乗り -> "nanori": readings used exclusively in personal names
"""

33 

34 

# Everything from the first "<", ";" or "/" to the end of a reading segment
# is dropped before the reading is stored.
# NOTE(review): presumably these introduce refs/notes/alternatives in the
# template arguments - confirm against real ja-kanji usages.
_JA_KANJI_READING_SUFFIX_RE = re.compile(r"[<;/].*$")


def _split_ja_kanji_readings(text: str) -> list:
    """Split a comma-separated reading list into cleaned, non-empty readings."""
    readings = []
    for segment in text.split(","):
        cleaned = _JA_KANJI_READING_SUFFIX_RE.sub("", segment).strip()
        # Trailing/doubled commas and annotation-only segments clean to "".
        if cleaned:
            readings.append(cleaned)
    return readings


def extract_ja_kanji(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> None:
    """Append the readings of a ``ja-kanji`` template to ``base_data.forms``.

    Every template parameter whose name (ignoring a trailing number, e.g.
    ``訓2``) is a key of ``JA_KANJI_TEMPLATE_PARAMS`` is expanded with
    ``clean_node`` and split on commas. Each non-empty reading becomes a
    ``Form`` tagged ``"transliteration"`` plus the reading type; readings that
    also appear in the ``常用`` parameter additionally get the ``"joyo"`` tag.

    Args:
        wxr: Extraction context, passed through to ``clean_node``.
        base_data: Entry whose ``forms`` list is extended in place.
        t_node: The ``ja-kanji`` template node to read parameters from.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-kanji
    # First collect 常用 (joyo) readings (readings in active use);
    # the parameter mixes on'yomi (katakana) and kun'yomi (hiragana).
    joyo = set()
    joyo_value = t_node.template_parameters.get("常用", "")
    if joyo_value:
        joyo.update(
            _split_ja_kanji_readings(clean_node(wxr, base_data, joyo_value))
        )

    # Then extract all readings, tagging joyo ones accordingly.
    for param, value in t_node.template_parameters.items():
        # Non-string keys cannot name a reading parameter
        # (presumably positional arguments - verify against the parser).
        if not isinstance(param, str):
            continue
        # Numbered variants such as "訓2" share the tag of their base name.
        reading_type = JA_KANJI_TEMPLATE_PARAMS.get(re.sub(r"\d+$", "", param))
        if reading_type is None:
            continue
        # Empty segments are skipped here exactly as for the joyo set, so no
        # Form with an empty ``form`` is ever emitted.
        for reading in _split_ja_kanji_readings(
            clean_node(wxr, base_data, value)
        ):
            tags = ["transliteration", reading_type]
            if reading in joyo:
                tags.append("joyo")
            base_data.forms.append(Form(form=reading, tags=tags))