Coverage for src/wiktextract/extractor/ja/kanji.py: 88%

1import re

3from wikitextprocessor.parser import TemplateNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from .models import Form, WordEntry

JA_KANJI_TEMPLATE_PARAMS = {
    # On'yomi (音読み): Chinese-derived readings, written in katakana.
    "呉音": "go-on",  # historical reading
    "漢音": "kan-on",  # historical reading
    "唐音": "to-on",  # historical reading
    "慣用音": "kan-yo-on",  # customary reading, often corrupted/non-standard
    "音": "on",  # generic on'yomi, specific type unknown
    # Kun'yomi (訓読み): native Japanese readings, written in hiragana.
    "訓": "kun",  # standard kun'yomi
    "古訓": "ko-kun",  # archaic kun'yomi
    "名乗り": "nanori",  # used exclusively in personal names
}
"""Reading-bearing parameters of the ja-kanji template.

Each key is a template parameter name and each value is the tag attached to
the readings found in that parameter. The entries fall into two groups:
on'yomi (呉音, 漢音, 唐音, 慣用音, 音) and kun'yomi (訓, 古訓, 名乗り).
"""

def extract_ja_kanji(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> None:
    """Add the readings listed in a ja-kanji template to ``base_data.forms``.

    Every string-named parameter whose name (ignoring a trailing number) is
    a key of ``JA_KANJI_TEMPLATE_PARAMS`` is cleaned with ``clean_node`` and
    split on commas. Each non-empty reading is appended as a ``Form`` tagged
    ``"transliteration"`` plus the reading type, and additionally ``"joyo"``
    when the same reading also appears in the 常用 parameter.

    Args:
        wxr: Extraction context, passed through to ``clean_node``.
        base_data: Entry whose ``forms`` list is extended in place.
        t_node: The ja-kanji template node whose parameters are read.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-kanji
    # Removes a trailing annotation that starts at "<", ";" or "/" so that
    # only the reading itself is kept.
    annotation_re = re.compile(r"[<;/].*$")

    # First collect 常用 (joyo) readings (readings in active use);
    # this parameter mixes on'yomi (katakana) and kun'yomi (hiragana).
    joyo = set()
    joyo_value = t_node.template_parameters.get("常用", "")
    if joyo_value:
        joyo_reading = clean_node(wxr, base_data, joyo_value)
        for raw in joyo_reading.split(","):
            r = annotation_re.sub("", raw).strip()
            if r:
                joyo.add(r)

    # Then extract all readings, tagging joyo ones accordingly.
    for param, value in t_node.template_parameters.items():
        if not isinstance(param, str):
            continue
        # A trailing number on the parameter name is ignored when looking up
        # the reading type.
        base_param = re.sub(r"\d+$", "", param)
        reading_type = JA_KANJI_TEMPLATE_PARAMS.get(base_param)
        if reading_type is None:
            continue
        reading = clean_node(wxr, base_data, value)
        if not reading:
            continue
        for raw in reading.split(","):
            r = annotation_re.sub("", raw).strip()
            if not r:
                # A trailing comma or an annotation-only entry leaves nothing
                # behind; do not emit a Form with an empty string.
                continue
            tags = ["transliteration", reading_type]
            if r in joyo:
                tags.append("joyo")
            base_data.forms.append(Form(form=r, tags=tags))