Coverage for src/wiktextract/extractor/ja/sound.py: 90%

1import itertools

3from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from ..share import set_sound_file_url_fields

8from .models import Sound, WordEntry

9from .tags import translate_raw_tags

def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract sounds from a pronunciation section and attach them to entries.

    When ``base_data.lang_code`` is ``"zh"`` the section is handled by
    ``extract_zh_sounds``; otherwise every template found recursively under
    ``level_node`` is passed to ``process_sound_template``.

    Args:
        wxr: Extraction context, forwarded unchanged to the helpers.
        page_data: Word entries gathered so far. Every entry whose
            ``lang_code`` equals ``base_data.lang_code`` receives the
            extracted sounds and categories.
        base_data: Entry whose ``lang_code`` selects the extraction path; it
            also receives the results when ``level_node`` is a level-3 node.
        level_node: Section node that is searched for sound data.
    """
    sounds: list[Sound] = []
    # Filled by `clean_node` inside `process_sound_template`; only the
    # "categories" key is read back below.
    cats: dict[str, list[str]] = {}
    if base_data.lang_code == "zh":
        extract_zh_sounds(wxr, level_node, sounds)
    else:
        for template_node in level_node.find_child_recursively(
            NodeKind.TEMPLATE
        ):
            process_sound_template(wxr, template_node, sounds, cats)

    if level_node.kind == NodeKind.LEVEL3:
        # NOTE(review): presumably `base_data` is updated here so that the
        # data is shared by entries created from it later - confirm with the
        # caller.
        base_data.sounds.extend(sounds)
        base_data.categories.extend(cats.get("categories", []))

    # Entries that already exist for the same language get the data too.
    for data in page_data:
        if data.lang_code == base_data.lang_code:
            data.sounds.extend(sounds)
            data.categories.extend(cats.get("categories", []))

def process_sound_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
    cats: dict[str, list[str]],
) -> None:
    """Turn one pronunciation template into ``Sound`` objects.

    Supported template names:

    * ``音声``: audio file name in parameter 2 and an optional raw tag in
      parameter 3. Nothing is added when the file name is empty.
    * ``IPA`` / ``X-SAMPA``: one sound per non-empty positional parameter,
      reading parameters 1, 2, ... until the first missing index. Sounds from
      ``X-SAMPA`` get the ``"X-SAMPA"`` tag.
    * ``homophones``: the non-empty positional parameters (same scan as
      above) are stored together in a single sound.
    * ``ja-pron`` and ``ja-accent-common``: delegated to
      ``process_ja_pron_template`` and ``process_ja_accent_common_template``.

    Regardless of the template name, the node is finally passed to
    ``clean_node`` with ``cats`` as the data argument.

    Args:
        wxr: Extraction context used for cleaning nodes.
        template_node: Template to inspect.
        sounds: Output list; new sounds are appended in place.
        cats: Dictionary handed to the final ``clean_node`` call.
    """
    name = template_node.template_name
    params = template_node.template_parameters

    if name == "音声":
        file_name = clean_node(wxr, None, params.get(2, ""))
        if file_name != "":
            audio = Sound()
            label = clean_node(wxr, None, params.get(3, ""))
            if label != "":
                audio.raw_tags.append(label)
            set_sound_file_url_fields(wxr, file_name, audio)
            sounds.append(audio)
    elif name in ("IPA", "X-SAMPA"):
        # Walk the positional parameters until the first gap.
        position = 1
        while position in params:
            transcription = clean_node(wxr, None, params[position])
            if transcription != "":
                entry = Sound(ipa=transcription)
                if name == "X-SAMPA":
                    entry.tags.append("X-SAMPA")
                sounds.append(entry)
            position += 1
    elif name == "homophones":
        words = []
        position = 1
        while position in params:
            word = clean_node(wxr, None, params[position])
            if word != "":
                words.append(word)
            position += 1
        if words:
            sounds.append(Sound(homophones=words))
    elif name == "ja-pron":
        process_ja_pron_template(wxr, template_node, sounds)
    elif name == "ja-accent-common":
        process_ja_accent_common_template(wxr, template_node, sounds)

    # Runs for every template, including names not handled above.
    clean_node(wxr, cats, template_node)

# Accent-type labels looked up by `process_ja_pron_template`: the key is the
# cleaned text of a link in the expanded `ja-pron` output, the value is the
# tag appended to the sound.
JA_PRON_ACCENTS: dict[str, str] = {
    "中高型": "Nakadaka",
    "平板型": "Heiban",
    "頭高型": "Atamadaka",
    "尾高型": "Odaka",
}

def process_ja_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract sounds from a ``ja-pron`` template.

    The template is expanded and re-parsed. Each list item of the result
    that does not contain a table yields at most one sound, filled from the
    item's ``<span>`` tags (chosen by substring match on the ``class``
    attribute) and from links that are direct children of the item and whose
    text is a key of ``JA_PRON_ACCENTS``. A sound is kept only if at least
    one field differs from its default. Afterwards the ``a`` and ``audio``
    template arguments each add an audio-file sound when non-empty.

    Args:
        wxr: Extraction context used for expanding and cleaning nodes.
        template_node: The ``ja-pron`` template node.
        sounds: Output list; new sounds are appended in place.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-pron
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    # Checked in this order; the first class found decides the target field.
    class_to_field = (("IPA", "ipa"), ("Latn", "roman"), ("Jpan", "form"))

    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        if item.contain_node(NodeKind.TABLE):
            continue

        pron = Sound()
        for span in item.find_html_recursively("span"):
            class_attr = span.attrs.get("class", "")
            if "qualifier-content" in class_attr:
                qualifier = clean_node(wxr, None, span)
                if qualifier != "":
                    pron.raw_tags.append(qualifier)
                continue
            for css_class, field_name in class_to_field:
                if css_class in class_attr:
                    setattr(pron, field_name, clean_node(wxr, None, span))
                    break

        for link in item.find_child(NodeKind.LINK):
            accent_tag = JA_PRON_ACCENTS.get(clean_node(wxr, None, link))
            if accent_tag is not None:
                pron.tags.append(accent_tag)

        # Skip list items that contributed nothing.
        if pron.model_dump(exclude_defaults=True):
            sounds.append(pron)

    params = template_node.template_parameters
    for key in ("a", "audio"):
        file_name = clean_node(wxr, None, params.get(key, ""))
        if file_name != "":
            audio = Sound()
            set_sound_file_url_fields(wxr, file_name, audio)
            sounds.append(audio)

138

139

# Accent-type codes looked up by `process_ja_accent_common_template`: the key
# is the cleaned first positional argument of `ja-accent-common`, the value
# is the tag appended to the sound.
JA_ACCENT_COMMON_TYPES: dict[str, str] = {
    "h": "Heiban",
    "a": "Atamadaka",
    "n": "Nakadaka",
    "o": "Odaka",
}

146

147

def process_ja_accent_common_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract one sound from a ``ja-accent-common`` template.

    The template is expanded and re-parsed. The first link with non-empty
    text becomes a raw tag, the first ``<span>`` with non-empty text becomes
    the sound's ``form``, and the first positional argument is translated
    through ``JA_ACCENT_COMMON_TYPES`` into a tag. The sound is appended only
    when ``form`` is not the empty string.

    Args:
        wxr: Extraction context used for expanding and cleaning nodes.
        template_node: The ``ja-accent-common`` template node.
        sounds: Output list; the sound is appended in place.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-accent-common
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    accent = Sound()

    # Lazy generators: nodes are cleaned only until the first non-empty hit.
    link_texts = (
        clean_node(wxr, None, link)
        for link in expanded.find_child_recursively(NodeKind.LINK)
    )
    first_link_text = next((text for text in link_texts if text != ""), None)
    if first_link_text is not None:
        accent.raw_tags.append(first_link_text)

    span_texts = (
        clean_node(wxr, None, span)
        for span in expanded.find_html_recursively("span")
    )
    first_span_text = next((text for text in span_texts if text != ""), None)
    if first_span_text is not None:
        accent.form = first_span_text

    type_code = clean_node(
        wxr, None, template_node.template_parameters.get(1, "")
    )
    type_tag = JA_ACCENT_COMMON_TYPES.get(type_code)
    if type_tag is not None:
        accent.tags.append(type_tag)

    if accent.form == "":
        return
    sounds.append(accent)

175

176

def extract_zh_sounds(
    wxr: WiktextractContext, level_node: LevelNode, sounds: list[Sound]
) -> None:
    """Build one sound per list item found under ``level_node``.

    The children of each list item are split at the first ``":"`` that
    occurs inside a plain-string child: everything before it is cleaned into
    a single raw tag, everything after it is cleaned into ``zh_pron``. If no
    string child contains ``":"``, all children go to the raw tag. The sound
    is passed through ``translate_raw_tags`` before being appended.

    Args:
        wxr: Extraction context used for cleaning nodes.
        level_node: Section node searched recursively for list items.
        sounds: Output list; new sounds are appended in place.
    """
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        label_parts = []
        pron_parts = []
        colon_seen = False
        for node in item.children:
            if colon_seen:
                pron_parts.append(node)
            elif isinstance(node, str) and ":" in node:
                # Only the first colon separates the label from the value.
                before, _, after = node.partition(":")
                label_parts.append(before)
                pron_parts.append(after)
                colon_seen = True
            else:
                label_parts.append(node)

        pron = Sound(
            zh_pron=clean_node(wxr, None, pron_parts),
            raw_tags=[clean_node(wxr, None, label_parts)],
        )
        translate_raw_tags(pron)
        sounds.append(pron)

198 sounds.append(sound)