Coverage for src/wiktextract/extractor/ja/sound.py: 86%

149 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import itertools 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import set_sound_file_url_fields 

8from .models import Sound, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract pronunciation data from a section and attach it to entries.

    When ``base_data.lang_code`` is ``"zh"`` the section is handed to
    ``extract_zh_sounds``; otherwise every template found anywhere below
    ``level_node`` is passed to ``process_sound_template``.

    The collected sounds and categories are then added to:

    * ``base_data`` and every entry of ``page_data`` sharing its
      ``lang_code``, when ``level_node`` is a level-3 node;
    * the last entry of ``page_data``, when the node is of another level
      and ``page_data`` is not empty;
    * ``base_data`` alone, otherwise.
    """
    collected: list[Sound] = []
    cat_data: dict[str, list[str]] = {}
    if base_data.lang_code == "zh":
        extract_zh_sounds(wxr, level_node, collected)
    else:
        for tpl in level_node.find_child_recursively(NodeKind.TEMPLATE):
            process_sound_template(wxr, tpl, collected, cat_data)

    # Work out which entries receive the data, then fill them in one pass.
    if level_node.kind == NodeKind.LEVEL3:
        targets = [base_data]
        targets.extend(
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        )
    elif len(page_data) > 0:
        targets = [page_data[-1]]
    else:
        targets = [base_data]

    categories = cat_data.get("categories", [])
    for entry in targets:
        entry.sounds.extend(collected)
        entry.categories.extend(categories)

41 

42 

def process_sound_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
    cats: dict[str, list[str]],
) -> None:
    """Append the ``Sound`` items described by one template to ``sounds``.

    Handled template names:

    * ``音声``: argument 2 is the audio file (skipped when empty or ``-``),
      argument 3 an optional raw tag.
    * ``IPA`` / ``X-SAMPA``: one sound per non-empty positional argument,
      read from argument 1 until the first missing index; ``X-SAMPA``
      sounds also get the ``X-SAMPA`` tag.
    * ``homophones``: all non-empty positional arguments are gathered into
      a single sound.
    * ``ja-pron`` / ``ja-accent-common``: delegated to their own handlers.

    Whatever the template name is, the node is finally passed to
    ``clean_node`` together with ``cats``.
    """
    name = template_node.template_name
    params = template_node.template_parameters

    if name == "音声":
        file_name = clean_node(wxr, None, params.get(2, ""))
        if file_name not in ["", "-"]:
            audio = Sound()
            label = clean_node(wxr, None, params.get(3, ""))
            if label:
                audio.raw_tags.append(label)
            set_sound_file_url_fields(wxr, file_name, audio)
            sounds.append(audio)
    elif name in ["IPA", "X-SAMPA"]:
        position = 1
        while position in params:
            ipa_text = clean_node(wxr, None, params[position])
            if ipa_text:
                ipa_sound = Sound(ipa=ipa_text)
                if name == "X-SAMPA":
                    ipa_sound.tags.append("X-SAMPA")
                sounds.append(ipa_sound)
            position += 1
    elif name == "homophones":
        words = []
        position = 1
        while position in params:
            word = clean_node(wxr, None, params[position])
            if word:
                words.append(word)
            position += 1
        if words:
            sounds.append(Sound(homophones=words))
    elif name == "ja-pron":
        process_ja_pron_template(wxr, template_node, sounds)
    elif name == "ja-accent-common":
        process_ja_accent_common_template(wxr, template_node, sounds)

    # Runs for every template, including names not handled above.
    clean_node(wxr, cats, template_node)

92 

93 

# Link texts looked up by `process_ja_pron_template`: when a link inside an
# expanded `ja-pron` list item has one of these texts, the mapped value is
# appended to the sound's tags. Each value is the romanised key.
JA_PRON_ACCENTS = {
    "中高型": "Nakadaka",
    "平板型": "Heiban",
    "頭高型": "Atamadaka",
    "尾高型": "Odaka",
}

100 

101 

def process_ja_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract sounds from a ``ja-pron`` template.

    The template is expanded and re-parsed. Each list item of the result
    that does not contain a table yields one ``Sound`` built from its
    ``span`` tags, chosen by substring match on the ``class`` attribute:
    ``qualifier-content`` gives a raw tag, ``IPA`` the IPA text, ``Latn``
    the romanisation and ``Jpan`` the form. Direct child links whose text
    is a key of ``JA_PRON_ACCENTS`` add the mapped tag. A sound is kept
    only if at least one of its fields differs from the default.

    Afterwards the ``a`` and ``audio`` template arguments are read; each
    non-empty one produces a separate audio-file sound.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-pron
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        if item.contain_node(NodeKind.TABLE):
            continue

        entry = Sound()
        for span in item.find_html_recursively("span"):
            classes = span.attrs.get("class", "")
            if "qualifier-content" in classes:
                qualifier = clean_node(wxr, None, span)
                if qualifier:
                    entry.raw_tags.append(qualifier)
            elif "IPA" in classes:
                entry.ipa = clean_node(wxr, None, span)
            elif "Latn" in classes:
                entry.roman = clean_node(wxr, None, span)
            elif "Jpan" in classes:
                entry.form = clean_node(wxr, None, span)

        for link in item.find_child(NodeKind.LINK):
            accent = JA_PRON_ACCENTS.get(clean_node(wxr, None, link))
            if accent is not None:
                entry.tags.append(accent)

        # An empty dump means no field was filled in from this list item.
        if entry.model_dump(exclude_defaults=True):
            sounds.append(entry)

    for key in ("a", "audio"):
        file_name = clean_node(
            wxr, None, template_node.template_parameters.get(key, "")
        )
        if file_name:
            audio = Sound()
            set_sound_file_url_fields(wxr, file_name, audio)
            sounds.append(audio)

143 

144 

# Values accepted for the first argument of the `ja-accent-common` template,
# mapped to the tag that `process_ja_accent_common_template` appends to the
# sound.
JA_ACCENT_COMMON_TYPES = {
    "h": "Heiban",
    "a": "Atamadaka",
    "n": "Nakadaka",
    "o": "Odaka",
}

151 

152 

def process_ja_accent_common_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract one sound from a ``ja-accent-common`` template.

    The template is expanded and re-parsed. The first link with non-empty
    text becomes a raw tag, and the first ``span`` tag with non-empty text
    becomes the form. If the first template argument is a key of
    ``JA_ACCENT_COMMON_TYPES`` the mapped tag is added. The sound is
    appended to ``sounds`` only when its form is not empty.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-accent-common
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    entry = Sound()

    # Both generators are lazy, so nodes after the first hit are not cleaned.
    link_texts = (
        clean_node(wxr, None, link)
        for link in expanded.find_child_recursively(NodeKind.LINK)
    )
    first_link_text = next((text for text in link_texts if text != ""), "")
    if first_link_text != "":
        entry.raw_tags.append(first_link_text)

    span_texts = (
        clean_node(wxr, None, span)
        for span in expanded.find_html_recursively("span")
    )
    first_span_text = next((text for text in span_texts if text), "")
    if first_span_text:
        entry.form = first_span_text

    accent_key = clean_node(
        wxr, None, template_node.template_parameters.get(1, "")
    )
    if accent_key in JA_ACCENT_COMMON_TYPES:
        entry.tags.append(JA_ACCENT_COMMON_TYPES[accent_key])

    if entry.form != "":
        sounds.append(entry)

180 

181 

def extract_zh_sounds(
    wxr: WiktextractContext, level_node: LevelNode, sounds: list[Sound]
) -> None:
    """Build one ``Sound`` per list item found anywhere below ``level_node``.

    The children of each list item are split at the first ``:`` that occurs
    in a plain-string child. Everything before it is cleaned into a single
    raw tag and everything after it into ``zh_pron``. The sound is then
    passed through ``translate_raw_tags`` and appended to ``sounds``.

    NOTE(review): a list item without any ``:`` still produces a sound; all
    of its children land in the raw tag and the value part is built from an
    empty node list.
    """
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        label_nodes = []
        pron_nodes = []
        colon_seen = False
        for child in item.children:
            if colon_seen:
                pron_nodes.append(child)
            elif isinstance(child, str) and ":" in child:
                # Only the first colon separates label from value.
                before, _, after = child.partition(":")
                label_nodes.append(before)
                pron_nodes.append(after)
                colon_seen = True
            else:
                label_nodes.append(child)

        pron_text = clean_node(wxr, None, pron_nodes)
        label_text = clean_node(wxr, None, label_nodes)
        entry = Sound(zh_pron=pron_text, raw_tags=[label_text])
        translate_raw_tags(entry)
        sounds.append(entry)

204 

205 

def extract_homophone_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract homophones from a section and attach them to entries as sounds.

    Only list items of lists that are direct children of ``level_node`` are
    inspected. Each link child with non-empty text, and each ``l`` template
    child whose extracted word is non-empty, yields one ``Sound`` holding a
    single homophone; ``l`` templates also carry over their sense, tags and
    raw tags.

    The sounds are added to ``base_data`` and every entry of ``page_data``
    with the same ``lang_code`` when ``level_node`` is a level-3 node; to
    the last entry of ``page_data`` when the node is of another level and
    ``page_data`` is not empty; and to ``base_data`` alone otherwise.
    """
    found: list[Sound] = []
    list_items = (
        item
        for list_node in level_node.find_child(NodeKind.LIST)
        for item in list_node.find_child(NodeKind.LIST_ITEM)
    )
    for item in list_items:
        for child in item.children:
            if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
                word = clean_node(wxr, None, child)
                if word != "":
                    found.append(Sound(homophones=[word]))
            elif (
                isinstance(child, TemplateNode) and child.template_name == "l"
            ):
                # Imported only when an `l` template is actually seen.
                from .linkage import extract_l_template

                linked = extract_l_template(wxr, child)
                if linked.word != "":
                    found.append(
                        Sound(
                            homophones=[linked.word],
                            sense=linked.sense,
                            tags=linked.tags,
                            raw_tags=linked.raw_tags,
                        )
                    )

    # Work out which entries receive the sounds, then fill them in one pass.
    if level_node.kind == NodeKind.LEVEL3:
        targets = [base_data]
        targets.extend(
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        )
    elif len(page_data) > 0:
        targets = [page_data[-1]]
    else:
        targets = [base_data]

    for entry in targets:
        entry.sounds.extend(found)