Coverage for src/wiktextract/extractor/ja/sound.py: 90%

194 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-24 07:36 +0000

1import itertools 

2import re 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..share import capture_text_in_parentheses, set_sound_file_url_fields 

15from .models import Sound, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect pronunciation data under ``level_node`` and attach it.

    Templates directly under the section, and templates inside its list
    items, are handed to ``process_sound_template``. When
    ``base_data.lang_code`` is ``"zh"`` each list item is passed to
    ``extract_zh_sound_list_item`` instead.

    A level-3 section is applied to ``base_data`` and to every entry of
    ``page_data`` that has the same language code. A section at any other
    level is applied to the last entry of ``page_data``, or to
    ``base_data`` when ``page_data`` is empty.
    """
    sounds = []
    cats = {}
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            process_sound_template(wxr, child, sounds, cats)
            continue
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            continue
        for list_item in child.find_child(NodeKind.LIST_ITEM):
            if base_data.lang_code == "zh":
                extract_zh_sound_list_item(wxr, list_item, sounds, [])
                continue
            for t_node in list_item.find_child(NodeKind.TEMPLATE):
                process_sound_template(wxr, t_node, sounds, cats)

    # Decide which entries receive the collected data, then apply it once
    # per entry in the same order as before (base entry first).
    if level_node.kind == NodeKind.LEVEL3:
        targets = [base_data]
        targets.extend(
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        )
    elif page_data:
        targets = [page_data[-1]]
    else:
        targets = [base_data]

    for entry in targets:
        entry.sounds.extend(sounds)
        entry.categories.extend(cats.get("categories", []))

51 

52 

def process_sound_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sounds: list[Sound],
    cats: dict[str, list[str]],
) -> None:
    """Route a pronunciation template to the extractor for its name.

    Templates with an unrecognised name are not extracted. In every case
    ``clean_node`` is finally called with ``cats`` as its data argument;
    callers in this module later read ``cats["categories"]``.
    """
    handlers = {
        "音声": extract_audio_template,
        "audio": extract_audio_template,
        "IPA": extract_ipa_template,
        "X-SAMPA": extract_ipa_template,
        "homophones": extract_homophones_template,
        "ja-pron": process_ja_pron_template,
        "ja-accent-common": process_ja_accent_common_template,
        "cmn-pron": extract_zh_sound_template,
        "yue-pron": extract_zh_sound_template,
        "nan-pron": extract_zh_sound_template,
        "cdo-pron": extract_zh_sound_template,
        "hak-pron": extract_zh_sound_template,
        "wuu-pron": extract_zh_sound_template,
    }
    handler = handlers.get(t_node.template_name)
    if handler is not None:
        handler(wxr, t_node, sounds)

    clean_node(wxr, cats, t_node)

80 

81 

def extract_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Append a ``Sound`` for the audio file named by template argument 2.

    Nothing is appended when that argument cleans to ``""`` or ``"-"``.
    A non-empty argument 3 is stored as a raw tag on the new sound.
    """
    params = t_node.template_parameters
    file_name = clean_node(wxr, None, params.get(2, ""))
    if file_name in ("", "-"):
        return

    entry = Sound()
    qualifier = clean_node(wxr, None, params.get(3, ""))
    if qualifier:
        entry.raw_tags.append(qualifier)
    set_sound_file_url_fields(wxr, file_name, entry)
    sounds.append(entry)

93 

94 

def extract_ipa_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Append one ``Sound`` per non-empty positional template argument.

    Arguments are read from position 1 upward until a position is missing.
    Each value is wrapped in slashes and stored as ``ipa``; sounds from the
    ``X-SAMPA`` template also get the ``"X-SAMPA"`` tag.
    """
    params = t_node.template_parameters
    position = 1
    while position in params:
        text = clean_node(wxr, None, params[position])
        if text:
            entry = Sound(ipa=f"/{text}/")
            if t_node.template_name == "X-SAMPA":
                entry.tags.append("X-SAMPA")
            sounds.append(entry)
        position += 1

107 

108 

def extract_homophones_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Append a single ``Sound`` listing the template's homophones.

    Positional arguments are read from position 1 upward until a position
    is missing; arguments that clean to an empty string are skipped.
    Nothing is appended when no homophone remains.
    """
    params = t_node.template_parameters
    # Lazily walk positions 1, 2, 3, ... and stop at the first gap.
    positions = itertools.takewhile(
        lambda position: position in params, itertools.count(1)
    )
    cleaned = (
        clean_node(wxr, None, params[position]) for position in positions
    )
    words = [word for word in cleaned if word]
    if words:
        sounds.append(Sound(homophones=words))

121 

122 

# Link texts looked up by process_ja_pron_template, mapped to the tag that
# is appended to the sound when a list item contains such a link.
JA_PRON_ACCENTS: dict[str, str] = {
    "中高型": "Nakadaka",
    "平板型": "Heiban",
    "頭高型": "Atamadaka",
    "尾高型": "Odaka",
}

129 

130 

def process_ja_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract sounds from an expanded ``ja-pron`` template.

    Each list item of the expansion that does not contain a table yields at
    most one ``Sound``, filled from its ``<span>`` elements (chosen by CSS
    class) and from links whose text is a key of ``JA_PRON_ACCENTS``. The
    ``a`` and ``audio`` template arguments each add a separate audio sound.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-pron
    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for item in root.find_child_recursively(NodeKind.LIST_ITEM):
        if item.contain_node(NodeKind.TABLE):
            continue

        entry = Sound()
        for span in item.find_html_recursively("span"):
            css_classes = span.attrs.get("class", "")
            if "qualifier-content" in css_classes:
                qualifier = clean_node(wxr, None, span)
                if qualifier:
                    entry.raw_tags.append(qualifier)
            elif "IPA" in css_classes:
                entry.ipa = clean_node(wxr, None, span)
            elif "Latn" in css_classes:
                entry.roman = clean_node(wxr, None, span)
            elif "Jpan" in css_classes:
                entry.form = clean_node(wxr, None, span)

        for link in item.find_child(NodeKind.LINK):
            accent_tag = JA_PRON_ACCENTS.get(clean_node(wxr, None, link))
            if accent_tag is not None:
                entry.tags.append(accent_tag)

        # Keep the sound only if at least one field differs from its default.
        if entry.model_dump(exclude_defaults=True):
            sounds.append(entry)

    for key in ("a", "audio"):
        file_name = clean_node(
            wxr, None, template_node.template_parameters.get(key, "")
        )
        if not file_name:
            continue
        audio_entry = Sound()
        set_sound_file_url_fields(wxr, file_name, audio_entry)
        sounds.append(audio_entry)

172 

173 

# Values of the first positional argument of the ja-accent-common template,
# mapped to the tag that process_ja_accent_common_template appends.
JA_ACCENT_COMMON_TYPES: dict[str, str] = {
    "h": "Heiban",
    "a": "Atamadaka",
    "n": "Nakadaka",
    "o": "Odaka",
}

180 

181 

def process_ja_accent_common_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract one sound from an expanded ``ja-accent-common`` template.

    The first link with non-empty text becomes a raw tag, the first
    ``<span>`` with non-empty text becomes the form, and the first
    positional argument is mapped through ``JA_ACCENT_COMMON_TYPES`` to a
    tag. The sound is appended only when its form is not empty.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-accent-common
    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    entry = Sound()

    # Both searches are lazy: nodes after the first hit are never cleaned.
    link_texts = (
        clean_node(wxr, None, link)
        for link in root.find_child_recursively(NodeKind.LINK)
    )
    first_link_text = next((text for text in link_texts if text != ""), None)
    if first_link_text is not None:
        entry.raw_tags.append(first_link_text)

    span_texts = (
        clean_node(wxr, None, span)
        for span in root.find_html_recursively("span")
    )
    first_span_text = next((text for text in span_texts if text), None)
    if first_span_text is not None:
        entry.form = first_span_text

    accent_key = clean_node(
        wxr, None, template_node.template_parameters.get(1, "")
    )
    if accent_key in JA_ACCENT_COMMON_TYPES:
        entry.tags.append(JA_ACCENT_COMMON_TYPES[accent_key])

    if entry.form != "":
        sounds.append(entry)

209 

210 

def extract_homophone_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect homophones listed under ``level_node`` and attach them.

    Every link and every ``l`` template found directly in a list item
    yields one ``Sound`` holding a single homophone. A level-3 section is
    applied to ``base_data`` and to every entry of ``page_data`` with the
    same language code; any other level is applied to the last entry of
    ``page_data``, or to ``base_data`` when ``page_data`` is empty.
    """
    sounds = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for child in list_item.children:
                if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
                    word = clean_node(wxr, None, child)
                    if word != "":
                        sounds.append(Sound(homophones=[word]))
                    continue
                if not isinstance(child, TemplateNode):
                    continue
                if child.template_name != "l":
                    continue

                # Imported only when an "l" template is actually met.
                from .linkage import extract_l_template

                linked = extract_l_template(wxr, child)
                if linked.word == "":
                    continue
                sounds.append(
                    Sound(
                        homophones=[linked.word],
                        sense=linked.sense,
                        tags=linked.tags,
                        raw_tags=linked.raw_tags,
                    )
                )

    if level_node.kind == NodeKind.LEVEL3:
        targets = [base_data]
        targets.extend(
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        )
    elif page_data:
        targets = [page_data[-1]]
    else:
        targets = [base_data]

    for entry in targets:
        entry.sounds.extend(sounds)

250 

251 

def extract_zh_sound_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Extract sounds from an expanded Chinese pronunciation template.

    For each top-level list item, the nodes that precede its first nested
    list are cleaned and split on ``:`` and ``,`` into raw tags. Those
    tags are passed, together with every item of each nested list, to
    ``extract_zh_sound_list_item``.
    """
    # https://ja.wiktionary.org/wiki/カテゴリ:中国語_発音テンプレート
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for top_list in root.find_child(NodeKind.LIST):
        for item in top_list.find_child(NodeKind.LIST_ITEM):
            shared_tags = []
            pending_nodes = []
            for child in item.children:
                is_nested_list = (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.LIST
                )
                if not is_nested_list:
                    pending_nodes.append(child)
                    continue

                # Parse the label only while no tag has been found yet.
                if not shared_tags:
                    label = clean_node(wxr, None, pending_nodes)
                    pieces = (
                        piece.strip() for piece in re.split(r":|,", label)
                    )
                    shared_tags.extend(
                        piece for piece in pieces if piece != ""
                    )
                for sub_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_zh_sound_list_item(
                        wxr, sub_item, sounds, shared_tags
                    )

278 

279 

def extract_zh_sound_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sounds: list[Sound],
    raw_tags: list[str],
):
    """Extract Chinese pronunciations from one ``label: values`` list item.

    The item is split at the first ``:`` found in a plain-text child.
    Nodes before it form the label and nodes after it form the values.
    Before the colon, audio templates (``音声``/``audio``) are extracted
    directly and ``<small>`` elements are dropped from the label.

    Each non-empty comma-separated value yields one ``Sound`` whose
    ``zh_pron`` is the value. Its raw tags are ``raw_tags`` followed by the
    label text outside parentheses and then the comma-separated texts found
    inside parentheses. ``translate_raw_tags`` is applied to every sound
    before it is appended to ``sounds``.
    """
    seen_colon = False
    tag_nodes = []
    value_nodes = []
    for node in list_item.children:
        if isinstance(node, str) and ":" in node and not seen_colon:
            # Split at the first colon only; later colons stay in the value.
            label_text, _, value_text = node.partition(":")
            tag_nodes.append(label_text)
            value_nodes.append(value_text)
            seen_colon = True
        elif not seen_colon:
            if isinstance(node, TemplateNode) and node.template_name in [
                "音声",
                "audio",
            ]:
                extract_audio_template(wxr, node, sounds)
            elif not (isinstance(node, HTMLNode) and node.tag == "small"):
                tag_nodes.append(node)
        else:
            value_nodes.append(node)

    stripped_values = (
        value.strip()
        for value in clean_node(wxr, None, value_nodes).split(",")
    )
    values = [value for value in stripped_values if value != ""]
    if not values:
        # As before, the label is not cleaned when there is no value.
        return

    # The label does not depend on the value, so derive its tags once
    # instead of once per value.
    texts_in_p, text_out_p = capture_text_in_parentheses(
        clean_node(wxr, None, tag_nodes)
    )
    item_raw_tags = []
    text_out_p = text_out_p.strip()
    if text_out_p != "":
        item_raw_tags.append(text_out_p)
    for raw_tag_str in texts_in_p:
        for raw_tag in raw_tag_str.split(","):
            raw_tag = raw_tag.strip()
            if raw_tag != "":
                item_raw_tags.append(raw_tag)

    for value in values:
        sound = Sound(zh_pron=value, raw_tags=raw_tags)
        sound.raw_tags.extend(item_raw_tags)
        translate_raw_tags(sound)
        sounds.append(sound)