Coverage for src / wiktextract / extractor / ja / sound.py: 82%

239 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import itertools 

2import re 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..share import capture_text_in_parentheses, set_sound_file_url_fields 

15from .models import Hyphenation, Sound, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect pronunciation data from a section and attach it to entries.

    Templates that are direct children of ``level_node``, and templates
    inside its list items, are handed to ``process_sound_template``.  When
    ``base_data.lang_code`` is ``"zh"`` each list item is passed to
    ``extract_zh_sound_list_item`` instead.

    The gathered sounds and categories are then stored on:

    * ``base_data`` plus every entry of ``page_data`` with the same
      language code, when the section is a level-3 heading;
    * otherwise the last entry of ``page_data`` if there is one;
    * otherwise ``base_data``.
    """
    found_sounds = []
    found_cats = {}
    for child in level_node.children:
        # TemplateNode is tested first, before the generic WikiNode check.
        if isinstance(child, TemplateNode):
            process_sound_template(
                wxr, base_data, child, found_sounds, found_cats
            )
            continue
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            if base_data.lang_code == "zh":
                extract_zh_sound_list_item(wxr, item, found_sounds, [])
                continue
            for template in item.find_child(NodeKind.TEMPLATE):
                process_sound_template(
                    wxr, base_data, template, found_sounds, found_cats
                )

    # Decide which entries receive the collected data.
    if level_node.kind == NodeKind.LEVEL3:
        targets = [base_data]
        targets.extend(
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        )
    elif page_data:
        targets = [page_data[-1]]
    else:
        targets = [base_data]

    for entry in targets:
        entry.sounds.extend(found_sounds)
        entry.categories.extend(found_cats.get("categories", []))

53 

54 

def process_sound_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    sounds: list[Sound],
    cats: dict[str, list[str]],
) -> None:
    """Dispatch one pronunciation template to its extractor.

    The extractor is chosen by template name; names that match nothing are
    not passed to any extractor.  In every case the template is finally run
    through ``clean_node`` with ``cats`` as the data argument.
    """
    name = t_node.template_name
    if name in ("音声", "audio"):
        extract_audio_template(wxr, t_node, sounds)
    elif name in ("IPA", "X-SAMPA") or name.endswith("-IPA"):
        extract_ipa_template(wxr, t_node, sounds)
    elif name == "homophones":
        extract_homophones_template(wxr, t_node, sounds)
    elif name == "ja-pron":
        process_ja_pron_template(wxr, t_node, sounds)
    elif name == "ja-accent-common":
        process_ja_accent_common_template(wxr, t_node, sounds)
    elif name in (
        "cmn-pron",
        "yue-pron",
        "nan-pron",
        "cdo-pron",
        "hak-pron",
        "wuu-pron",
    ):
        extract_zh_sound_template(wxr, t_node, sounds)
    elif name in ("rhymes", "rhyme"):
        extract_rhymes_template(wxr, t_node, sounds)
    elif name in ("hyphenation", "hyph"):
        # Hyphenation data is written to ``base_data``, not to ``sounds``.
        extract_hyphenation_template(wxr, base_data, t_node)

    clean_node(wxr, cats, t_node)

90 

91 

def extract_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Append a ``Sound`` for the audio file named by template argument 2.

    Nothing is appended when the cleaned file name is empty or ``"-"``.
    A non-empty template argument 3 is stored as a raw tag on the sound.
    """
    params = t_node.template_parameters
    file_name = clean_node(wxr, None, params.get(2, ""))
    if file_name in ("", "-"):
        return

    sound = Sound()
    label = clean_node(wxr, None, params.get(3, ""))
    if label != "":
        sound.raw_tags.append(label)
    set_sound_file_url_fields(wxr, file_name, sound)
    sounds.append(sound)

103 

104 

def extract_ipa_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Expand an IPA-style template and append the sounds it contains.

    List children of the expanded template are handled one list item at a
    time.  All remaining top-level nodes are gathered under a temporary
    root node and processed together afterwards.
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    loose_nodes = []
    for child in root.children:
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            loose_nodes.append(child)
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            sounds.extend(extract_ipa_list_item(wxr, item))

    if loose_nodes:
        # Wrap the non-list nodes so they can go through the same extractor.
        wrapper = WikiNode(NodeKind.ROOT, 0)
        wrapper.children = loose_nodes
        sounds.extend(extract_ipa_list_item(wxr, wrapper))

122 

123 

def extract_ipa_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
    """Return the sounds found in the ``<span>`` tags below ``list_item``.

    Spans with class ``qualifier-content`` or ``ib-content`` contribute
    comma-separated raw tags.  Spans with class ``IPA`` or ``SAMPA`` yield
    one ``Sound`` each, built with the raw tags gathered so far; the tag
    list is not reset between spans.  SAMPA text is wrapped in slashes and
    tagged ``X-SAMPA``.
    """
    qualifiers = []
    result = []
    for span in list_item.find_html_recursively("span"):
        classes = span.attrs.get("class", "").split()
        if "qualifier-content" in classes or "ib-content" in classes:
            text = clean_node(wxr, None, span)
            qualifiers.extend(
                part for part in map(str.strip, text.split(",")) if part != ""
            )
            continue
        if "IPA" not in classes and "SAMPA" not in classes:
            continue

        sound = Sound(ipa=clean_node(wxr, None, span), raw_tags=qualifiers)
        if sound.ipa == "":
            continue
        if "SAMPA" in classes:
            sound.ipa = f"/{sound.ipa}/"
            sound.tags.append("X-SAMPA")
        translate_raw_tags(sound)
        result.append(sound)
    return result

147 

148 

def extract_homophones_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Append one ``Sound`` listing the homophones given as arguments.

    Positional arguments are read from 1 upwards until the first missing
    index.  Arguments that clean to an empty string are skipped, and
    nothing is appended when no homophone remains.
    """
    params = t_node.template_parameters
    words = []
    position = 1
    while position in params:
        word = clean_node(wxr, None, params[position])
        if word != "":
            words.append(word)
        position += 1

    if words:
        sounds.append(Sound(homophones=words))

161 

162 

# Link text looked up by ``process_ja_pron_template``; the mapped value is
# appended to the sound's ``tags``.
JA_PRON_ACCENTS: dict[str, str] = {
    "中高型": "Nakadaka",
    "平板型": "Heiban",
    "頭高型": "Atamadaka",
    "尾高型": "Odaka",
}

169 

170 

def process_ja_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract sounds from an expanded ``ja-pron`` template.

    Every list item of the expansion that does not contain a table yields
    at most one ``Sound``, filled from its ``<span>`` classes
    (``qualifier-content``, ``IPA``, ``Latn``, ``Jpan``) and from links
    whose text is a key of ``JA_PRON_ACCENTS``.  The sound is kept only if
    it has an IPA value or an ``other`` value.  Afterwards one audio
    ``Sound`` is appended for each non-empty ``a`` / ``audio`` argument.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-pron
    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for item in root.find_child_recursively(NodeKind.LIST_ITEM):
        if item.contain_node(NodeKind.TABLE):
            continue

        sound = Sound()
        for span in item.find_html_recursively("span"):
            classes = span.attrs.get("class", "").split()
            if "qualifier-content" in classes:
                qualifier = clean_node(wxr, None, span)
                if qualifier != "":
                    sound.raw_tags.append(qualifier)
            elif "IPA" in classes:
                sound.ipa = clean_node(wxr, None, span)
            elif "Latn" in classes:
                sound.roman = clean_node(wxr, None, span)
            elif "Jpan" in classes:
                sound.other = clean_node(wxr, None, span)

        for link in item.find_child(NodeKind.LINK):
            accent = JA_PRON_ACCENTS.get(clean_node(wxr, None, link))
            if accent is not None:
                sound.tags.append(accent)

        if sound.ipa == "" and sound.other == "":
            continue
        translate_raw_tags(sound)
        sounds.append(sound)

    params = template_node.template_parameters
    for key in ("a", "audio"):
        file_name = clean_node(wxr, None, params.get(key, ""))
        if file_name == "":
            continue
        audio = Sound()
        set_sound_file_url_fields(wxr, file_name, audio)
        sounds.append(audio)

213 

214 

# Value of template argument 1 looked up by
# ``process_ja_accent_common_template``; the mapped value is appended to the
# sound's ``tags``.
JA_ACCENT_COMMON_TYPES: dict[str, str] = {
    "h": "Heiban",
    "a": "Atamadaka",
    "n": "Nakadaka",
    "o": "Odaka",
}

221 

222 

def process_ja_accent_common_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract one sound from an expanded ``ja-accent-common`` template.

    The first non-empty link text becomes a raw tag and the first non-empty
    ``<span>`` text becomes the sound's ``other`` value.  Template argument
    1 is mapped through ``JA_ACCENT_COMMON_TYPES`` to a tag.  The sound is
    appended only when ``other`` is not empty.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-accent-common
    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    sound = Sound()

    # Both generators are lazy: nodes after the first non-empty hit are
    # never cleaned.
    link_texts = (
        clean_node(wxr, None, link)
        for link in root.find_child_recursively(NodeKind.LINK)
    )
    first_link_text = next((text for text in link_texts if text != ""), "")
    if first_link_text != "":
        sound.raw_tags.append(first_link_text)

    span_texts = (
        clean_node(wxr, None, span)
        for span in root.find_html_recursively("span")
    )
    first_span_text = next((text for text in span_texts if text != ""), "")
    if first_span_text != "":
        sound.other = first_span_text

    accent_key = clean_node(
        wxr, None, template_node.template_parameters.get(1, "")
    )
    accent_tag = JA_ACCENT_COMMON_TYPES.get(accent_key)
    if accent_tag is not None:
        sound.tags.append(accent_tag)

    if sound.other != "":
        translate_raw_tags(sound)
        sounds.append(sound)

251 

252 

def extract_homophone_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect homophones listed in a section and attach them to entries.

    Within each list item, every link node and every ``l`` template that
    yields a non-empty word becomes a ``Sound`` with a single homophone.
    The sounds are stored on ``base_data`` plus all same-language entries
    of ``page_data`` for a level-3 section, otherwise on the last entry of
    ``page_data``, or on ``base_data`` when ``page_data`` is empty.
    """
    found = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            for child in item.children:
                if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
                    word = clean_node(wxr, None, child)
                    if word != "":
                        found.append(Sound(homophones=[word]))
                    continue
                if not (
                    isinstance(child, TemplateNode)
                    and child.template_name == "l"
                ):
                    continue

                # NOTE(review): imported locally, presumably to avoid a
                # circular import with ``.linkage`` - confirm.
                from .linkage import extract_l_template

                l_data = extract_l_template(wxr, child)
                if l_data.word == "":
                    continue
                found.append(
                    Sound(
                        homophones=[l_data.word],
                        sense=l_data.sense,
                        tags=l_data.tags,
                        raw_tags=l_data.raw_tags,
                    )
                )

    # Decide which entries receive the collected homophones.
    if level_node.kind == NodeKind.LEVEL3:
        targets = [base_data]
        targets.extend(
            entry
            for entry in page_data
            if entry.lang_code == base_data.lang_code
        )
    elif page_data:
        targets = [page_data[-1]]
    else:
        targets = [base_data]

    for entry in targets:
        entry.sounds.extend(found)

292 

293 

def extract_zh_sound_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Extract sounds from an expanded Chinese pronunciation template.

    For each top-level list item, the nodes that precede its first nested
    list are cleaned and split on ``:`` and ``,`` to form shared raw tags.
    Every item of each nested list is then passed to
    ``extract_zh_sound_list_item`` together with those tags.
    """
    # https://ja.wiktionary.org/wiki/カテゴリ:中国語_発音テンプレート
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for list_node in root.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            shared_tags = []
            label_nodes = []
            for child in item.children:
                if not (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.LIST
                ):
                    label_nodes.append(child)
                    continue

                # Tags are (re)computed for as long as none have been found.
                if not shared_tags:
                    label_text = clean_node(wxr, None, label_nodes)
                    shared_tags.extend(
                        part
                        for part in map(
                            str.strip, re.split(r":|,", label_text)
                        )
                        if part != ""
                    )
                for sub_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_zh_sound_list_item(
                        wxr, sub_item, sounds, shared_tags
                    )

320 

321 

def extract_zh_sound_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sounds: list[Sound],
    raw_tags: list[str],
):
    """Extract Chinese pronunciations from one ``label: value, ...`` item.

    The children of ``list_item`` are split at the first ``:`` found in a
    plain-text child.  What precedes it is the label and what follows is a
    comma-separated list of pronunciations.  Before the colon, ``音声`` /
    ``audio`` templates are sent to ``extract_audio_template`` and
    ``<small>`` tags are ignored.

    One ``Sound`` is appended to ``sounds`` per non-empty pronunciation.
    Its raw tags are ``raw_tags`` followed by the label text outside
    parentheses and then each comma-separated piece found inside
    parentheses.
    """
    seen_colon = False
    tag_nodes = []
    value_nodes = []
    for node in list_item.children:
        if seen_colon:
            value_nodes.append(node)
        elif isinstance(node, str) and ":" in node:
            # Only the first colon separates the label from the values.
            label, _, rest = node.partition(":")
            tag_nodes.append(label)
            value_nodes.append(rest)
            seen_colon = True
        elif isinstance(node, TemplateNode) and node.template_name in (
            "音声",
            "audio",
        ):
            extract_audio_template(wxr, node, sounds)
        elif not (isinstance(node, HTMLNode) and node.tag == "small"):
            tag_nodes.append(node)

    values = [
        value
        for value in map(
            str.strip, clean_node(wxr, None, value_nodes).split(",")
        )
        if value != ""
    ]
    if not values:
        return

    # The label is the same for every value, so it is cleaned and parsed
    # once here rather than once per value.
    texts_in_p, text_out_p = capture_text_in_parentheses(
        clean_node(wxr, None, tag_nodes)
    )
    label_tags = []
    text_out_p = text_out_p.strip()
    if text_out_p != "":
        label_tags.append(text_out_p)
    for group in texts_in_p:
        label_tags.extend(
            tag for tag in map(str.strip, group.split(",")) if tag != ""
        )

    for value in values:
        # A fresh list per sound: no tag list is shared between sounds or
        # with the caller's ``raw_tags``.
        sound = Sound(zh_pron=value, raw_tags=[*raw_tags, *label_tags])
        translate_raw_tags(sound)
        sounds.append(sound)

364 

365 

def extract_rhymes_template(
    wxr: WiktextractContext, t_node: TemplateNode, sounds: list[Sound]
):
    """Append one ``Sound`` per non-empty IPA span of a rhymes template.

    The template is expanded, and every ``<span>`` with ``class="IPA"``
    returned by ``find_html`` on the expansion is cleaned and stored as
    the ``rhymes`` value of a new sound.
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    ipa_spans = root.find_html("span", attr_name="class", attr_value="IPA")
    for span in ipa_spans:
        text = clean_node(wxr, None, span)
        if text == "":
            continue
        sounds.append(Sound(rhymes=text))

378 

379 

def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Store the hyphenations described by ``t_node`` on ``base_data``.

    For the ``hyph`` template, the template is expanded and the text of
    every ``<span>`` whose ``lang`` attribute equals template argument 1 is
    used.  For any other template name, the cleaned template text is used
    after removing a leading ``分綴:``.  Each text is split on ``‧``; empty
    parts are dropped, and a ``Hyphenation`` is appended to
    ``base_data.hyphenations`` only when at least one part remains.
    """
    if t_node.template_name == "hyph":
        # Only this branch reads the expanded tree, so the expansion is
        # done here instead of unconditionally.
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        lang_code = clean_node(
            wxr, None, t_node.template_parameters.get(1, "")
        )
        h_strs = [
            clean_node(wxr, None, span_tag)
            for span_tag in expanded_node.find_html(
                "span", attr_name="lang", attr_value=lang_code
            )
        ]
    else:
        h_strs = [
            clean_node(wxr, base_data, t_node).removeprefix("分綴:").strip()
        ]

    for h_str in h_strs:
        parts = [part for part in map(str.strip, h_str.split("‧")) if part]
        if parts:
            base_data.hyphenations.append(Hyphenation(parts=parts))