Coverage for src/wiktextract/extractor/zh/pronunciation.py: 82%

235 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import itertools 

2import re 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from ..share import set_sound_file_url_fields 

14from .models import Sound, WordEntry 

15from .tags import translate_raw_tags 

16 

17 

def extract_pronunciation_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: WikiNode
) -> None:
    """Collect sounds and categories from a pronunciation section node."""

    def _merge(result: tuple[list[Sound], list[str]]) -> None:
        # Fold a (sounds, categories) pair into the entry being built.
        sound_list, cat_list = result
        base_data.sounds.extend(sound_list)
        base_data.categories.extend(cat_list)

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name == "zh-forms":
            # Imported lazily to avoid a circular import with page.py.
            from .page import process_zh_forms

            process_zh_forms(wxr, base_data, template)
        else:
            _merge(process_pron_template(wxr, template))

    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        _merge(process_pron_item_list_item(wxr, item))

34 

35 

def process_pron_item_list_item(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Process every template inside one pronunciation list item.

    Returns the (sounds, categories) gathered from all templates.
    """
    # Shared across the templates of this item: accent templates append
    # labels here in place, later templates then pick them up.
    shared_raw_tags: list[str] = []
    all_sounds: list[Sound] = []
    all_categories: list[str] = []
    for t_node in list_item_node.find_child(NodeKind.TEMPLATE):
        found_sounds, found_cats = process_pron_template(
            wxr, t_node, shared_raw_tags
        )
        all_sounds += found_sounds
        all_categories += found_cats
    return all_sounds, all_categories

49 

50 

def process_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str] | None = None,
) -> tuple[list[Sound], list[str]]:
    """Dispatch one pronunciation template to its specific handler.

    Args:
        wxr: extraction context.
        template_node: the template to process.
        raw_tags: accumulator shared with the caller; {{a}}/{{accent}}
            templates append their labels to it in place so that the
            sound templates that follow in the same list item see them.

    Returns:
        A (sounds, category names) tuple extracted from the template.
    """
    # Bug fix: the previous signature used a mutable default
    # (raw_tags=[]), which is shared across calls — accent labels from
    # one call leaked into later calls that omitted the argument (e.g.
    # the top-level templates in extract_pronunciation_section).
    if raw_tags is None:
        raw_tags = []
    template_name = template_node.template_name.lower()
    sounds = []
    categories = []
    if template_name == "zh-pron":
        new_sounds, new_cats = process_zh_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    elif template_name in ["homophones", "homophone", "hmp"]:
        sounds.extend(process_homophones_template(wxr, template_node))
    elif template_name in ["a", "accent"]:
        # https://zh.wiktionary.org/wiki/Template:Accent
        # Mutates the caller-supplied list on purpose (see docstring).
        raw_tags.append(clean_node(wxr, None, template_node).strip("()"))
    elif template_name in ["audio", "音"]:
        sounds.extend(process_audio_template(wxr, template_node, raw_tags))
    elif template_name == "ipa":
        sounds.extend(process_ipa_template(wxr, template_node, raw_tags))
    elif template_name == "enpr":
        sounds.extend(process_enpr_template(wxr, template_node, raw_tags))
    elif template_name == "ja-pron":
        new_sounds, new_cats = extract_ja_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    return sounds, categories

79 

80 

def process_zh_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract sounds and categories from an expanded {{zh-pron}}.

    https://zh.wiktionary.org/wiki/Template:Zh-pron
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    visited_lists: set[WikiNode] = set()
    results: list[Sound] = []
    cat_data: dict = {}
    for list_node in expanded.find_child_recursively(NodeKind.LIST):
        if list_node in visited_lists:
            # Already consumed as a nested child during recursion below.
            continue
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            results.extend(
                process_zh_pron_list_item(wxr, item, [], visited_lists)
            )
    # Harvest the category links produced by the template expansion.
    clean_node(wxr, cat_data, expanded)
    for snd in results:
        translate_raw_tags(snd)
    return results, cat_data.get("categories", [])

101 

102 

def process_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively process one list item of expanded {{zh-pron}} output.

    Args:
        raw_tags: labels inherited from ancestor list items; this level
            works on its own copy so sibling branches are unaffected.
        seen_lists: set shared across the whole walk; child lists are
            added here so the caller's top-level list scan skips them.

    Returns:
        The Sound objects found in this item and its nested sublists.
    """
    # Copy: tags added at this level must not leak back to the parent.
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith("File:"):
                    # Audio file link -> standalone Sound with file URLs.
                    filename = link_str.removeprefix("File:")
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # Other link text is treated as a dialect/system label.
                    current_tags.append(node_str.strip("()"))
            # NOTE: HTMLNode is checked before NodeKind.LIST on purpose —
            # HTMLNode is a WikiNode subclass, so order matters here.
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # remove "幫助"(help) <sup> tag
                    if is_first_small_tag:
                        # First <small>: labels that apply to the sounds
                        # that follow in this item.
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # Later <small> tags annotate the preceding sound.
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "同音詞"
                ):
                    # Table under a "同音詞" (homophones) label.
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )

            elif node.kind == NodeKind.LIST:
                # Descend into the child list; mark it visited so the
                # top-level scan in process_zh_pron_template skips it.
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        process_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds

170 

171 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a {{zh-pron}} label string into individual raw tags.

    Text without parentheses is split on list separators (ASCII and
    fullwidth commas, fullwidth colon, "、", and "和" = "and").  Text
    containing parentheses is cut into parenthesized groups first; each
    group — and any text the regex left uncovered — is processed
    recursively.
    """
    raw_tags = []
    if "(" not in raw_tag_text and "(" not in raw_tag_text:
        for raw_tag in re.split(r",|,|:|、|和", raw_tag_text):
            # "包括" = "including"; drop the prefix, keep the label.
            raw_tag = raw_tag.strip().removeprefix("包括").strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
    else:
        processed_offsets = []
        for match in re.finditer(r"\([^()]+\)|([^()]+)", raw_tag_text):
            processed_offsets.append((match.start(), match.end()))
            # NOTE(review): this slice drops the first and last character
            # of EVERY match, not only the parenthesized ones — fine for
            # separator runs like ", " between groups, but it looks like
            # a bare prefix such as "文讀(…)" would lose text; confirm
            # against real page data before changing.
            raw_tags.extend(
                split_zh_pron_raw_tag(
                    raw_tag_text[match.start() + 1 : match.end() - 1]
                )
            )
        # Stitch together whatever the regex did not cover and recurse on
        # it, prepending those tags so original order is kept.
        not_processed = ""
        last_end = 0
        for start, end in processed_offsets:
            not_processed += raw_tag_text[last_end:start]
            last_end = end
        not_processed += raw_tag_text[last_end:]
        if not_processed != raw_tag_text:
            raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags
        else:
            raw_tags.append(not_processed)
    return raw_tags

199 

200 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Convert one pronunciation <span> into Sound objects.

    A span may hold the pronunciation text itself, a nested romanization
    span (lang "*-Latn"), a trailing <small> label, and an optional
    "[實際讀音: …]" ("actual pronunciation") suffix.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            # Labels that will be attached to the last sound found below.
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            # Romanization, e.g. lang="zh-Latn".
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[實際讀音:":
            # Everything after this marker is the phonetic reading;
            # stop scanning once it is captured.
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # IPA-classed spans fill the "ipa" field, others "zh_pron".
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # The <small> label applies only to the last pronunciation.
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["實際讀音"],
            )
        )
    return sounds

247 

248 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Commas, semicolons, arrows — and slashes, unless the whole string
    begins with "/" (IPA-style delimiters) — act as separators only at
    parenthesis depth zero.  Mid-string fragments are stripped before
    being collected; the final fragment is appended as-is, matching the
    original behavior.
    """
    slash_splits = not zh_pron.startswith("/")
    depth = 0
    parts: list[str] = []
    buffer = ""
    for ch in zh_pron:
        is_sep = ch in [",", ";", "→"] or (ch == "/" and slash_splits)
        if is_sep and depth == 0 and len(buffer.strip()) > 0:
            parts.append(buffer.strip())
            buffer = ""
            continue
        # Not a usable separator here: track depth and keep the char.
        if ch in ["(", "("]:
            depth += 1
        elif ch in [")", ")"]:
            depth -= 1
        buffer += ch

    if buffer.strip() != "":
        parts.append(buffer)
    return parts

274 

275 

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Pull homophone words out of an expanded homophones <table>."""
    results: list[Sound] = []
    for cell in table.find_html_recursively("td"):
        for span in cell.find_html("span"):
            word = clean_node(wxr, None, span)
            if word in ["", "/"]:
                # Empty cell or the traditional/simplified divider.
                continue
            if span.attrs.get("lang", "") == "":
                continue
            script = span.attrs.get("class", "")
            if script not in ["Hant", "Hans", "Hani"]:
                continue
            sound = Sound(homophone=word, raw_tags=raw_tags)
            # Tag the script variant when the class tells us which it is.
            if script == "Hant":
                sound.tags.append("Traditional-Chinese")
            elif script == "Hans":
                sound.tags.append("Simplified-Chinese")
            results.append(sound)
    return results

297 

298 

def process_homophones_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    """Extract homophone words from {{homophones}} and aliases.

    https://zh.wiktionary.org/wiki/Template:homophones
    """
    sounds: list[Sound] = []
    # Positional parameter 1 is the language code; words start at 2 and
    # run until the first missing index.
    index = 2
    while index in template_node.template_parameters:
        homophone = clean_node(
            wxr, None, template_node.template_parameters[index]
        )
        if len(homophone) > 0:
            sounds.append(Sound(homophone=homophone))
        index += 1
    return sounds

313 

314 

def process_audio_template(
    wxr: WiktextractContext, template_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Build a single Sound from {{audio}}.

    https://zh.wiktionary.org/wiki/Template:Audio

    Parameter 2 is the sound file name, parameter 3 an optional label.
    """
    params = template_node.template_parameters
    sound = Sound()
    filename = clean_node(wxr, None, params.get(2, ""))
    set_sound_file_url_fields(wxr, filename, sound)
    label = clean_node(wxr, None, params.get(3, ""))
    if len(label) > 0:
        sound.raw_tags.append(label)
    # Accent labels collected earlier in the list item apply here too.
    sound.raw_tags.extend(raw_tags)
    return [sound]

331 

332 

def process_ipa_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Create one Sound per IPA value in {{IPA}}.

    https://zh.wiktionary.org/wiki/Template:IPA

    Parameter 1 is the language code; IPA strings start at parameter 2
    and run until the first missing index.
    """
    params = template_node.template_parameters
    sounds: list[Sound] = []
    index = 2
    while index in params:
        ipa_text = clean_node(wxr, None, params.get(index))
        sounds.append(Sound(ipa=ipa_text, raw_tags=raw_tags))
        index += 1
    return sounds

351 

352 

def process_enpr_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Create Sound objects from {{enPR}} (English phonetic respelling).

    https://zh.wiktionary.org/wiki/Template:enPR

    The template takes up to three positional pronunciations; stop at
    the first missing index.
    """
    params = template_node.template_parameters
    sounds: list[Sound] = []
    index = 1
    while index <= 3 and index in params:
        enpr_text = clean_node(wxr, None, params.get(index))
        sounds.append(Sound(enpr=enpr_text, raw_tags=raw_tags))
        index += 1
    return sounds

371 

372 

def extract_ja_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract Japanese pronunciation data from an expanded {{ja-pron}}.

    Returns a (sounds, category names) tuple.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    categories: dict = {}
    results: list[Sound] = []
    for li_tag in expanded.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            css_class = span_tag.attrs.get("class", "")
            if "usage-label-accent" in css_class:
                label = clean_node(wxr, None, span_tag).strip("() ")
                if label != "":
                    sound.raw_tags.append(label)
            elif "IPA" in css_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in css_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        # Keep the entry only when an actual pronunciation was found.
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            results.append(sound)

    # Harvest category links from the expansion.
    clean_node(wxr, categories, expanded)
    return results, categories.get("categories", [])