Coverage for src/wiktextract/extractor/zh/pronunciation.py: 59%

317 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1import itertools 

2import re 

3from dataclasses import dataclass 

4 

5from wikitextprocessor import ( 

6 HTMLNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..share import set_sound_file_url_fields 

15from .models import Sound, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_pronunciation_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: WikiNode
) -> None:
    """Collect sounds and categories from a pronunciation section.

    Handles the templates placed directly under the section heading, then
    every list item found anywhere below the heading.  Results are appended
    to ``base_data`` in place.
    """
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name == "zh-forms":
            # imported lazily to avoid a circular import with .page
            from .page import process_zh_forms

            process_zh_forms(wxr, base_data, t_node)
        else:
            sounds, cats = process_pron_template(wxr, t_node)
            base_data.sounds.extend(sounds)
            base_data.categories.extend(cats)
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        sounds, cats = process_pron_item_list_item(wxr, list_item)
        base_data.sounds.extend(sounds)
        base_data.categories.extend(cats)

35 

36 

def process_pron_item_list_item(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Process one pronunciation list item; return (sounds, categories).

    A single ``raw_tags`` list is shared by all templates of the item so
    that an "accent" template can add tags later templates pick up.
    """
    shared_raw_tags: list[str] = []
    all_sounds: list[Sound] = []
    all_categories: list[str] = []
    for t_node in list_item_node.find_child(NodeKind.TEMPLATE):
        sounds, cats = process_pron_template(wxr, t_node, shared_raw_tags)
        all_sounds.extend(sounds)
        all_categories.extend(cats)
    return all_sounds, all_categories

50 

51 

def process_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str] | None = None,
) -> tuple[list[Sound], list[str]]:
    """Dispatch one pronunciation template to its handler.

    Returns a ``(sounds, categories)`` tuple.  ``raw_tags`` is shared
    across the templates of one list item: the "a"/"accent" branch appends
    to it, and later audio/IPA/enPR templates attach it to their sounds.

    Fix: the original signature used a mutable default (``raw_tags=[]``).
    Because the accent branch appends to that list, tags accumulated in
    the shared default object and leaked into every later call that did
    not pass its own list (e.g. the section-level calls in
    extract_pronunciation_section).  A ``None`` sentinel keeps the
    interface backward-compatible while giving each call a fresh list.
    """
    if raw_tags is None:
        raw_tags = []
    template_name = template_node.template_name.lower()
    sounds: list[Sound] = []
    categories: list[str] = []
    if template_name == "zh-pron":
        new_sounds, new_cats = process_zh_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    elif template_name in ["homophones", "homophone", "hmp"]:
        sounds.extend(process_homophones_template(wxr, template_node))
    elif template_name in ["a", "accent"]:
        # https://zh.wiktionary.org/wiki/Template:Accent
        raw_tags.append(clean_node(wxr, None, template_node).strip("()"))
    elif template_name in ["audio", "音"]:
        sounds.extend(process_audio_template(wxr, template_node, raw_tags))
    elif template_name == "ipa":
        sounds.extend(process_ipa_template(wxr, template_node, raw_tags))
    elif template_name == "enpr":
        sounds.extend(process_enpr_template(wxr, template_node, raw_tags))
    elif template_name == "ja-pron":
        new_sounds, new_cats = extract_ja_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    elif template_name == "th-pron":
        new_sounds, new_cats = extract_th_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    return sounds, categories

84 

85 

def process_zh_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand a "zh-pron" template and extract its sounds and categories.

    https://zh.wiktionary.org/wiki/Template:Zh-pron
    Nested lists already handled during recursion are tracked in
    ``processed_lists`` so they are not walked twice.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    processed_lists: set[WikiNode] = set()
    sounds: list[Sound] = []
    cats: dict[str, list[str]] = {}
    for list_node in expanded.find_child_recursively(NodeKind.LIST):
        if list_node in processed_lists:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(
                process_zh_pron_list_item(wxr, list_item, [], processed_lists)
            )
    clean_node(wxr, cats, expanded)
    for sound in sounds:
        translate_raw_tags(sound)
    return sounds, cats.get("categories", [])

106 

107 

def process_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively process one list item of the expanded "zh-pron" template.

    ``raw_tags`` holds tags inherited from parent list items; this function
    works on a copy (``current_tags``) so siblings are not affected.
    Nested lists are added to ``seen_lists`` so the top-level caller skips
    them when it scans all lists recursively.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith("File:"):
                    # audio file link -> a Sound with the current tags
                    filename = link_str.removeprefix("File:")
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # plain link text acts as a raw tag (parens stripped)
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # remove "幫助"(help) <sup> tag
                    if is_first_small_tag:
                        # first <small> labels everything that follows
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip("：")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # later <small> tags qualify the sound just added
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "同音詞"
                ):
                    # a table directly after a "同音詞" (homophones) label
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )

            elif node.kind == NodeKind.LIST:
                # recurse into child lists, passing down accumulated tags
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        process_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds

175 

176 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a "zh-pron" label string into individual raw tags.

    Text without parentheses is split on comma/colon/semicolon variants
    (ASCII and fullwidth) and on "和" ("and") unless it is the final
    character.  Text containing parenthesised groups is split into the
    recursively-processed group contents plus whatever sits outside the
    groups (the outside tags come first in the result).
    """
    raw_tags = []
    if "（" not in raw_tag_text and "(" not in raw_tag_text:
        for raw_tag in re.split(r"，|,|：|、|；|;|和(?!$)", raw_tag_text):
            # "包括" ("including") prefixes a list of dialects; drop it
            raw_tag = raw_tag.strip().removeprefix("包括").strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
    else:
        # one alternative per parenthesis style (ASCII and fullwidth)
        processed_offsets = []
        for match in re.finditer(r"\([^()]+\)|（[^（）]+）", raw_tag_text):
            processed_offsets.append((match.start(), match.end()))
            # recurse on the group contents without the surrounding parens
            raw_tags.extend(
                split_zh_pron_raw_tag(
                    raw_tag_text[match.start() + 1 : match.end() - 1]
                )
            )
        # stitch together the text that was outside every matched group
        not_processed = ""
        last_end = 0
        for start, end in processed_offsets:
            not_processed += raw_tag_text[last_end:start]
            last_end = end
        not_processed += raw_tag_text[last_end:]
        if not_processed != raw_tag_text:
            # tags found outside the groups go before the group tags
            raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags
        else:
            # nothing matched (e.g. an unbalanced paren): keep as one tag
            raw_tags.append(not_processed)
    return raw_tags

204 

205 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract Sound objects from one pronunciation <span> of "zh-pron".

    The span may contain: the pronunciation text itself, a nested
    "*-Latn" span with a romanization, a <small> tag with qualifier tags
    for the last pronunciation, and an optional trailing
    "[實際讀音：…]" ("actual pronunciation") section.
    """
    sounds = []
    small_tags = []  # qualifier tags from a <small> child; added to last sound
    pron_nodes = []  # children that make up the pronunciation text
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[實際讀音：":
            # everything after this marker up to "]" is the phonetic reading
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # spans with class "IPA" carry IPA; others carry romanized forms
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # <small> qualifiers apply only to the last pronunciation
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["實際讀音"],
            )
        )
    return sounds

252 

253 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→", and "/" — the slash only when the whole
    string does not start with "/" (so enclosed IPA like "/a/" stays in
    one piece).  Both ASCII and fullwidth parentheses suppress splitting.
    Note: intermediate parts are stripped, the final part is not.
    """
    slash_is_separator = not zh_pron.startswith("/")
    depth = 0  # parenthesis nesting level
    parts: list[str] = []
    buffer: list[str] = []
    for ch in zh_pron:
        is_separator = ch in (",", ";", "→") or (
            ch == "/" and slash_is_separator
        )
        if is_separator and depth == 0 and "".join(buffer).strip():
            parts.append("".join(buffer).strip())
            buffer = []
            continue
        if ch in ("(", "（"):
            depth += 1
        elif ch in (")", "）"):
            depth -= 1
        buffer.append(ch)

    tail = "".join(buffer)
    if tail.strip() != "":
        parts.append(tail)
    return parts

279 

280 

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophone words from the table after a "同音詞" label.

    Only spans with a ``lang`` attribute and a Chinese script class are
    kept; "Hant"/"Hans" classes are mapped to script tags.
    """
    script_tag_names = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    sounds: list[Sound] = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            if span_class not in ("Hant", "Hans", "Hani"):
                continue
            if span_tag.attrs.get("lang", "") == "":
                continue
            word = clean_node(wxr, None, span_tag)
            if word in ("", "/"):
                continue
            sound = Sound(homophone=word, raw_tags=raw_tags)
            if span_class in script_tag_names:
                sound.tags.append(script_tag_names[span_class])
            sounds.append(sound)
    return sounds

302 

303 

def process_homophones_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    """Collect homophones from a "homophones"/"hmp" template.

    https://zh.wiktionary.org/wiki/Template:homophones
    Words are in consecutive numbered parameters starting at 2.
    """
    sounds: list[Sound] = []
    word_index = 2
    while word_index in template_node.template_parameters:
        homophone = clean_node(
            wxr, None, template_node.template_parameters.get(word_index, "")
        )
        if len(homophone) > 0:
            sounds.append(Sound(homophone=homophone))
        word_index += 1
    return sounds

318 

319 

def process_audio_template(
    wxr: WiktextractContext, template_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Build one Sound from an "audio"/"音" template.

    https://zh.wiktionary.org/wiki/Template:Audio
    Parameter 2 is the file name, parameter 3 an optional caption that
    becomes a raw tag; ``raw_tags`` from the surrounding item are added.
    """
    filename = clean_node(
        wxr, None, template_node.template_parameters.get(2, "")
    )
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    caption = clean_node(
        wxr, None, template_node.template_parameters.get(3, "")
    )
    if caption != "":
        sound.raw_tags.append(caption)
    sound.raw_tags.extend(raw_tags)
    return [sound]

336 

337 

def process_ipa_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Build Sounds from an "IPA" template.

    https://zh.wiktionary.org/wiki/Template:IPA
    IPA strings are in consecutive numbered parameters starting at 2.
    """
    sounds: list[Sound] = []
    index = 2
    while index in template_node.template_parameters:
        ipa_str = clean_node(
            wxr, None, template_node.template_parameters.get(index)
        )
        sounds.append(Sound(ipa=ipa_str, raw_tags=raw_tags))
        index += 1
    return sounds

356 

357 

def process_enpr_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Build Sounds from an "enPR" template.

    https://zh.wiktionary.org/wiki/Template:enPR
    At most three consecutive numbered parameters (1-3) are read.
    """
    sounds: list[Sound] = []
    for index in (1, 2, 3):
        if index not in template_node.template_parameters:
            break
        enpr_str = clean_node(
            wxr, None, template_node.template_parameters.get(index)
        )
        sounds.append(Sound(enpr=enpr_str, raw_tags=raw_tags))
    return sounds

376 

377 

def extract_ja_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract Japanese pronunciations from an expanded "ja-pron" template.

    Each <li> becomes one Sound; spans are classified by CSS class
    (accent label, IPA, romanization) or ``lang="ja"`` (kana form).
    Returns a (sounds, categories) tuple.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    categories: dict[str, list[str]] = {}
    sounds: list[Sound] = []
    for li_tag in expanded.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            css_class = span_tag.attrs.get("class", "")
            if "usage-label-accent" in css_class:
                accent = clean_node(wxr, None, span_tag).strip("() ")
                if accent != "":
                    sound.raw_tags.append(accent)
            elif "IPA" in css_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in css_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        if sound.ipa == "" and sound.other == "":
            continue  # list item carried no pronunciation
        translate_raw_tags(sound)
        sounds.append(sound)

    clean_node(wxr, categories, expanded)
    return sounds, categories.get("categories", [])

406 

407 

def extract_th_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract sounds and categories from the expanded "th-pron" template.

    The template expands to a table.  Row headers (<th>) either switch the
    current ``field`` ("ipa", "homophone", "audio") or carry raw tags that
    apply, via ``rowspan``, to the data cells of the rows they span.
    """

    @dataclass
    class TableHeader:
        raw_tags: list[str]
        rowspan: int  # remaining number of rows this header still covers

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # drop headers whose rowspan is exhausted; decrement the rest
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(標準泰語) IPA"):
                    # "(standard Thai) IPA" column
                    field = "ipa"
                elif header_str.startswith("同音詞"):
                    # "homophones" column
                    field = "homophone"
                elif header_str == "音頻":
                    # "audio" column
                    field = "audio"
                elif header_str != "":
                    # any other header text is a set of raw tags
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    # audio cells contain file links
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                # "[tag, tag]" qualifiers for later spans
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                # other <small> text is the romanization of
                                # the sound added just before it
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # a header tag may mark this row as
                                # romanization; store in the right field
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                sounds.append(sound)

    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])