Coverage for src / wiktextract / extractor / zh / pronunciation.py: 70%

510 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-19 11:25 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from ..share import set_sound_file_url_fields 

14from .models import Hyphenation, Sound, WordEntry 

15from .tags import translate_raw_tags 

16 

17 

def extract_pronunciation_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: WikiNode
) -> None:
    """Collect pronunciation data from a "發音" level node into *base_data*.

    Top-level templates are dispatched directly ("zh-forms" gets special
    handling); every list item under the section is then processed for
    pronunciation templates.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name == "zh-forms":
            # imported lazily to avoid a circular import with .page
            from .page import process_zh_forms

            process_zh_forms(wxr, base_data, template)
        else:
            sounds, cats = process_pron_template(wxr, base_data, template)
            base_data.sounds.extend(sounds)
            base_data.categories.extend(cats)
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        sounds, cats = process_pron_item_list_item(wxr, base_data, item)
        base_data.sounds.extend(sounds)
        base_data.categories.extend(cats)

36 

37 

def process_pron_item_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item_node: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Process one pronunciation list item; return (sounds, categories).

    Hyphenation templates write directly into *base_data*; all other
    templates go through process_pron_template().  Accent templates seen
    earlier in the item append to the shared raw-tag list, which is passed
    along so the tags apply to the templates that follow.
    """
    shared_raw_tags: list[str] = []
    sounds: list[Sound] = []
    categories: list[str] = []
    for template in list_item_node.find_child(NodeKind.TEMPLATE):
        if template.template_name.lower() in ["hyph", "hyphenation"]:
            extract_hyphenation_template(wxr, base_data, template)
            continue
        t_sounds, t_cats = process_pron_template(
            wxr, base_data, template, shared_raw_tags
        )
        sounds.extend(t_sounds)
        categories.extend(t_cats)
    return sounds, categories

54 

55 

def process_pron_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
    raw_tags: list[str] | None = None,
) -> tuple[list[Sound], list[str]]:
    """Dispatch a pronunciation template to its extractor.

    Returns (sounds, categories) extracted from *template_node*.  "a"/
    "accent" templates produce no sounds themselves; instead they append
    to *raw_tags* so the caller can apply the tags to later templates in
    the same list item.
    """
    # Fix: the original used a mutable default (`raw_tags: list[str] = []`).
    # The "a"/"accent" branch appends to it, so accent tags leaked into
    # every later call that relied on the default.
    if raw_tags is None:
        raw_tags = []
    template_name = template_node.template_name.lower()
    sounds = []
    categories = []
    new_sounds = []
    new_cats = []
    if template_name == "zh-pron":
        new_sounds, new_cats = process_zh_pron_template(wxr, template_node)
    elif template_name in ["rhymes", "rhyme"]:
        new_sounds, new_cats = extract_rhymes_template(wxr, template_node)
    elif template_name in ["homophones", "homophone", "hmp"]:
        new_sounds, new_cats = extract_homophones_template(wxr, template_node)
    elif template_name in ["a", "accent"]:
        # https://zh.wiktionary.org/wiki/Template:Accent
        raw_tags.append(clean_node(wxr, None, template_node).strip("()"))
    elif template_name in ["audio", "音"]:
        new_sounds, new_cats = process_audio_template(
            wxr, template_node, raw_tags
        )
    elif template_name == "ko-ipa":
        new_sounds, new_cats = extract_ko_ipa_template(
            wxr, template_node, raw_tags
        )
    elif template_name == "ipa" or template_name.endswith("-ipa"):
        new_sounds, new_cats = extract_ipa_template(
            wxr, template_node, raw_tags
        )
    elif template_name == "enpr":
        sounds.extend(process_enpr_template(wxr, template_node, raw_tags))
    elif template_name == "ja-pron":
        new_sounds, new_cats = extract_ja_pron_template(wxr, template_node)
    elif template_name == "th-pron":
        new_sounds, new_cats = extract_th_pron_template(wxr, template_node)
    elif template_name.endswith("-pr"):
        new_sounds, new_cats = extract_pl_pr_template(
            wxr, base_data, template_node
        )
    sounds.extend(new_sounds)
    categories.extend(new_cats)
    return sounds, categories

101 

102 

def process_zh_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand a "zh-pron" template and collect its pronunciation data.

    https://zh.wiktionary.org/wiki/Template:Zh-pron
    Nested lists already handled through recursion are tracked in a
    visited set so they are not processed a second time.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    visited: set[WikiNode] = set()
    sounds: list[Sound] = []
    cats: dict = {}
    for lst in expanded.find_child_recursively(NodeKind.LIST):
        if lst in visited:
            continue
        for item in lst.find_child(NodeKind.LIST_ITEM):
            sounds.extend(process_zh_pron_list_item(wxr, item, [], visited))
    clean_node(wxr, cats, expanded)  # gather category links
    for sound in sounds:
        translate_raw_tags(sound)
    return sounds, cats.get("categories", [])

123 

124 

def process_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively extract sounds from one expanded zh-pron list item.

    *raw_tags* holds the dialect/system tags inherited from ancestor list
    items; a copy is taken so siblings do not see each other's tags.
    Child lists are added to *seen_lists* so the caller's outer scan does
    not visit them again.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith("File:"):
                    # audio file link
                    filename = link_str.removeprefix("File:")
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # other links (e.g. dialect pages) become raw tags
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # The first <small> labels the line (e.g. romanization
                    # system) and applies to all sounds that follow;
                    # remove "幫助"(help) <sup> tag
                    if is_first_small_tag:
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # later <small> tags qualify the previous sound only
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    # span holds the pronunciation text itself
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "同音詞"
                ):
                    # table following a "同音詞" (homophones) label
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )

            elif node.kind == NodeKind.LIST:
                # recurse into nested list, inheriting current tags
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        process_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds

192 

193 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron label string into individual raw tags.

    Text without parentheses is split on comma/colon/semicolon variants
    and "和" ("and", unless final), dropping a leading "包括" ("including").
    Text containing parentheses is split into matched spans; each span is
    recursed on with its first and last characters stripped (this removes
    the parentheses of a wrapped span — NOTE(review): for an unwrapped
    span it strips content characters instead; confirm against typical
    template output).  Any characters not covered by a match are collected
    and processed first, so their tags precede the parenthesized ones.
    """
    raw_tags = []
    # second check is for the fullwidth parenthesis "("
    if "(" not in raw_tag_text and "(" not in raw_tag_text:
        for raw_tag in re.split(r",|,|:|、|;|;|和(?!$)", raw_tag_text):
            raw_tag = raw_tag.strip().removeprefix("包括").strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
    else:
        processed_offsets = []
        for match in re.finditer(r"\([^()]+\)|([^()]+)", raw_tag_text):
            processed_offsets.append((match.start(), match.end()))
            # recurse on the span with its outermost characters removed
            raw_tags.extend(
                split_zh_pron_raw_tag(
                    raw_tag_text[match.start() + 1 : match.end() - 1]
                )
            )
        # stitch together the characters the regex did not consume
        not_processed = ""
        last_end = 0
        for start, end in processed_offsets:
            not_processed += raw_tag_text[last_end:start]
            last_end = end
        not_processed += raw_tag_text[last_end:]
        if not_processed != raw_tag_text:
            # leftover text produced new tags; they come first
            raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags
        else:
            # nothing matched at all; keep the text as a single tag
            raw_tags.append(not_processed)
    return raw_tags

221 

222 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract Sound objects from one pronunciation <span> of zh-pron.

    Child <small> tags hold trailing raw tags (attached to the last sound),
    "*-Latn" spans hold romanization, and a literal "[實際讀音:" string
    introduces an additional phonetic ("actual") pronunciation that spans
    the rest of the children.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[實際讀音:":
            # everything after this marker is the phonetic pronunciation
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # the span's own class decides IPA vs. plain romanized pron
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # <small> tags qualify only the last pronunciation in the span
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["實際讀音"],
            )
        )
    return sounds

269 

270 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Splits on ",", ";", "→", and "/" (the latter only when the whole
    string does not start with "/", i.e. is not a single /IPA/ form).
    Separators nested inside ASCII or fullwidth parentheses are kept.
    Each returned segment is stripped of surrounding whitespace.
    """
    depth = 0  # current parenthesis nesting level
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/")))
            and depth == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        elif c in ["(", "("]:
            depth += 1
            pron += c
        elif c in [")", ")"]:
            depth -= 1
            pron += c
        else:
            pron += c

    if pron.strip() != "":
        # Fix: strip the final segment too; the original appended it
        # unstripped, unlike the segments appended inside the loop.
        pron_list.append(pron.strip())
    return pron_list

296 

297 

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract homophone words from the table under a "同音詞" label.

    Only spans with a lang attribute and a Chinese script class are kept;
    "Hant"/"Hans" classes additionally produce script tags.
    """
    script_tags = {"Hant": "Traditional-Chinese", "Hans": "Simplified-Chinese"}
    homophones: list[Sound] = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            word = clean_node(wxr, None, span_tag)
            lang_attr = span_tag.attrs.get("lang", "")
            class_attr = span_tag.attrs.get("class", "")
            if (
                word in ["", "/"]
                or lang_attr == ""
                or class_attr not in ["Hant", "Hans", "Hani"]
            ):
                continue
            sound = Sound(homophone=word, raw_tags=raw_tags)
            if class_attr in script_tags:
                sound.tags.append(script_tags[class_attr])
            homophones.append(sound)
    return homophones

319 

320 

def extract_homophones_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract homophones from a "homophones"/"hmp" template.

    https://zh.wiktionary.org/wiki/Template:homophones
    Inside the expanded "homophones" span, child spans in the entry's
    language (first template argument) add homophones; "Latn" spans attach
    romanization to the previous one; qualifier spans add raw tags to it.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    results: list[Sound] = []
    categories: dict = {}
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for container in expanded.find_html(
        "span", attr_name="class", attr_value="homophones"
    ):
        for child in container.find_html("span"):
            lang = child.attrs.get("lang", "")
            classes = child.attrs.get("class", "").split()
            if "Latn" in classes and len(results) > 0:
                results[-1].roman = clean_node(wxr, None, child)
            elif lang == lang_code:
                word = clean_node(wxr, None, child)
                if word != "":
                    results.append(Sound(homophone=word))
            elif "qualifier-content" in classes and len(results) > 0:
                tag = clean_node(wxr, None, child)
                if tag != "":
                    results[-1].raw_tags.append(tag)
                    translate_raw_tags(results[-1])
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, categories, link)
    return results, categories.get("categories", [])

351 

352 

def process_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> tuple[list[Sound], list[str]]:
    """Extract an audio file from an "audio"/"音" template.

    https://zh.wiktionary.org/wiki/Template:Audio
    Argument 2 is the file name, argument 3 an optional raw tag; further
    tags come from "ib-content" spans in the expanded template.
    """
    categories: dict = {}
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    sound = Sound(raw_tags=raw_tags)
    set_sound_file_url_fields(wxr, filename, sound)
    label = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
    if len(label) > 0:
        sound.raw_tags.append(label)
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for span_node in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        span_text = clean_node(wxr, None, span_node)
        # split on both ASCII and fullwidth commas
        for piece in re.split(r",|,", span_text):
            piece = piece.strip()
            if piece != "":
                sound.raw_tags.append(piece)
    translate_raw_tags(sound)
    clean_node(wxr, categories, expanded)
    return [sound], categories.get("categories", [])

377 

378 

def extract_ipa_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
) -> tuple[list[Sound], list[str]]:
    """Extract IPA data from "IPA" or "*-IPA" templates.

    https://zh.wiktionary.org/wiki/Template:IPA
    List items in the expansion are processed one by one; any top-level
    nodes outside lists are gathered into a temporary root node and
    handled the same way.
    """
    categories: dict = {}
    sounds: list[Sound] = []
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    clean_node(wxr, categories, expanded)
    leftovers = []
    for child in expanded.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                sounds.extend(extract_ipa_list_item(wxr, item, raw_tags))
        else:
            leftovers.append(child)
    if len(leftovers) > 0:
        wrapper = WikiNode(NodeKind.ROOT, 0)
        wrapper.children = leftovers
        sounds.extend(extract_ipa_list_item(wxr, wrapper, raw_tags))
    return sounds, categories.get("categories", [])

403 

404 

def extract_ipa_list_item(
    wxr: WiktextractContext, list_item: WikiNode, shared_raw_tags: list[str]
) -> list[Sound]:
    """Extract IPA/romanization sounds from one expanded list item.

    Qualifier spans seen before the first colon apply to every sound in
    the item (*shared_raw_tags*); qualifiers after a colon apply only to
    the next sound and are cleared once consumed.
    """
    sounds = []
    # copy so the caller's list is never mutated
    shared_raw_tags = shared_raw_tags[:]
    raw_tags = []
    after_colon = False
    for node in list_item.children:
        if isinstance(node, str) and (":" in node or ":" in node):
            after_colon = True
        elif isinstance(node, HTMLNode) and node.tag == "span":
            span_class = node.attrs.get("class", "").split()
            if (
                "qualifier-content" in span_class
                or "ib-content" in span_class
                or "usage-label-accent" in span_class
            ):
                for raw_tag in (
                    clean_node(wxr, None, node).strip("() ").split(",")
                ):
                    raw_tag = raw_tag.strip()
                    if raw_tag != "":
                        if after_colon:
                            # applies to the next sound only
                            raw_tags.append(raw_tag)
                        else:
                            # applies to all sounds in this item
                            shared_raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound = Sound(
                    ipa=clean_node(wxr, None, node),
                    raw_tags=shared_raw_tags + raw_tags,
                )
                if sound.ipa != "":
                    translate_raw_tags(sound)
                    sounds.append(sound)
                    # per-sound qualifiers are consumed
                    raw_tags.clear()
            elif "Latn" in span_class:
                sound = Sound(
                    roman=clean_node(wxr, None, node),
                    raw_tags=shared_raw_tags + raw_tags,
                )
                if sound.roman != "":
                    translate_raw_tags(sound)
                    sounds.append(sound)
                    raw_tags.clear()
    return sounds

450 

451 

def process_enpr_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Extract enPR pronunciations from an "enPR" template.

    https://zh.wiktionary.org/wiki/Template:enPR
    Positional arguments 1-3 are read in order, stopping at the first
    missing one.
    """
    sounds: list[Sound] = []
    params = template_node.template_parameters
    index = 1
    while index <= 3 and index in params:
        sound = Sound(
            enpr=clean_node(wxr, None, params.get(index)),
            raw_tags=raw_tags,
        )
        translate_raw_tags(sound)
        sounds.append(sound)
        index += 1
    return sounds

471 

472 

def extract_ja_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract Japanese pronunciation data from a "ja-pron" template.

    Each <li> in the expansion becomes one Sound with optional accent raw
    tags, IPA, romanization, and kana; pitch-accent pattern links are
    mapped to English tags.  The "a"/"audio" parameter adds a sound file.
    """
    JA_PRON_ACCENTS = {
        "中高型": "Nakadaka",
        "平板型": "Heiban",
        "頭高型": "Atamadaka",
        "尾高型": "Odaka",
    }
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    sounds = []
    for li_tag in expanded_node.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "").split()
            if "usage-label-accent" in span_class:
                raw_tag = clean_node(wxr, None, span_tag).strip("() ")
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        for link_node in li_tag.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text in JA_PRON_ACCENTS:
                sound.tags.append(JA_PRON_ACCENTS[link_text])
        # keep the entry only if it carries actual pronunciation content
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            sounds.append(sound)

    # Fix: use clean_node() like extract_ko_ipa_template() does for the
    # same parameters — the parameter value may be a WikiNode rather than
    # a plain str, in which case calling str.strip() on it directly would
    # raise AttributeError.
    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    )
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        sounds.append(sound)
    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])

518 

519 

def extract_th_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract Thai pronunciation data from a "th-pron" template table.

    Header cells decide what the data cells of the row contain (IPA,
    homophones, audio, or other transcription systems); other headers
    carry raw tags that may span multiple rows via rowspan.
    """
    # small record for a row header whose raw tags persist over `rowspan` rows
    @dataclass
    class TableHeader:
        raw_tags: list[str]
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # keep headers from earlier rows whose rowspan still covers this row
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(標準泰語) IPA"):
                    field = "ipa"
                elif header_str.startswith("同音詞"):
                    field = "homophone"
                elif header_str == "音頻":
                    field = "audio"
                elif header_str != "":
                    # any other header becomes raw tags for this row span
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    # transcription cell: <small> nodes carry bracketed raw
                    # tags or romanization; <span> nodes carry the values
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # NOTE(review): this rebinds `field` for the
                                # rest of the row once a romanization tag is
                                # seen — presumably intentional; confirm.
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                sounds.append(sound)

    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])

613 

614 

def extract_rhymes_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand a "rhymes"/"rhyme" template and collect its rhyme links."""
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    return extract_rhymes_list_item(wxr, expanded)

622 

623 

def extract_rhymes_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Collect rhyme values from the link nodes of a list item."""
    cats: dict = {}
    rhymes = [
        clean_node(wxr, cats, link)
        for link in list_item.find_child(NodeKind.LINK)
    ]
    sounds = [Sound(rhymes=rhyme) for rhyme in rhymes if rhyme != ""]
    return sounds, cats.get("categories", [])

634 

635 

def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand a "hyph"/"hyphenation" template into *base_data*."""
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    # first positional argument is the language code of the spans to read
    code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    extract_hyphenation_list_item(wxr, base_data, expanded, code)

644 

645 

def extract_hyphenation_list_item(
    wxr: WiktextractContext,
    base_data: WordEntry,
    list_item: WikiNode,
    lang_code: str,
):
    """Read "‧"-separated hyphenation parts from spans in *lang_code*."""
    for span_tag in list_item.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        text = clean_node(wxr, None, span_tag)
        parts = [part.strip() for part in text.split("‧") if part.strip()]
        if len(parts) > 0:
            base_data.hyphenations.append(Hyphenation(parts=parts))

661 

662 

def extract_pl_pr_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract pronunciation data from a "*-pr" (e.g. "pl-pr") template.

    Each list item either contains an audio table, an IPA span, or a
    labelled line ("韻部" rhymes / "音節化" syllabification) identified by
    the text before the first colon.
    """
    sounds = []
    cats = {}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        skip_list = False
        for html_node in list_item.find_child(NodeKind.HTML):
            if html_node.tag == "table":
                # audio file table
                sounds.extend(extract_pl_pr_sound_table(wxr, html_node))
                skip_list = True
                break
            elif (
                html_node.tag == "span"
                and "IPA" in html_node.attrs.get("class", "").split()
            ):
                sounds.extend(extract_ipa_list_item(wxr, list_item, []))
                skip_list = True
                break
        if skip_list:
            continue
        for index, node in enumerate(list_item.children):
            # find the first ASCII or fullwidth colon in a text child
            if isinstance(node, str) and (":" in node or ":" in node):
                m = re.search(r":|:", node)
                # label = everything in the item before that colon
                list_type = clean_node(
                    wxr, None, list_item.children[:index] + [node[: m.start()]]
                )
                if list_type == "韻部":
                    new_sounds, _ = extract_rhymes_list_item(wxr, list_item)
                    sounds.extend(new_sounds)
                    break
                elif list_type == "音節化":
                    extract_hyphenation_list_item(
                        wxr, base_data, list_item, "pl"
                    )
                    break

    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])

705 

706 

def extract_pl_pr_sound_table(
    wxr: WiktextractContext, table_node: HTMLNode
) -> list[Sound]:
    """Extract audio files from a "*-pr" template's sound table.

    In each row, a class-less cell may carry an <i> label that becomes a
    raw tag on the audio files found in the row's "audiofile" cells.
    """
    sounds: list[Sound] = []
    for row in table_node.find_html("tr"):
        label = ""
        for cell in row.find_html("td"):
            cell_classes = cell.attrs.get("class", "").split()
            if cell_classes == []:
                for italic in cell.find_html("i"):
                    label = clean_node(wxr, None, italic)
                continue
            if "audiofile" not in cell_classes:
                continue
            for link in cell.find_child(NodeKind.LINK):
                if len(link.largs) == 0 or len(link.largs[0]) == 0:
                    continue
                file_name = clean_node(
                    wxr, None, link.largs[0][0]
                ).removeprefix("File:")
                if file_name == "":
                    continue
                sound = Sound()
                set_sound_file_url_fields(wxr, file_name, sound)
                if label != "":
                    sound.raw_tags.append(label)
                    translate_raw_tags(sound)
                sounds.append(sound)
    return sounds

732 

733 

def extract_ko_ipa_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
) -> tuple[list[Sound], list[str]]:
    """Extract Korean pronunciation data from a "ko-IPA" template.

    NOTE(review): the *raw_tags* parameter is shadowed by the local
    assignment below and never actually used — tags passed by the caller
    are dropped; confirm whether that is intended.
    """
    cats = {}
    sounds = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, cats, expanded_node)
    for ul_node in expanded_node.find_html("ul"):
        for li_node in ul_node.find_html("li"):
            if "ko-pron__ph" in li_node.attrs.get("class", ""):
                # phonetic hangeul line; values separated by "/"
                for span_node in li_node.find_html(
                    "span", attr_name="lang", attr_value="ko"
                ):
                    hangeul_str = clean_node(wxr, None, span_node).strip("[]")
                    for hangeul in hangeul_str.split("/"):
                        if hangeul != "":
                            sounds.append(
                                Sound(hangeul=hangeul, tags=["phonetic"])
                            )
            else:
                # IPA line: links before the value name the dialect/system
                raw_tags = []
                for link_node in li_node.find_child(NodeKind.LINK):
                    raw_tag = clean_node(wxr, None, link_node)
                    if raw_tag not in ["", "IPA"]:
                        raw_tags.append(raw_tag)
                for span_node in li_node.find_html(
                    "span", attr_name="class", attr_value="IPA"
                ):
                    ipas = clean_node(wxr, None, span_node)
                    # "~" separates free-variation alternatives
                    for ipa in ipas.split("~"):
                        ipa = ipa.strip()
                        if ipa != "":
                            sound = Sound(ipa=ipa, raw_tags=raw_tags)
                            translate_raw_tags(sound)
                            sounds.append(sound)

    # romanization table: row header names the system, cells hold values
    for table in expanded_node.find_html("table"):
        for tr in table.find_html("tr"):
            raw_tag = ""
            for th in tr.find_html("th"):
                raw_tag = clean_node(wxr, None, th)
            for td in tr.find_html("td"):
                roman = clean_node(wxr, None, td)
                if roman != "":
                    sound = Sound(roman=roman)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                        translate_raw_tags(sound)
                    sounds.append(sound)

    # optional audio file from the "a"/"audio" parameter
    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    )
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        sounds.append(sound)

    return sounds, cats.get("categories", [])