Coverage for src / wiktextract / extractor / vi / sound.py: 27%

467 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2025-12-29 01:50 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..share import set_sound_file_url_fields 

15from .models import Hyphenation, Sound, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Walk a pronunciation section and extract sounds from its children."""
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            # Template placed directly under the section heading.
            extract_sound_template(wxr, base_data, child)
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Templates are more commonly nested inside list items.
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, item)

28 

29 

def extract_sound_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Dispatch one pronunciation template to its specialized extractor.

    The branch order matters: language-specific templates whose names end
    in "-IPA" (e.g. "ja-IPA", "jpn-IPA") must be matched before the
    generic ``endswith("-IPA")`` fallback below.
    """
    if t_node.template_name == "vie-pron":
        extract_vie_pron_template(wxr, base_data, t_node)
    elif t_node.template_name in [
        "âm thanh-IPA",
        "pron-audio",
        "audio-for-pron",
    ]:
        extract_pron_audio_template(wxr, base_data, t_node)
    elif t_node.template_name in ["zh-pron", "zho-pron"]:
        extract_zh_pron_template(wxr, base_data, t_node)
    elif t_node.template_name in ["th-pron", "tha-pron"]:
        extract_th_pron_template(wxr, base_data, t_node)
    elif t_node.template_name in ["ja-pron", "ja-IPA", "jpn-IPA", "jpn-pron"]:
        extract_ja_pron_template(wxr, base_data, t_node)
    # "Âm thanh"/"Audio" put the file name in the first positional
    # argument; "âm thanh-2"/"audio" put a language code first and the
    # file name second — hence the different start_arg values.
    elif t_node.template_name in ["âm thanh", "Audio", "Âm thanh"]:
        extract_audio_template(wxr, base_data, t_node, 1)
    elif t_node.template_name in ["âm thanh-2", "audio"]:
        extract_audio_template(wxr, base_data, t_node, 2)
    elif t_node.template_name.lower() in ["ko-ipa", "kor-ipa"]:
        extract_ko_ipa_template(wxr, base_data, t_node)
    elif t_node.template_name in [
        "IPA",
        "IPA2",
        "IPA3",
        "IPA4",
    ] or t_node.template_name.endswith("-IPA"):
        # Generic IPA templates; spans are marked with the "IPA" CSS class.
        extract_ipa_template(wxr, base_data, t_node, "IPA")
    elif t_node.template_name in ["enPR", "AHD"]:
        # Same extractor, but pronunciation spans use the "enPR" class.
        extract_ipa_template(wxr, base_data, t_node, "enPR")
    elif t_node.template_name in ["rhymes", "rhyme"]:
        extract_rhymes_template(wxr, base_data, t_node)
    elif t_node.template_name in ["hyphenation", "hyph"]:
        extract_hyphenation_template(wxr, base_data, t_node)
    elif t_node.template_name in ["homophones", "homophone", "hmp"]:
        extract_homophones_template(wxr, base_data, t_node)

68 

69 

def extract_sound_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    """Extract sound templates from a list item, recursing into sublists."""
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            extract_sound_template(wxr, base_data, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Nested list: handle each of its items the same way.
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, nested_item)

79 

80 

@dataclass
class TableHeader:
    """A column header cell of an expanded pronunciation table."""

    text: str  # cleaned header text, later applied as a raw tag
    index: int  # zero-based column index where this header starts
    span: int  # "colspan" value: number of columns the header covers

86 

87 

def extract_vie_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Vietnamese IPA pronunciations from the "vie-pron" template.

    The expanded template renders a table whose header cells name
    dialects/regions; each data cell holds IPA spans.  Header texts are
    attached as raw tags to the sounds whose column range overlaps them.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    # A header in the first column starts a new header
                    # row; drop the headers collected from the previous
                    # header row.
                    if col_index == 0:
                        col_headers.clear()
                    colspan = int(cell.attrs.get("colspan", "1"))
                    col_headers.append(
                        TableHeader(
                            clean_node(wxr, None, cell), col_index, colspan
                        )
                    )
                    col_index += colspan
                else:
                    colspan = int(cell.attrs.get("colspan", "1"))
                    # IPA spans directly inside the cell.
                    for span_tag in cell.find_html(
                        "span", attr_name="class", attr_value="IPA"
                    ):
                        extract_vie_pron_span_tag(
                            wxr,
                            base_data,
                            span_tag,
                            col_index,
                            colspan,
                            col_headers,
                        )
                    col_index += colspan
                    # NOTE(review): cells can apparently contain raw <td>
                    # tags (a nested table rendered as HTML) — those are
                    # walked separately with their own colspan counting.
                    for td_tag in cell.find_html("td"):
                        colspan = int(td_tag.attrs.get("colspan", "1"))
                        for span_tag in td_tag.find_html(
                            "span", attr_name="class", attr_value="IPA"
                        ):
                            extract_vie_pron_span_tag(
                                wxr,
                                base_data,
                                span_tag,
                                col_index,
                                colspan,
                                col_headers,
                            )
                        col_index += colspan

    # Top-level links may add categories to the entry.
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)

142 

143 

def extract_vie_pron_span_tag(
    wxr: WiktextractContext,
    base_data: WordEntry,
    span_tag: HTMLNode,
    index: int,  # fixed: was annotated `str`, but callers pass the int column index
    colspan: int,
    col_headers: list[TableHeader],
):
    """Create a Sound from one IPA span cell of the "vie-pron" table.

    ``index`` and ``colspan`` give the cell's column range; every header
    whose own column range overlaps it contributes a raw tag.
    """
    ipa = clean_node(wxr, None, span_tag)
    if ipa != "":
        sound = Sound(ipa=ipa)
        for header in col_headers:
            # Header applies when [header.index, header.index + span)
            # intersects [index, index + colspan).
            if (
                index < header.index + header.span
                and index + colspan > header.index
            ):
                sound.raw_tags.append(header.text)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)

163 

164 

def extract_pron_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an audio file, IPA and place label from the
    "âm thanh-IPA"/"pron-audio"/"audio-for-pron" templates."""
    filename = clean_node(
        wxr, None, t_node.template_parameters.get("file", "")
    )
    if filename == "":
        return  # nothing useful without an audio file
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    place_name = clean_node(
        wxr, None, t_node.template_parameters.get("place", "")
    )
    if place_name != "":
        sound.raw_tags.append(place_name)
    sound.ipa = clean_node(
        wxr, None, t_node.template_parameters.get("pron", "")
    )
    translate_raw_tags(sound)
    base_data.sounds.append(sound)

181 

182 

def extract_audio_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    start_arg: int,
):
    """Extract an audio file from the "âm thanh" template family.

    https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh
    https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh-2

    ``start_arg`` is the positional argument that carries the file name;
    the next positional argument may carry a label for the recording.
    """
    filename = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg, "")
    )
    if filename == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    label = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg + 1, "")
    )
    if label != "":
        sound.raw_tags.append(label)
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # "ib-content" spans carry extra comma-separated qualifiers.
    for span_node in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        span_text = clean_node(wxr, None, span_node)
        sound.raw_tags.extend(t for t in span_text.split(",") if t != "")
    translate_raw_tags(sound)
    clean_node(wxr, base_data, expanded)  # collect categories
    base_data.sounds.append(sound)

213 

214 

def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    ipa_class: str,
):
    """Extract pronunciations from a generic IPA/enPR template.

    https://vi.wiktionary.org/wiki/Bản_mẫu:IPA
    ``ipa_class`` is the CSS class ("IPA" or "enPR") that marks
    pronunciation spans in the expanded wikitext.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    leftovers = []
    for child in expanded_node.children:
        is_list = isinstance(child, WikiNode) and child.kind == NodeKind.LIST
        if not is_list:
            leftovers.append(child)
            continue
        for list_item in child.find_child(NodeKind.LIST_ITEM):
            extract_ipa_list_item(wxr, base_data, list_item, ipa_class)
    if leftovers:
        # Wrap non-list children in a synthetic root node so they run
        # through the same list-item code path.
        wrapper = WikiNode(NodeKind.ROOT, 0)
        wrapper.children = leftovers
        extract_ipa_list_item(wxr, base_data, wrapper, ipa_class)
    clean_node(wxr, base_data, expanded_node)

237 

238 

def extract_ipa_list_item(
    wxr: WiktextractContext,
    base_data: WordEntry,
    list_item: WikiNode,
    class_name: str,
):
    """Extract Sounds from one list item of an expanded IPA template.

    Italic text and qualifier/label spans accumulate raw tags; every
    span carrying ``class_name`` ("IPA" or "enPR") becomes a Sound with
    the tags collected so far.
    """
    raw_tags = []
    # Italic text before the pronunciations acts as a qualifier.
    for italic_node in list_item.find_child(NodeKind.ITALIC):
        raw_tag = clean_node(wxr, None, italic_node)
        if raw_tag != "":
            raw_tags.append(raw_tag)
    for span_tag in list_item.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "").split()
        if "qualifier-content" in span_class or "label-content" in span_class:
            for raw_tag in clean_node(wxr, None, span_tag).split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    raw_tags.append(raw_tag)
        elif class_name in span_class:
            # NOTE(review): the same `raw_tags` list object is passed to
            # every Sound created here — presumably the Sound model
            # copies it on construction; verify against models.py.
            sound = Sound(
                ipa=clean_node(wxr, None, span_tag), raw_tags=raw_tags
            )
            if sound.ipa != "":
                translate_raw_tags(sound)
                base_data.sounds.append(sound)

264 

265 

def extract_rhymes_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract rhymes from the "rhymes"/"rhyme" template.

    https://vi.wiktionary.org/wiki/Bản_mẫu:rhymes
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for ipa_span in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="IPA"
    ):
        rhyme_text = clean_node(wxr, None, ipa_span)
        if len(rhyme_text) > 0:
            base_data.sounds.append(Sound(rhymes=rhyme_text))

    # Top-level links may add categories to the entry.
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)

282 

283 

def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract syllable breaks from the "hyphenation"/"hyph" template.

    https://vi.wiktionary.org/wiki/Bản_mẫu:hyphenation
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        span_text = clean_node(wxr, None, span_tag)
        # Syllables are joined with "‧" (hyphenation point character).
        parts = [p.strip() for p in span_text.split("‧") if p.strip() != ""]
        if parts:
            base_data.hyphenations.append(Hyphenation(parts=parts))

301 

302 

def extract_homophone_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Extract homophones from a dedicated homophone section.

    Each link inside a list item is treated as one homophone word.
    """
    # Fixed: the loop variable was named `list`, shadowing the builtin.
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for link_node in list_item.find_child(NodeKind.LINK):
                homophone = clean_node(wxr, None, link_node)
                if homophone != "":
                    base_data.sounds.append(Sound(homophone=homophone))

312 

313 

def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract pronunciations from the expanded "zh-pron" template.

    Nested lists are consumed recursively by
    ``extract_zh_pron_list_item``; the shared ``handled_lists`` set
    stops the recursive list search from processing them a second time.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    handled_lists: set[WikiNode] = set()
    new_sounds: list[Sound] = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node in handled_lists:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            new_sounds.extend(
                extract_zh_pron_list_item(wxr, list_item, [], handled_lists)
            )
    for sound in new_sounds:
        translate_raw_tags(sound)
    base_data.sounds.extend(new_sounds)
    clean_node(wxr, base_data, expanded_node)

332 

333 

def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Extract Sounds from one zh-pron list item, recursing into sublists.

    ``raw_tags`` are tags inherited from parent list items; each level
    works on its own copy so siblings do not see each other's tags.
    Lists handled here are added to ``seen_lists`` so the caller's
    recursive list search skips them.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith(("File:", "Tập tin:")):
                    # Audio file link.
                    filename = link_str.removeprefix("File:").removeprefix(
                        "Tập tin:"
                    )
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # Ordinary link text acts as a tag, e.g. a dialect name.
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # The first <small> labels the whole item; later ones
                    # qualify the sound just created.
                    if is_first_small_tag:
                        # remove "ghi chú" (help) <sup> tag
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    # Pronunciation data lives in <span> elements.
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "Đồng âm"
                ):
                    # Table following a "Đồng âm" (homophones) label.
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                # Sublist: mark as seen so the top-level recursive
                # search does not process it again.
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        extract_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds

402 

403 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron label string into individual raw tags.

    Text without parentheses is split on ",", ":", ";" and " và "
    ("and").  Parenthesized groups are split recursively, with tags
    from the text outside the parentheses listed first.
    """
    if "(" not in raw_tag_text:
        cleaned = []
        for piece in re.split(r",|:|;| và ", raw_tag_text):
            piece = piece.strip().removeprefix("bao gồm").strip()
            if piece != "":
                cleaned.append(piece)
        return cleaned

    # Recurse into each innermost "(...)" group.
    inner_tags = []
    for group in re.findall(r"\([^()]+\)", raw_tag_text):
        inner_tags.extend(split_zh_pron_raw_tag(group[1:-1]))
    # Everything left after removing the groups is the "outside" text.
    outside = re.sub(r"\([^()]+\)", "", raw_tag_text)
    if outside == raw_tag_text:
        # "(" present but no balanced group (e.g. unmatched paren):
        # keep the text as a single tag.
        return [outside]
    return split_zh_pron_raw_tag(outside) + inner_tags

431 

432 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract Sounds from one pronunciation <span> of zh-pron output.

    The span may contain a <small> qualifier, a "*-Latn" romanization
    span, a trailing "[Phonetic: ...]" section, and the pronunciation
    text itself.  Spans whose class contains "IPA" yield ``ipa`` values;
    all others yield ``zh_pron`` values.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            # Qualifier attached to the last pronunciation of this span.
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            # Romanization, e.g. lang="zh-Latn".
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            # Everything after this marker is the phonetic realization;
            # stop scanning the children.
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # The <small> qualifier applies to the last pronunciation only.
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds

479 

480 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→" and — unless the whole string starts
    with "/" (an IPA-style /.../ form) — "/".  Each returned segment is
    stripped of surrounding whitespace.
    """
    parentheses = 0  # current nesting depth
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/")))
            and parentheses == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        elif c == "(":
            parentheses += 1
            pron += c
        elif c == ")":
            parentheses -= 1
            pron += c
        else:
            pron += c

    if pron.strip() != "":
        # Fixed: strip the final segment too — previously only the
        # segments before a separator were stripped, so the last one
        # could keep surrounding whitespace.
        pron_list.append(pron.strip())
    return pron_list

506 

507 

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophones from the table zh-pron renders after a
    "Đồng âm" (homophones) label."""
    homophones = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            script_class = span_tag.attrs.get("class", "")
            word = clean_node(wxr, None, span_tag)
            if (
                word in ["", "/"]
                or span_tag.attrs.get("lang", "") == ""
                or script_class not in ["Hant", "Hans", "Hani"]
            ):
                continue
            sound = Sound(homophone=word, raw_tags=raw_tags)
            # Tag the script when the class distinguishes it.
            if script_class == "Hant":
                sound.tags.append("Traditional-Chinese")
            elif script_class == "Hans":
                sound.tags.append("Simplified-Chinese")
            homophones.append(sound)
    return homophones

529 

530 

def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Thai pronunciations from the "th-pron" template.

    https://vi.wiktionary.org/wiki/Bản mẫu:th-pron

    The expanded template is a table: row headers name the notation
    system (or mark the IPA/homophone/audio rows) and data cells hold
    the pronunciations.  Row headers with a rowspan stay active for the
    spanned rows.
    """

    @dataclass
    class TableHeader:
        # Shadows the module-level TableHeader on purpose: rows here are
        # tagged by raw-tag lists and tracked by remaining rowspan.
        raw_tags: list[str]
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Decrement rowspans; keep only headers still covering rows.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                # Special header texts select which Sound field the
                # row's data cells fill.
                if header_str.startswith("(Tiêu chuẩn) IPA"):
                    field = "ipa"
                elif header_str.startswith("Từ đồng âm"):
                    field = "homophone"
                elif header_str == "Âm thanh":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    # Header text may span several lines of
                    # semicolon-separated tags.
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    # Audio rows contain file links.
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                # "[tag, tag]" qualifies following spans.
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                # Otherwise <small> text is the
                                # romanization of the previous sound.
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # NOTE(review): when tag translation
                                # yields "romanization", `field` switches
                                # to "roman" and stays that way for the
                                # rest of this cell — presumably
                                # intended; confirm.
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                sounds.append(sound)

    base_data.sounds.extend(sounds)
    clean_node(wxr, base_data, expanded_node)

624 

625 

def extract_homophones_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract homophones from the "homophones"/"homophone"/"hmp"
    template; the first positional argument is the language code."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    homophones: list[Sound] = []
    for top_span in expanded_node.find_html(
        "span", attr_name="class", attr_value="homophones"
    ):
        for span_tag in top_span.find_html("span"):
            classes = span_tag.attrs.get("class", "").split()
            # Romanization and qualifier spans attach to the homophone
            # that precedes them.
            if "tr" in classes and len(homophones) > 0:
                homophones[-1].roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == lang_code:
                word = clean_node(wxr, None, span_tag)
                if word != "":
                    homophones.append(Sound(homophone=word))
            elif "qualifier-content" in classes and len(homophones) > 0:
                qualifier = clean_node(wxr, None, span_tag)
                if qualifier != "":
                    homophones[-1].raw_tags.append(qualifier)
                    translate_raw_tags(homophones[-1])

    base_data.sounds.extend(homophones)
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)

655 

656 

def extract_ja_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Japanese pronunciations from "ja-pron"-family templates.

    Each rendered <li> carries spans for accent labels, IPA,
    romanization and kana; pitch-accent type names appear as links.
    """
    JA_PRON_ACCENTS = {"Nakadaka", "Heiban", "Atamadaka", "Odaka"}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded_node.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "").split()
            if "usage-label-accent" in span_class:
                raw_tag = clean_node(wxr, None, span_tag).strip("() ")
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        for link_node in li_tag.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text in JA_PRON_ACCENTS:
                sound.tags.append(link_text)
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            base_data.sounds.append(sound)
    # Fixed: the "a"/"audio" argument may be a wikitext node rather than
    # a plain str, so calling `.strip()` on it directly could fail;
    # clean it first, as `extract_ko_ipa_template` does.
    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    )
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        base_data.sounds.append(sound)

    clean_node(wxr, base_data, expanded_node)

694 

695 

def extract_ko_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract Korean pronunciations from "ko-IPA"-family templates.

    The expanded output has a <ul> with IPA and phonetic-hangeul items,
    plus a table of romanizations; an "a"/"audio" argument may name an
    audio file.
    """
    sounds = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, word_entry, expanded_node)  # collect categories
    for ul_node in expanded_node.find_html("ul"):
        for li_node in ul_node.find_html("li"):
            if "ko-pron__ph" in li_node.attrs.get("class", ""):
                # Phonetic hangeul item, rendered as "[한글/한글]".
                for span_node in li_node.find_html(
                    "span", attr_name="lang", attr_value="ko"
                ):
                    hangeul_str = clean_node(wxr, None, span_node).strip("[]")
                    for hangeul in hangeul_str.split("/"):
                        if hangeul != "":
                            sounds.append(
                                Sound(hangeul=hangeul, tags=["phonetic"])
                            )
            else:
                # IPA item: italics carry qualifiers, IPA spans may hold
                # several variants joined by "~".
                raw_tags = []
                for i_node in li_node.find_html("i"):
                    for raw_tag in clean_node(wxr, None, i_node).split("/"):
                        if raw_tag not in ["", "IPA"]:
                            raw_tags.append(raw_tag)
                for span_node in li_node.find_html(
                    "span", attr_name="class", attr_value="IPA"
                ):
                    ipas = clean_node(wxr, None, span_node)
                    for ipa in ipas.split("~"):
                        ipa = ipa.strip()
                        if ipa != "":
                            sound = Sound(ipa=ipa, raw_tags=raw_tags)
                            translate_raw_tags(sound)
                            sounds.append(sound)

    # Romanization table: each row pairs a system name (<th>) with the
    # romanized form (<td>).
    for table in expanded_node.find_html("table"):
        for tr in table.find_html("tr"):
            raw_tag = ""
            for th in tr.find_html("th"):
                raw_tag = clean_node(wxr, None, th)
            for td in tr.find_html("td"):
                roman = clean_node(wxr, None, td)
                if roman != "":
                    sound = Sound(roman=roman)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                    translate_raw_tags(sound)
                    sounds.append(sound)

    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    )
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        sounds.append(sound)
    word_entry.sounds.extend(sounds)