Coverage for src / wiktextract / extractor / vi / sound.py: 30%

415 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..share import set_sound_file_url_fields 

15from .models import Hyphenation, Sound, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Collect pronunciation data from a sound section into ``base_data``.

    Templates directly under the section heading are dispatched to the
    matching extractor; wiki lists are unpacked item by item.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            extract_sound_template(wxr, base_data, child)
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, item)

28 

29 

def extract_sound_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Dispatch a pronunciation template to its specific extractor.

    NOTE: branch order is significant — the language-specific names that
    end in "-IPA" ("âm thanh-IPA", "ja-IPA", "jpn-IPA") must be matched
    before the generic ``endswith("-IPA")`` fallback further down.
    """
    if t_node.template_name == "vie-pron":
        extract_vie_pron_template(wxr, base_data, t_node)
    elif t_node.template_name in [
        "âm thanh-IPA",
        "pron-audio",
        "audio-for-pron",
    ]:
        extract_pron_audio_template(wxr, base_data, t_node)
    elif t_node.template_name in ["zh-pron", "zho-pron"]:
        extract_zh_pron_template(wxr, base_data, t_node)
    elif t_node.template_name in ["th-pron", "tha-pron"]:
        extract_th_pron_template(wxr, base_data, t_node)
    elif t_node.template_name in ["ja-pron", "ja-IPA", "jpn-IPA", "jpn-pron"]:
        extract_ja_pron_template(wxr, base_data, t_node)
    elif t_node.template_name in ["âm thanh", "Audio", "Âm thanh"]:
        # File name is in the first positional argument.
        extract_audio_template(wxr, base_data, t_node, 1)
    elif t_node.template_name in ["âm thanh-2", "audio"]:
        # These variants put the file name in the second argument.
        extract_audio_template(wxr, base_data, t_node, 2)
    elif t_node.template_name in [
        "IPA",
        "IPA2",
        "IPA3",
        "IPA4",
    ] or t_node.template_name.endswith("-IPA"):
        extract_ipa_template(wxr, base_data, t_node, "IPA")
    elif t_node.template_name in ["enPR", "AHD"]:
        extract_ipa_template(wxr, base_data, t_node, "enPR")
    elif t_node.template_name in ["rhymes", "rhyme"]:
        extract_rhymes_template(wxr, base_data, t_node)
    elif t_node.template_name in ["hyphenation", "hyph"]:
        extract_hyphenation_template(wxr, base_data, t_node)
    elif t_node.template_name in ["homophones", "homophone", "hmp"]:
        extract_homophones_template(wxr, base_data, t_node)

66 

67 

def extract_sound_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    """Process one pronunciation list item, recursing into nested lists."""
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            extract_sound_template(wxr, base_data, child)
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, nested_item)

77 

78 

@dataclass
class TableHeader:
    """Column header of an expanded "vie-pron" pronunciation table."""

    # Cleaned header text; applied as a raw tag to IPA cells it covers.
    text: str
    # Zero-based column index where the header cell starts.
    index: int
    # Number of table columns the header cell spans (its "colspan").
    span: int

84 

85 

def extract_vie_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Parse the expanded "vie-pron" table into IPA Sound entries.

    The template renders a table: header cells carry labels (regional
    variety names), data cells carry ``<span class="IPA">`` values.
    Column positions and colspans are tracked so each IPA value can be
    tagged with the header(s) covering its column range.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    # A header cell at column 0 starts a new header row;
                    # drop the headers of the previous header row.
                    if col_index == 0:
                        col_headers.clear()
                    colspan = int(cell.attrs.get("colspan", "1"))
                    col_headers.append(
                        TableHeader(
                            clean_node(wxr, None, cell), col_index, colspan
                        )
                    )
                    col_index += colspan
                else:
                    colspan = int(cell.attrs.get("colspan", "1"))
                    for span_tag in cell.find_html(
                        "span", attr_name="class", attr_value="IPA"
                    ):
                        extract_vie_pron_span_tag(
                            wxr,
                            base_data,
                            span_tag,
                            col_index,
                            colspan,
                            col_headers,
                        )
                    col_index += colspan
                    # Some cells contain embedded <td> HTML; handle their
                    # IPA spans with the <td>'s own colspan.
                    for td_tag in cell.find_html("td"):
                        colspan = int(td_tag.attrs.get("colspan", "1"))
                        for span_tag in td_tag.find_html(
                            "span", attr_name="class", attr_value="IPA"
                        ):
                            extract_vie_pron_span_tag(
                                wxr,
                                base_data,
                                span_tag,
                                col_index,
                                colspan,
                                col_headers,
                            )
                        col_index += colspan

    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)

140 

141 

def extract_vie_pron_span_tag(
    wxr: WiktextractContext,
    base_data: WordEntry,
    span_tag: HTMLNode,
    index: int,
    colspan: int,
    col_headers: list[TableHeader],
):
    """Create a Sound from one IPA span of the "vie-pron" table.

    Args:
        span_tag: the ``<span class="IPA">`` element with the transcription.
        index: zero-based column index of the cell containing the span.
            (Fix: this was annotated ``str``, but the caller passes the
            integer column counter and it is compared arithmetically below.)
        colspan: number of columns the cell spans.
        col_headers: active column headers; overlapping headers become
            raw tags on the resulting sound.
    """
    ipa = clean_node(wxr, None, span_tag)
    if ipa != "":
        sound = Sound(ipa=ipa)
        for header in col_headers:
            # Tag with every header whose range
            # [header.index, header.index + header.span) overlaps the
            # cell's range [index, index + colspan).
            if (
                index < header.index + header.span
                and index + colspan > header.index
            ):
                sound.raw_tags.append(header.text)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)

161 

162 

def extract_pron_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an audio file plus IPA from "pron-audio"-style templates.

    Reads the "file", "place" and "pron" template arguments; "place"
    becomes a raw tag, "pron" fills the IPA field.
    """
    filename = clean_node(
        wxr, None, t_node.template_parameters.get("file", "")
    )
    if filename == "":
        return
    audio = Sound()
    set_sound_file_url_fields(wxr, filename, audio)
    place_name = clean_node(
        wxr, None, t_node.template_parameters.get("place", "")
    )
    if place_name != "":
        audio.raw_tags.append(place_name)
    audio.ipa = clean_node(
        wxr, None, t_node.template_parameters.get("pron", "")
    )
    translate_raw_tags(audio)
    base_data.sounds.append(audio)

179 

180 

def extract_audio_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    start_arg: int,
):
    """Extract an audio file and optional qualifier from audio templates.

    https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh
    https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh-2

    ``start_arg`` is the positional argument holding the file name; the
    next positional argument may carry a qualifier used as a raw tag.
    """
    filename = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg, "")
    )
    if filename == "":
        return
    audio = Sound()
    set_sound_file_url_fields(wxr, filename, audio)
    qualifier = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg + 1, "")
    )
    if qualifier != "":
        audio.raw_tags.append(qualifier)
    translate_raw_tags(audio)
    base_data.sounds.append(audio)

201 

202 

def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    ipa_class: str,
):
    """Extract transcriptions from IPA/enPR templates.

    https://vi.wiktionary.org/wiki/Bản_mẫu:IPA

    Rendered lists are processed per item; any remaining top-level nodes
    are wrapped in a temporary root node and processed as one item.
    ``ipa_class`` is the CSS class that marks transcription spans.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    leftovers = []
    for child in expanded.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_ipa_list_item(wxr, base_data, item, ipa_class)
        else:
            leftovers.append(child)
    if leftovers:
        wrapper = WikiNode(NodeKind.ROOT, 0)
        wrapper.children = leftovers
        extract_ipa_list_item(wxr, base_data, wrapper, ipa_class)
    clean_node(wxr, base_data, expanded)

225 

226 

def extract_ipa_list_item(
    wxr: WiktextractContext,
    base_data: WordEntry,
    list_item: WikiNode,
    class_name: str,
):
    """Extract transcriptions from one list item of an IPA/enPR template.

    Italic nodes and qualifier/label spans contribute raw tags; spans
    whose CSS class contains ``class_name`` supply the transcriptions.
    """
    tags = []
    for italic in list_item.find_child(NodeKind.ITALIC):
        italic_text = clean_node(wxr, None, italic)
        if italic_text != "":
            tags.append(italic_text)
    for span in list_item.find_html_recursively("span"):
        classes = span.attrs.get("class", "").split()
        if "qualifier-content" in classes or "label-content" in classes:
            for part in clean_node(wxr, None, span).split(","):
                part = part.strip()
                if part != "":
                    tags.append(part)
        elif class_name in classes:
            sound = Sound(ipa=clean_node(wxr, None, span), raw_tags=tags)
            if sound.ipa != "":
                translate_raw_tags(sound)
                base_data.sounds.append(sound)

252 

253 

def extract_rhymes_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract rhymes from the "rhymes" template.

    https://vi.wiktionary.org/wiki/Bản_mẫu:rhymes
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for ipa_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="IPA"
    ):
        rhyme_text = clean_node(wxr, None, ipa_span)
        if rhyme_text != "":
            base_data.sounds.append(Sound(rhymes=rhyme_text))

    # Run top-level links through clean_node with base_data so any page
    # data they carry is recorded.
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)

270 

271 

def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract syllable breaks from the "hyphenation" template.

    https://vi.wiktionary.org/wiki/Bản_mẫu:hyphenation
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        text = clean_node(wxr, None, span)
        # Syllables are joined by the hyphenation point character.
        parts = [p.strip() for p in text.split("‧") if p.strip()]
        if len(parts) > 0:
            base_data.hyphenations.append(Hyphenation(parts=parts))

289 

290 

def extract_homophone_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Extract homophones from a homophone section.

    Every wiki link inside the section's list items becomes one Sound
    with its ``homophone`` field set.

    Fix: the loop variable was named ``list``, shadowing the builtin;
    renamed to ``list_node``.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for link_node in list_item.find_child(NodeKind.LINK):
                homophone = clean_node(wxr, None, link_node)
                if homophone != "":
                    base_data.sounds.append(Sound(homophone=homophone))

300 

301 

def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Chinese pronunciation data from a "zh-pron" template."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    visited = set()
    collected = []
    for list_node in expanded.find_child_recursively(NodeKind.LIST):
        if list_node in visited:
            # Already handled while recursing into a parent list item.
            continue
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, item, [], visited)
            )
    for sound in collected:
        translate_raw_tags(sound)
    base_data.sounds.extend(collected)
    clean_node(wxr, base_data, expanded)

320 

321 

def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively parse one zh-pron list item into Sound objects.

    Args:
        raw_tags: tags inherited from parent list items; copied at entry
            so sibling items do not see each other's additions.
        seen_lists: nested lists handled here are added to this set so
            the top-level caller does not process them a second time.

    Returns:
        Sounds collected from this item and all nested items.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                # File links become audio sounds; other non-empty links
                # become raw tags (parentheses stripped).
                if link_str.startswith(("File:", "Tập tin:")):
                    filename = link_str.removeprefix("File:").removeprefix(
                        "Tập tin:"
                    )
                    # NOTE(review): current_tags is passed by reference;
                    # relies on the model copying the list on construction,
                    # otherwise later appends would leak in — confirm.
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # The first <small> holds tags for the whole item;
                    # later <small> tags qualify the previous sound.
                    # The "ghi chú" (help) <sup> node is excluded.
                    if is_first_small_tag:
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "Đồng âm"
                ):
                    # Homophone table under a "Đồng âm" label.
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                # Mark nested lists as handled and recurse with the tags
                # accumulated so far.
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        extract_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds

390 

391 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron qualifier string into individual raw tags.

    Flat text is split on commas, colons, semicolons and " và " (and),
    dropping a leading "bao gồm" (including).  Parenthesized groups are
    split recursively; tags from text outside the parentheses come first.
    If the text contains "(" but no balanced group, it is kept whole.
    """
    if "(" not in raw_tag_text:
        tags = []
        for piece in re.split(r",|:|;| và ", raw_tag_text):
            piece = piece.strip().removeprefix("bao gồm").strip()
            if piece:
                tags.append(piece)
        return tags
    inner_tags = []
    spans = []
    for m in re.finditer(r"\([^()]+\)", raw_tag_text):
        spans.append((m.start(), m.end()))
        # Recurse on the group's contents without the parentheses.
        inner_tags.extend(split_zh_pron_raw_tag(m.group()[1:-1]))
    # Stitch together everything outside the matched groups.
    pieces = []
    prev_end = 0
    for start, end in spans:
        pieces.append(raw_tag_text[prev_end:start])
        prev_end = end
    pieces.append(raw_tag_text[prev_end:])
    remainder = "".join(pieces)
    if remainder != raw_tag_text:
        return split_zh_pron_raw_tag(remainder) + inner_tags
    # "(" present but no balanced group matched: keep the text whole.
    return inner_tags + [remainder]

419 

420 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Convert one zh-pron pronunciation <span> into Sound objects.

    Children are sorted into: <small> qualifier tags (applied to the last
    pronunciation only), a nested "*-Latn" span (romanization), a literal
    "[Phonetic:" marker (the remaining children become an extra sound
    tagged "Phonetic"), and the pronunciation text itself, which may hold
    several variants split by :func:`split_zh_pron`.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            # Everything after the marker is the phonetic variant.
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # IPA-classed spans fill "ipa"; others fill "zh_pron".
            # NOTE(review): raw_tags is passed to several Sounds; relies on
            # the model copying the list on construction — confirm.
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    # Qualifiers from <small> apply to the last pronunciation only.
    if len(sounds) > 0:
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds

467 

468 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→", and "/" (the latter only when the
    whole string does not start with "/", so "/ipa/" notation survives).
    Separators inside parentheses are kept.  All pieces but the last are
    stripped; the final piece keeps its surrounding whitespace, matching
    the original behavior.
    """
    results = []
    buf = ""
    depth = 0
    slash_splits = not zh_pron.startswith("/")
    for ch in zh_pron:
        is_sep = ch in (",", ";", "→") or (ch == "/" and slash_splits)
        if is_sep and depth == 0 and buf.strip():
            results.append(buf.strip())
            buf = ""
        else:
            if ch == "(":
                depth += 1
            elif ch == ")":
                depth -= 1
            buf += ch
    if buf.strip():
        results.append(buf)
    return results

494 

495 

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophones from the table rendered inside zh-pron output.

    Only spans that have a ``lang`` attribute and a Chinese script class
    ("Hant", "Hans", "Hani") are kept; "Hant"/"Hans" map to
    traditional/simplified tags.
    """
    homophones = []
    for cell in table.find_html_recursively("td"):
        for span in cell.find_html("span"):
            css_class = span.attrs.get("class", "")
            lang_attr = span.attrs.get("lang", "")
            text = clean_node(wxr, None, span)
            if (
                text in ["", "/"]
                or lang_attr == ""
                or css_class not in ["Hant", "Hans", "Hani"]
            ):
                continue
            homophone = Sound(homophone=text, raw_tags=raw_tags)
            if css_class == "Hant":
                homophone.tags.append("Traditional-Chinese")
            elif css_class == "Hans":
                homophone.tags.append("Simplified-Chinese")
            homophones.append(homophone)
    return homophones

517 

518 

def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Parse the expanded Thai pronunciation table ("th-pron").

    https://vi.wiktionary.org/wiki/Bản mẫu:th-pron

    Row headers select which Sound field the row's cells fill (IPA,
    homophone, audio, or the default "other") and contribute raw tags;
    headers with a rowspan stay active for the spanned rows.
    """

    @dataclass
    class TableHeader:
        # Raw tags contributed by this header cell.
        raw_tags: list[str]
        # Remaining number of rows this header still covers.
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Carry over headers from earlier rows whose rowspan still
            # covers this row, decrementing their remaining span.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(Tiêu chuẩn) IPA"):
                    field = "ipa"
                elif header_str.startswith("Từ đồng âm"):
                    field = "homophone"
                elif header_str == "Âm thanh":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    # Header text lines hold "{...}"-wrapped,
                    # ";"-separated tag lists.
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            # "[...]"-wrapped small text lists extra raw
                            # tags; other small text is treated as the
                            # romanization of the previous sound.
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # NOTE(review): reassigning "field" here
                                # persists for later cells in this row —
                                # confirm that is intended.
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                sounds.append(sound)

    base_data.sounds.extend(sounds)
    clean_node(wxr, base_data, expanded_node)

612 

613 

def extract_homophones_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract homophones from "homophones"/"homophone"/"hmp" templates.

    Spans with the template's language code supply the homophone words;
    "tr" spans add a romanization and "qualifier-content" spans add raw
    tags, both attached to the most recent homophone.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    results = []
    for container in expanded.find_html(
        "span", attr_name="class", attr_value="homophones"
    ):
        for span in container.find_html("span"):
            lang_attr = span.attrs.get("lang", "")
            classes = span.attrs.get("class", "").split()
            if "tr" in classes and len(results) > 0:
                results[-1].roman = clean_node(wxr, None, span)
            elif lang_attr == lang_code:
                word = clean_node(wxr, None, span)
                if word != "":
                    results.append(Sound(homophone=word))
            elif "qualifier-content" in classes and len(results) > 0:
                qualifier = clean_node(wxr, None, span)
                if qualifier != "":
                    results[-1].raw_tags.append(qualifier)
                    translate_raw_tags(results[-1])

    base_data.sounds.extend(results)
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)

643 

644 

def extract_ja_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Japanese pronunciation data from "ja-pron"-style templates.

    Each rendered <li> yields one Sound combining accent labels, IPA,
    romanization, and the lang="ja" span text; an "a"/"audio" template
    argument adds an audio file entry.
    """
    # Pitch-accent pattern names linked from the rendered list items.
    JA_PRON_ACCENTS = {"Nakadaka", "Heiban", "Atamadaka", "Odaka"}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded_node.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "").split()
            if "usage-label-accent" in span_class:
                raw_tag = clean_node(wxr, None, span_tag).strip("() ")
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        for link_node in li_tag.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text in JA_PRON_ACCENTS:
                sound.tags.append(link_text)
        # Keep the entry only if it captured some transcription.
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            base_data.sounds.append(sound)
    # NOTE(review): assumes the "a"/"audio" parameter value is a str;
    # a WikiNode value would make .strip() raise — confirm upstream.
    audio_file = t_node.template_parameters.get(
        "a", t_node.template_parameters.get("audio", "")
    ).strip()
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        base_data.sounds.append(sound)

    clean_node(wxr, base_data, expanded_node)