Coverage for src/wiktextract/extractor/vi/sound.py: 36%

355 statements  

coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

import re
from dataclasses import dataclass

from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..share import set_sound_file_url_fields
from .models import Hyphenation, Sound, WordEntry
from .tags import translate_raw_tags


def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    for node in level_node.children:
        if isinstance(node, TemplateNode):
            if node.template_name == "vie-pron":
                extract_vie_pron_template(wxr, base_data, node)
            elif node.template_name in [
                "âm thanh-IPA",
                "pron-audio",
                "audio-for-pron",
            ]:
                extract_pron_audio_template(wxr, base_data, node)
            elif node.template_name == "tyz-IPA":
                extract_tyz_ipa_template(wxr, base_data, node)
            elif node.template_name in ["zh-pron", "zho-pron"]:
                extract_zh_pron_template(wxr, base_data, node)
            elif node.template_name in ["th-pron", "tha-pron"]:
                extract_th_pron_template(wxr, base_data, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, list_item)
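# The dispatcher above is typically invoked once per pronunciation
# ("Cách phát âm") section.  A minimal driver sketch (illustrative only;
# assumes a configured WiktextractContext "wxr", a parsed page tree "tree"
# and a prepared WordEntry "base_data"):
#
#     for level_node in tree.find_child_recursively(NodeKind.LEVEL3):
#         if clean_node(wxr, None, level_node.largs) == "Cách phát âm":
#             extract_sound_section(wxr, base_data, level_node)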

def extract_sound_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name in ["âm thanh", "Audio", "Âm thanh"]:
                extract_audio_template(wxr, base_data, node, 1)
            elif node.template_name in ["âm thanh-2", "audio"]:
                extract_audio_template(wxr, base_data, node, 2)
            elif node.template_name in [
                "IPA",
                "IPA2",
                "IPA3",
                "IPA4",
                "fra-IPA",
                "fr-IPA",
            ]:
                extract_ipa_template(wxr, base_data, node, "IPA")
            elif node.template_name in ["enPR", "AHD"]:
                extract_ipa_template(wxr, base_data, node, "enPR")
            elif node.template_name in ["rhymes", "rhyme"]:
                extract_rhymes_template(wxr, base_data, node)
            elif node.template_name in ["hyphenation", "hyph"]:
                extract_hyphenation_template(wxr, base_data, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, child_list_item)
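# TableHeader (below) records one column header of the expanded {{vie-pron}}
# table: "index" is the 0-based column where the header cell starts and
# "span" its colspan, so the header covers columns [index, index + span).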

@dataclass
class TableHeader:
    text: str
    index: int
    span: int

def extract_vie_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if col_index == 0:
                        col_headers.clear()
                    colspan = int(cell.attrs.get("colspan", "1"))
                    col_headers.append(
                        TableHeader(
                            clean_node(wxr, None, cell), col_index, colspan
                        )
                    )
                    col_index += colspan
                else:
                    colspan = int(cell.attrs.get("colspan", "1"))
                    for span_tag in cell.find_html(
                        "span", attr_name="class", attr_value="IPA"
                    ):
                        extract_vie_pron_span_tag(
                            wxr,
                            base_data,
                            span_tag,
                            col_index,
                            colspan,
                            col_headers,
                        )
                    col_index += colspan
                    for td_tag in cell.find_html("td"):
                        colspan = int(td_tag.attrs.get("colspan", "1"))
                        for span_tag in td_tag.find_html(
                            "span", attr_name="class", attr_value="IPA"
                        ):
                            extract_vie_pron_span_tag(
                                wxr,
                                base_data,
                                span_tag,
                                col_index,
                                colspan,
                                col_headers,
                            )
                        col_index += colspan

    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)


def extract_vie_pron_span_tag(
    wxr: WiktextractContext,
    base_data: WordEntry,
    span_tag: HTMLNode,
    index: int,
    colspan: int,
    col_headers: list[TableHeader],
):
    ipa = clean_node(wxr, None, span_tag)
    if ipa != "":
        sound = Sound(ipa=ipa)
        for header in col_headers:
            if (
                index < header.index + header.span
                and index + colspan > header.index
            ):
                sound.raw_tags.append(header.text)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
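# The header match in extract_vie_pron_span_tag is an interval-overlap
# test: a data cell covering columns [index, index + colspan) picks up
# every header covering [header.index, header.index + header.span) that
# intersects it.  For example, a cell at column 2 with colspan 2 overlaps
# a header starting at column 0 with span 3, but not one with span 2.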

def extract_pron_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    file = clean_node(wxr, None, t_node.template_parameters.get("file", ""))
    if file == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, file, sound)
    place = clean_node(wxr, None, t_node.template_parameters.get("place", ""))
    if place != "":
        sound.raw_tags.append(place)
    sound.ipa = clean_node(
        wxr, None, t_node.template_parameters.get("pron", "")
    )
    translate_raw_tags(sound)
    base_data.sounds.append(sound)


def extract_audio_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    start_arg: int,
):
    # https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh
    # https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh-2
    file = clean_node(wxr, None, t_node.template_parameters.get(start_arg, ""))
    if file == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, file, sound)
    raw_tag = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg + 1, "")
    )
    if raw_tag != "":
        sound.raw_tags.append(raw_tag)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
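# start_arg selects the positional parameter holding the audio filename:
# 1 for {{âm thanh}} and its aliases, 2 for {{âm thanh-2}} and {{audio}},
# whose filename sits one slot later; the next positional parameter, if
# present, becomes a raw tag on the sound.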

def extract_tyz_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sound = Sound()
            for node in list_item.children:
                if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
                    raw_tag = clean_node(wxr, None, node)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                elif (
                    isinstance(node, HTMLNode)
                    and node.tag == "span"
                    and "IPA" in node.attrs.get("class", "").split()
                ):
                    sound.ipa = clean_node(wxr, None, node)
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    clean_node(wxr, base_data, node)
            if sound.ipa != "":
                base_data.sounds.append(sound)


def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    ipa_class: str,
):
    # https://vi.wiktionary.org/wiki/Bản_mẫu:IPA
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    raw_tags = []
    for span_tag in expanded_node.find_html("span"):
        class_names = span_tag.attrs.get("class", "").split()
        if "qualifier-content" in class_names:
            raw_tag = clean_node(wxr, None, span_tag)
            if raw_tag != "":
                raw_tags.append(raw_tag)
        elif ipa_class in class_names:
            ipa = clean_node(wxr, None, span_tag)
            if ipa != "":
                sound = Sound(ipa=ipa, raw_tags=raw_tags)
                translate_raw_tags(sound)
                base_data.sounds.append(sound)

    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)


def extract_rhymes_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    # https://vi.wiktionary.org/wiki/Bản_mẫu:rhymes
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="IPA"
    ):
        rhyme = clean_node(wxr, None, span_tag)
        if rhyme != "":
            base_data.sounds.append(Sound(rhymes=rhyme))

    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)


def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    # https://vi.wiktionary.org/wiki/Bản_mẫu:hyphenation
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        h_str = clean_node(wxr, None, span_tag)
        h_data = Hyphenation()
        # syllables are separated by the hyphenation point "‧", not an
        # ASCII hyphen
        for part in h_str.split("‧"):
            part = part.strip()
            if part != "":
                h_data.parts.append(part)
        if len(h_data.parts) > 0:
            base_data.hyphenations.append(h_data)


def extract_homophone_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for link_node in list_item.find_child(NodeKind.LINK):
                homophone = clean_node(wxr, None, link_node)
                if homophone != "":
                    base_data.sounds.append(Sound(homophone=homophone))


def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    seen_lists = set()
    sounds = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node not in seen_lists:
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                sounds.extend(
                    extract_zh_pron_list_item(wxr, list_item, [], seen_lists)
                )
    for sound in sounds:
        translate_raw_tags(sound)
    base_data.sounds.extend(sounds)
    clean_node(wxr, base_data, expanded_node)
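# find_child_recursively also yields lists nested inside other lists, so
# extract_zh_pron_list_item records every sub-list it descends into in
# "seen_lists" and the loop above skips those, preventing double extraction.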

def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith(("File:", "Tập tin:")):
                    filename = link_str.removeprefix("File:").removeprefix(
                        "Tập tin:"
                    )
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # remove the "ghi chú" (help) <sup> tag
                    if is_first_small_tag:
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                # a table following the "Đồng âm" (homophones) label
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "Đồng âm"
                ):
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        extract_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds


def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    raw_tags = []
    if "(" not in raw_tag_text:
        # " và " means "and"; also drop a leading "bao gồm" ("including")
        for raw_tag in re.split(r",|:|;| và ", raw_tag_text):
            raw_tag = raw_tag.strip().removeprefix("bao gồm").strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
    else:
        processed_offsets = []
        for match in re.finditer(r"\([^()]+\)", raw_tag_text):
            processed_offsets.append((match.start(), match.end()))
            raw_tags.extend(
                split_zh_pron_raw_tag(
                    raw_tag_text[match.start() + 1 : match.end() - 1]
                )
            )
        not_processed = ""
        last_end = 0
        for start, end in processed_offsets:
            not_processed += raw_tag_text[last_end:start]
            last_end = end
        not_processed += raw_tag_text[last_end:]
        if not_processed != raw_tag_text:
            raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags
        else:
            raw_tags.append(not_processed)
    return raw_tags
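# Illustration (hypothetical input):
#     split_zh_pron_raw_tag("Quan thoại (Bính âm, Chú âm)")
# returns ["Quan thoại", "Bính âm", "Chú âm"]: each parenthesised group is
# split recursively and the text outside the parentheses is prepended.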

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds


def split_zh_pron(zh_pron: str) -> list[str]:
    # split by commas and other separators that are outside parentheses
    parentheses = 0
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/")))
            and parentheses == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        elif c == "(":
            parentheses += 1
            pron += c
        elif c == ")":
            parentheses -= 1
            pron += c
        else:
            pron += c

    if pron.strip() != "":
        pron_list.append(pron)
    return pron_list
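# Illustration (hypothetical input):
#     split_zh_pron("ka1, ka2 (ka3, ka4)")
# yields ["ka1", " ka2 (ka3, ka4)"]: the comma inside the parentheses does
# not split, and the last segment is appended unstripped (callers strip
# it).  "/" only separates when the string does not itself start with "/",
# so a /slash-delimited/ IPA transcription stays in one piece.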

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            span_str = clean_node(wxr, None, span_tag)
            if (
                span_str not in ["", "/"]
                and span_lang != ""
                and span_class in ["Hant", "Hans", "Hani"]
            ):
                sound = Sound(homophone=span_str, raw_tags=raw_tags)
                if span_class == "Hant":
                    sound.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    sound.tags.append("Simplified-Chinese")
                sounds.append(sound)
    return sounds
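# "Hant"/"Hans" mark traditional/simplified script forms; the bare "/"
# spans skipped above are presumably just the separators between such
# variant pairs in the homophone table.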

def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    # https://vi.wiktionary.org/wiki/Bản mẫu:th-pron
    @dataclass
    class TableHeader:
        text: str
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # keep only headers whose rowspan still covers this row
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(Tiêu chuẩn) IPA"):
                    field = "ipa"
                elif header_str.startswith("Từ đồng âm"):
                    field = "homophone"
                elif header_str == "Âm thanh":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    row_headers.append(TableHeader(header_str, rowspan))

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tag = ""
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                raw_tag = node_str.strip("[]")
                            elif len(sounds) > 0:
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound()
                                setattr(sound, field, node_str)
                                if raw_tag != "":
                                    sound.raw_tags.append(raw_tag)
                                for header in row_headers:
                                    sound.raw_tags.append(header.text)
                                translate_raw_tags(sound)
                                sounds.append(sound)

    base_data.sounds.extend(sounds)
    clean_node(wxr, base_data, expanded_node)