Coverage for src/wiktextract/extractor/vi/sound.py: 36%
355 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..share import set_sound_file_url_fields
15from .models import Hyphenation, Sound, WordEntry
16from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Dispatch every child of a pronunciation section to its handler.

    Known pronunciation templates are handled directly; plain wiki lists
    are walked item by item via ``extract_sound_list_item``.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name == "vie-pron":
                extract_vie_pron_template(wxr, base_data, child)
            elif name in ("âm thanh-IPA", "pron-audio", "audio-for-pron"):
                extract_pron_audio_template(wxr, base_data, child)
            elif name == "tyz-IPA":
                extract_tyz_ipa_template(wxr, base_data, child)
            elif name in ("zh-pron", "zho-pron"):
                extract_zh_pron_template(wxr, base_data, child)
            elif name in ("th-pron", "tha-pron"):
                extract_th_pron_template(wxr, base_data, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, item)
def extract_sound_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    """Handle one pronunciation list item, recursing into nested lists."""
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in ("âm thanh", "Audio", "Âm thanh"):
                extract_audio_template(wxr, base_data, child, 1)
            elif name in ("âm thanh-2", "audio"):
                extract_audio_template(wxr, base_data, child, 2)
            elif name in ("IPA", "IPA2", "IPA3", "IPA4", "fra-IPA", "fr-IPA"):
                extract_ipa_template(wxr, base_data, child, "IPA")
            elif name in ("enPR", "AHD"):
                extract_ipa_template(wxr, base_data, child, "enPR")
            elif name in ("rhymes", "rhyme"):
                extract_rhymes_template(wxr, base_data, child)
            elif name in ("hyphenation", "hyph"):
                extract_hyphenation_template(wxr, base_data, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, nested_item)
@dataclass
class TableHeader:
    # Plain-text content of a table header cell.
    text: str
    # Zero-based column index where the header starts.
    index: int
    # Number of columns the header covers ("colspan").
    span: int
def extract_vie_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract IPA sounds from the expanded "vie-pron" table template.

    Header cells are collected into ``col_headers`` and later serve as
    raw tags for the IPA spans found in data cells of the same columns.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    # A header cell in the first column starts a fresh
                    # header row, replacing the previous headers.
                    if col_index == 0:
                        col_headers.clear()
                    colspan = int(cell.attrs.get("colspan", "1"))
                    col_headers.append(
                        TableHeader(
                            clean_node(wxr, None, cell), col_index, colspan
                        )
                    )
                    col_index += colspan
                else:
                    colspan = int(cell.attrs.get("colspan", "1"))
                    # IPA spans directly inside the data cell.
                    for span_tag in cell.find_html(
                        "span", attr_name="class", attr_value="IPA"
                    ):
                        extract_vie_pron_span_tag(
                            wxr,
                            base_data,
                            span_tag,
                            col_index,
                            colspan,
                            col_headers,
                        )
                    col_index += colspan
                    # Some cells hold nested ``td`` tags; scan those for
                    # IPA spans too, advancing the column index per td.
                    for td_tag in cell.find_html("td"):
                        colspan = int(td_tag.attrs.get("colspan", "1"))
                        for span_tag in td_tag.find_html(
                            "span", attr_name="class", attr_value="IPA"
                        ):
                            extract_vie_pron_span_tag(
                                wxr,
                                base_data,
                                span_tag,
                                col_index,
                                colspan,
                                col_headers,
                            )
                        col_index += colspan

    # Top-level links (e.g. categories) are recorded on base_data.
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)
def extract_vie_pron_span_tag(
    wxr: WiktextractContext,
    base_data: WordEntry,
    span_tag: HTMLNode,
    index: int,  # annotation fixed: was ``str`` but used arithmetically
    colspan: int,
    col_headers: list[TableHeader],
):
    """Create a Sound from one IPA span, tagged with overlapping headers.

    A header applies when its column range
    [header.index, header.index + header.span) overlaps the cell range
    [index, index + colspan).
    """
    ipa = clean_node(wxr, None, span_tag)
    if ipa != "":
        sound = Sound(ipa=ipa)
        for header in col_headers:
            if (
                index < header.index + header.span
                and index + colspan > header.index
            ):
                sound.raw_tags.append(header.text)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
def extract_pron_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an audio file with optional place tag and IPA.

    Reads the "file", "place" and "pron" template parameters; does
    nothing when no file name is given.
    """
    filename = clean_node(
        wxr, None, t_node.template_parameters.get("file", "")
    )
    if filename == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    place_str = clean_node(
        wxr, None, t_node.template_parameters.get("place", "")
    )
    if place_str != "":
        sound.raw_tags.append(place_str)
    sound.ipa = clean_node(
        wxr, None, t_node.template_parameters.get("pron", "")
    )
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_audio_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    start_arg: int,
):
    """Extract an audio file plus an optional raw tag.

    ``start_arg`` is the positional parameter holding the file name; the
    following positional parameter, if any, is a dialect/label tag.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh
    # https://vi.wiktionary.org/wiki/Bản_mẫu:âm_thanh-2
    filename = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg, "")
    )
    if filename == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    tag_str = clean_node(
        wxr, None, t_node.template_parameters.get(start_arg + 1, "")
    )
    if tag_str != "":
        sound.raw_tags.append(tag_str)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_tyz_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Tày IPA values from the expanded "tyz-IPA" template.

    Each list item holds optional italic dialect labels and a
    ``span`` element with class "IPA" containing the transcription.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Renamed loop variable: the original shadowed the builtin ``list``.
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sound = Sound()
            for node in list_item.children:
                if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
                    # Italic text is a dialect/register label.
                    raw_tag = clean_node(wxr, None, node)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                elif (
                    isinstance(node, HTMLNode)
                    and node.tag == "span"
                    and "IPA" in node.attrs.get("class", "").split()
                ):
                    sound.ipa = clean_node(wxr, None, node)
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    # Links (e.g. categories) are recorded on base_data.
                    clean_node(wxr, base_data, node)
            if sound.ipa != "":
                base_data.sounds.append(sound)
def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    ipa_class: str,
):
    """Extract pronunciations from an expanded IPA-family template.

    ``ipa_class`` is the CSS class ("IPA" or "enPR") marking
    pronunciation spans; "qualifier-content" spans encountered earlier
    accumulate as raw tags for the following pronunciations.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:IPA
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    raw_tags = []
    for span_tag in expanded_node.find_html("span"):
        class_names = span_tag.attrs.get("class", "").split()
        if "qualifier-content" in class_names:
            raw_tag = clean_node(wxr, None, span_tag)
            if raw_tag != "":
                raw_tags.append(raw_tag)
        elif ipa_class in class_names:
            ipa = clean_node(wxr, None, span_tag)
            if ipa != "":
                # Pass a copy: the original handed the shared ``raw_tags``
                # list to every Sound, so a later mutation of one sound's
                # raw_tags could leak into its siblings (matches the
                # ``raw_tags[:]`` discipline used elsewhere in this file).
                sound = Sound(ipa=ipa, raw_tags=list(raw_tags))
                translate_raw_tags(sound)
                base_data.sounds.append(sound)

    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)
def extract_rhymes_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract rhyme strings from an expanded "rhymes" template."""
    # https://vi.wiktionary.org/wiki/Bản_mẫu:rhymes
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for ipa_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="IPA"
    ):
        rhyme_str = clean_node(wxr, None, ipa_span)
        if len(rhyme_str) > 0:
            base_data.sounds.append(Sound(rhymes=rhyme_str))

    # Category links go onto base_data.
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)
def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract syllable breaks from an expanded "hyphenation" template."""
    # https://vi.wiktionary.org/wiki/Bản_mẫu:hyphenation
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_node in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        # The span text uses "‧" as the syllable separator.
        pieces = [
            piece.strip()
            for piece in clean_node(wxr, None, span_node).split("‧")
            if piece.strip() != ""
        ]
        if len(pieces) > 0:
            h_data = Hyphenation()
            h_data.parts.extend(pieces)
            base_data.hyphenations.append(h_data)
def extract_homophone_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Collect homophones from wiki links inside the section's lists."""
    # Renamed loop variable: the original shadowed the builtin ``list``.
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for link_node in list_item.find_child(NodeKind.LINK):
                homophone = clean_node(wxr, None, link_node)
                if homophone != "":
                    base_data.sounds.append(Sound(homophone=homophone))
def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Parse an expanded "zh-pron" template and append its sounds.

    Nested lists visited during recursion are tracked in ``visited`` so
    the outer traversal does not process them a second time.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    visited = set()
    collected = []
    for list_node in expanded.find_child_recursively(NodeKind.LIST):
        if list_node in visited:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, list_item, [], visited)
            )
    for sound in collected:
        translate_raw_tags(sound)
    base_data.sounds.extend(collected)
    clean_node(wxr, base_data, expanded)
def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively extract sounds from one "zh-pron" list item.

    ``raw_tags`` are labels inherited from ancestor list items; they are
    copied so sibling items do not share one tag list. Nested lists are
    added to ``seen_lists`` so the top-level caller skips them.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith(("File:", "Tập tin:")):
                    # Audio file link.
                    filename = link_str.removeprefix("File:").removeprefix(
                        "Tập tin:"
                    )
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # Other link text becomes a tag, parentheses stripped.
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # Drop the "ghi chú" (help) <sup> tag before reading
                    # the label text.
                    if is_first_small_tag:
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # Later <small> text qualifies the previous sound.
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "Đồng âm"
                ):
                    # A table right after a "Đồng âm" (homophone) label.
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        extract_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron qualifier string into individual raw tags.

    Parenthesized groups are split recursively; the remaining text is
    split on commas, colons, semicolons and the Vietnamese word " và ",
    with a leading "bao gồm" ("including") removed from each piece.
    """
    if "(" not in raw_tag_text:
        tags = []
        for piece in re.split(r",|:|;| và ", raw_tag_text):
            piece = piece.strip().removeprefix("bao gồm").strip()
            if piece:
                tags.append(piece)
        return tags

    tags = []
    matched_spans = []
    for m in re.finditer(r"\([^()]+\)", raw_tag_text):
        matched_spans.append((m.start(), m.end()))
        inner = raw_tag_text[m.start() + 1 : m.end() - 1]
        tags.extend(split_zh_pron_raw_tag(inner))

    # Stitch together everything outside the matched parentheses.
    remainder = ""
    prev_end = 0
    for begin, finish in matched_spans:
        remainder += raw_tag_text[prev_end:begin]
        prev_end = finish
    remainder += raw_tag_text[prev_end:]

    if remainder != raw_tag_text:
        return split_zh_pron_raw_tag(remainder) + tags
    # No balanced group matched (e.g. unbalanced "("): keep as one tag.
    tags.append(remainder)
    return tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract Sound objects from one pronunciation ``span`` of "zh-pron".

    Spans with the "IPA" class yield ``ipa`` sounds, others ``zh_pron``
    sounds; a trailing "[Phonetic: ...]" run becomes an extra sound
    tagged "Phonetic". A ``span[lang$=-Latn]`` child supplies the
    romanization; ``small`` children supply extra raw tags for the last
    sound.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            # Latin-script sub-span holds the romanization.
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # Pass copies of ``raw_tags``: the original gave every Sound
            # (and the caller) the same list object, so the
            # ``raw_tags.extend(small_tags)`` below could leak tags into
            # sibling sounds and back into the caller's tag list.
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=list(raw_tags))
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=list(raw_tags))
                )
    if len(sounds) > 0:
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→" and "/" — the slash only when the whole
    string does not start with "/" (i.e. it is not a /slash-delimited/
    IPA notation). Separator characters inside parentheses, or appearing
    while the current chunk is still empty, are kept literally.
    """
    depth = 0  # current parenthesis nesting level
    pron_list = []
    pron = ""
    slash_splits = not zh_pron.startswith("/")
    for c in zh_pron:
        if (
            (c in (",", ";", "→") or (c == "/" and slash_splits))
            and depth == 0
            and pron.strip() != ""
        ):
            pron_list.append(pron.strip())
            pron = ""
        else:
            if c == "(":
                depth += 1
            elif c == ")":
                depth -= 1
            pron += c
    # Fix: strip the final chunk like the ones appended in the loop
    # (the original appended it with surrounding whitespace intact).
    if pron.strip() != "":
        pron_list.append(pron.strip())
    return pron_list
def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract homophones from the "Đồng âm" table of "zh-pron".

    Only spans with a ``lang`` attribute and a Han-script class are
    kept; "Hant"/"Hans" classes map to traditional/simplified tags.
    """
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            span_str = clean_node(wxr, None, span_tag)
            if (
                span_str not in ["", "/"]
                and span_lang != ""
                and span_class in ["Hant", "Hans", "Hani"]
            ):
                # Copy ``raw_tags`` so the sounds don't all alias (and
                # can't mutate) the caller's list.
                sound = Sound(homophone=span_str, raw_tags=list(raw_tags))
                if span_class == "Hant":
                    sound.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    sound.tags.append("Simplified-Chinese")
                sounds.append(sound)
    return sounds
def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract IPA, romanization, homophones and audio from "th-pron".

    The expanded template renders as an HTML table; the header of each
    row selects which ``Sound`` field the row's cells populate.
    """
    # https://vi.wiktionary.org/wiki/Bản mẫu:th-pron
    @dataclass
    class TableHeader:
        # Header label, applied as a raw tag to the row's sounds.
        text: str
        # Remaining number of rows this header still covers ("rowspan").
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Keep only headers whose rowspan still reaches this row.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                # "(Tiêu chuẩn)" = standard; "Từ đồng âm" = homophones;
                # "Âm thanh" = audio.
                if header_str.startswith("(Tiêu chuẩn) IPA"):
                    field = "ipa"
                elif header_str.startswith("Từ đồng âm"):
                    field = "homophone"
                elif header_str == "Âm thanh":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    row_headers.append(TableHeader(header_str, rowspan))

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    # Audio rows contain file links.
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tag = ""
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            # "[...]" small text is a qualifier tag; any
                            # other small text is the romanization of
                            # the previous sound.
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                raw_tag = node_str.strip("[]")
                            elif len(sounds) > 0:
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                # NOTE(review): ``field`` may still be
                                # "other" here — confirm Sound accepts an
                                # "other" attribute.
                                sound = Sound()
                                setattr(sound, field, node_str)
                                if raw_tag != "":
                                    sound.raw_tags.append(raw_tag)
                                for header in row_headers:
                                    sound.raw_tags.append(header.text)
                                translate_raw_tags(sound)
                                sounds.append(sound)

    base_data.sounds.extend(sounds)
    clean_node(wxr, base_data, expanded_node)