Coverage for src/wiktextract/extractor/th/sound.py: 48%
295 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..share import set_sound_file_url_fields
15from .models import Hyphenation, Sound, WordEntry
16from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Process a pronunciation section.

    Handles templates placed directly under the section level node as
    well as templates nested inside list items.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        extract_sound_template(wxr, base_data, template)
    for child_list in level_node.find_child(NodeKind.LIST):
        for item in child_list.find_child(NodeKind.LIST_ITEM):
            for template in item.find_child(NodeKind.TEMPLATE):
                extract_sound_template(wxr, base_data, template)
def extract_sound_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Dispatch a pronunciation template to its dedicated extractor.

    Unknown template names are silently ignored.
    """
    handlers = {
        "IPA": extract_ipa_template,
        "X-SAMPA": extract_x_sampa_template,
        "enPR": extract_enpr_template,
        "audio": extract_audio_template,
        "th-pron": extract_th_pron_template,
        "lo-pron": extract_lo_pron_template,
        "ja-pron": extract_ja_pron_template,
        "ja-IPA": extract_ja_pron_template,
        "zh-pron": extract_zh_pron_template,
    }
    handler = handlers.get(t_node.template_name)
    if handler is not None:
        handler(wxr, base_data, t_node)
def extract_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand an ``IPA`` template and harvest its pronunciation spans."""
    parsed = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    extract_ipa_li_tag(wxr, base_data, parsed)
    # pick up any category links produced by the expansion
    clean_node(wxr, base_data, parsed)
def extract_ipa_li_tag(
    wxr: WiktextractContext, base_data: WordEntry, li_tag: HTMLNode
):
    """Collect IPA and romanization spans from an expanded list item.

    A ``qualifier-content`` span sets a raw tag that is attached to every
    subsequent sound extracted from the same list item.
    """
    raw_tag = ""
    for span_tag in li_tag.find_html_recursively("span"):
        classes = span_tag.attrs.get("class", "").split()
        if "qualifier-content" in classes:
            raw_tag = clean_node(wxr, None, span_tag)
            continue
        # map the span's class to the Sound field it fills
        if "IPA" in classes:
            field = "ipa"
        elif "Latn" in classes:
            field = "roman"
        else:
            continue
        value = clean_node(wxr, None, span_tag)
        if value == "":
            continue
        sound = Sound()
        setattr(sound, field, value)
        if raw_tag != "":
            sound.raw_tags.append(raw_tag)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
def extract_ja_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand a ``ja-pron``/``ja-IPA`` template and process each ``<li>``."""
    parsed = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in parsed.find_html_recursively("li"):
        extract_ipa_li_tag(wxr, base_data, li_tag)
    clean_node(wxr, base_data, parsed)
def extract_x_sampa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract the first argument of an ``X-SAMPA`` template as an IPA value."""
    ipa_text = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if ipa_text != "":
        base_data.sounds.append(Sound(ipa=ipa_text, tags=["X-SAMPA"]))
def extract_enpr_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract the first argument of an ``enPR`` template."""
    enpr_text = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if enpr_text != "":
        base_data.sounds.append(Sound(enpr=enpr_text))
def extract_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an ``audio`` template.

    Argument 2 is the sound file name; argument "a" is an optional
    comma-separated list of accent qualifiers stored as raw tags.
    """
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if filename == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    accents = clean_node(wxr, None, t_node.template_parameters.get("a", ""))
    sound.raw_tags.extend(
        part.strip() for part in accents.split(",") if part.strip() != ""
    )
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
    # pick up category links attached to the template
    clean_node(wxr, base_data, t_node)
@dataclass
class TableHeader:
    """A pronunciation-table header cell."""

    text: str  # cleaned header text, used as a raw tag
    rowspan: int  # remaining number of data rows this header applies to
def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract sounds from an expanded ``th-pron`` table.

    https://th.wiktionary.org/wiki/แม่แบบ:th-pron
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_tag in expanded_node.find_html("table"):
        active_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # drop headers whose rowspan is exhausted; the survivors
            # consume one more row
            active_headers = [h for h in active_headers if h.rowspan > 1]
            for header in active_headers:
                header.rowspan -= 1
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(มาตรฐาน) สัทอักษรสากล"):
                    field = "ipa"
                elif header_str.startswith("คำพ้องเสียง"):
                    field = "homophone"
                elif header_str == "ไฟล์เสียง":
                    field = "audio"
                elif header_str != "":
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    rowspan = (
                        int(rowspan_str)
                        if re.fullmatch(r"\d+", rowspan_str)
                        else 1
                    )
                    active_headers.append(TableHeader(header_str, rowspan))

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            base_data.sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            base_data.sounds.append(Sound(homophone=word))
                else:
                    cell_text = clean_node(wxr, None, td_tag)
                    if cell_text != "":
                        sound = Sound()
                        setattr(sound, field, cell_text)
                        sound.raw_tags.extend(h.text for h in active_headers)
                        translate_raw_tags(sound)
                        base_data.sounds.append(sound)

    clean_node(wxr, base_data, expanded_node)
def extract_lo_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract sounds and hyphenation from an expanded ``lo-pron`` list.

    https://th.wiktionary.org/wiki/แม่แบบ:lo-pron
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            field = "other"
            raw_tag = ""
            for child in list_item.children:
                if isinstance(child, HTMLNode) and child.tag == "span":
                    span_class = child.attrs.get("class", "")
                    if "qualifier-content" in span_class:
                        raw_tag = clean_node(wxr, None, child)
                    elif span_class == "IPA":
                        ipa_str = clean_node(wxr, None, child)
                        if ipa_str != "":
                            sound = Sound(ipa=ipa_str)
                            if raw_tag != "":
                                sound.raw_tags.append(raw_tag)
                                translate_raw_tags(sound)
                            base_data.sounds.append(sound)
                    elif (
                        child.attrs.get("lang", "") == "lo"
                        and field == "hyphenation"
                    ):
                        span_str = clean_node(wxr, None, child)
                        if span_str != "":
                            base_data.hyphenations.append(
                                Hyphenation(parts=span_str.split("-"))
                            )
                elif (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.LINK
                ):
                    link_str = clean_node(wxr, None, child)
                    if link_str == "สัทอักษรสากล":
                        field = "ipa"
                    elif link_str != "" and field == "rhymes":
                        base_data.sounds.append(Sound(rhymes=link_str))
                elif isinstance(child, str) and child.strip().endswith(":"):
                    # a plain-text label switches the field for what follows
                    label = child.strip()
                    if label == "การแบ่งพยางค์:":
                        field = "hyphenation"
                    elif label == "สัมผัส:":
                        field = "rhymes"

    clean_node(wxr, base_data, expanded_node)
def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract the nested pronunciation lists of a ``zh-pron`` template."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    visited = set()  # lists already handled via recursion
    collected = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node in visited:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, list_item, [], visited)
            )
    for sound in collected:
        translate_raw_tags(sound)
    base_data.sounds.extend(collected)
    clean_node(wxr, base_data, expanded_node)
def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Walk one zh-pron list item and return its sounds.

    Raw tags accumulate along the way and flow into nested child lists;
    nested lists are recorded in *seen_lists* so the caller skips them.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for child in list_item_node.children:
        if not isinstance(child, WikiNode):
            continue
        if child.kind == NodeKind.LINK:
            target = clean_node(wxr, None, child.largs)
            text = clean_node(wxr, None, child)
            if target.startswith(("File:", "ไฟล์:")):
                filename = target.removeprefix("File:").removeprefix("ไฟล์:")
                audio = Sound(raw_tags=current_tags)
                set_sound_file_url_fields(wxr, filename, audio)
                sounds.append(audio)
            elif text != "":
                current_tags.append(text.strip("()"))
        elif isinstance(child, HTMLNode):
            if child.tag == "small":
                if is_first_small_tag:
                    # drop <sup> footnote markers before reading the tag text
                    no_sup = [
                        n
                        for n in child.children
                        if not (isinstance(n, HTMLNode) and n.tag == "sup")
                    ]
                    tag_text = clean_node(wxr, None, no_sup).rstrip(":")
                    current_tags.extend(split_zh_pron_raw_tag(tag_text))
                elif len(sounds) > 0:
                    # later <small> text qualifies the most recent sound
                    sounds[-1].raw_tags.extend(
                        split_zh_pron_raw_tag(clean_node(wxr, None, child))
                    )
                is_first_small_tag = False
            elif child.tag == "span":
                sounds.extend(extract_zh_pron_span(wxr, child, current_tags))
            elif (
                child.tag == "table"
                and len(current_tags) > 0
                and current_tags[-1] == "คำพ้องเสียง"
            ):
                sounds.extend(
                    extract_zh_pron_homophones_table(wxr, child, current_tags)
                )
        elif child.kind == NodeKind.LIST:
            seen_lists.add(child)
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                sounds.extend(
                    extract_zh_pron_list_item(
                        wxr, sub_item, current_tags, seen_lists
                    )
                )
    return sounds
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron qualifier string into individual raw tags.

    Parenthesized groups are split recursively, and the text outside
    them is split on commas, colons, semicolons and " and ". An
    "incl. " prefix on a tag is removed.
    """
    if "(" not in raw_tag_text:
        pieces = re.split(r",|:|;| and ", raw_tag_text)
        return [
            cleaned
            for piece in pieces
            if (cleaned := piece.strip().removeprefix("incl. ").strip()) != ""
        ]
    raw_tags = []
    consumed_spans = []
    for match in re.finditer(r"\([^()]+\)", raw_tag_text):
        consumed_spans.append((match.start(), match.end()))
        inner = raw_tag_text[match.start() + 1 : match.end() - 1]
        raw_tags.extend(split_zh_pron_raw_tag(inner))
    # stitch together whatever lies outside the parenthesized groups
    leftover = ""
    prev_end = 0
    for start, end in consumed_spans:
        leftover += raw_tag_text[prev_end:start]
        prev_end = end
    leftover += raw_tag_text[prev_end:]
    if leftover != raw_tag_text:
        raw_tags = split_zh_pron_raw_tag(leftover) + raw_tags
    else:
        # nothing matched (e.g. an unbalanced "("): keep the text whole
        raw_tags.append(leftover)
    return raw_tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Turn one zh-pron pronunciation ``<span>`` into ``Sound`` objects."""
    sounds = []
    trailing_tags = []  # from a nested <small>, attached to the last sound
    pron_nodes = []
    roman = ""
    phonetic = ""
    for index, child in enumerate(span_tag.children):
        if isinstance(child, HTMLNode) and child.tag == "small":
            trailing_tags = split_zh_pron_raw_tag(clean_node(wxr, None, child))
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "span"
            and "-Latn" in child.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, child).strip("() ")
        elif isinstance(child, str) and child.strip() == "[Phonetic:":
            # everything after this marker is a phonetic respelling
            phonetic = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(child)

    is_ipa = "IPA" in span_tag.attrs.get("class", "")
    for pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        pron = pron.strip("[]: ")
        if len(pron) == 0:
            continue
        if is_ipa:
            sounds.append(Sound(ipa=pron, roman=roman, raw_tags=raw_tags))
        else:
            sounds.append(Sound(zh_pron=pron, roman=roman, raw_tags=raw_tags))
    if len(sounds) > 0:
        sounds[-1].raw_tags.extend(trailing_tags)
    if phonetic != "":
        sounds.append(
            Sound(
                zh_pron=phonetic,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→" and "/" — except that "/" is not a
    separator when the whole string starts with "/" (a single
    /.../-style transcription). Separator characters nested inside
    parentheses are kept verbatim.

    Returns the list of stripped, non-empty fragments.
    """
    depth = 0  # current parenthesis nesting level
    pron_list = []
    pron = ""
    slash_is_literal = zh_pron.startswith("/")
    for c in zh_pron:
        is_separator = c in [",", ";", "→"] or (
            c == "/" and not slash_is_literal
        )
        if is_separator and depth == 0 and len(pron.strip()) > 0:
            pron_list.append(pron.strip())
            pron = ""
        else:
            if c == "(":
                depth += 1
            elif c == ")":
                depth -= 1
            pron += c
    # Bug fix: strip the trailing fragment too — previously only the
    # fragments appended inside the loop were stripped, so the last
    # pronunciation kept surrounding whitespace.
    if pron.strip() != "":
        pron_list.append(pron.strip())
    return pron_list
def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophone words from a zh-pron homophone table."""
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            span_str = clean_node(wxr, None, span_tag)
            # skip empty cells, separators, and non-Han script spans
            if (
                span_str in ["", "/"]
                or span_lang == ""
                or span_class not in ["Hant", "Hans", "Hani"]
            ):
                continue
            sound = Sound(homophone=span_str, raw_tags=raw_tags)
            if span_class == "Hant":
                sound.tags.append("Traditional-Chinese")
            elif span_class == "Hans":
                sound.tags.append("Simplified-Chinese")
            sounds.append(sound)
    return sounds