Coverage for src/wiktextract/extractor/th/sound.py: 50%

456 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-02 00:27 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..share import set_sound_file_url_fields 

15from .models import Hyphenation, Sound, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Process a pronunciation section.

    Handles templates that are direct children of the level node first,
    then templates nested inside list items.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name != "zh-forms":
            extract_sound_template(wxr, base_data, template)
            continue
        # local import — presumably avoids a circular import with .page
        from .page import extract_zh_forms

        extract_zh_forms(wxr, base_data, template)
    for sound_list in level_node.find_child(NodeKind.LIST):
        for item in sound_list.find_child(NodeKind.LIST_ITEM):
            for template in item.find_child(NodeKind.TEMPLATE):
                extract_sound_template(wxr, base_data, template)

33 

34 

def extract_sound_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Dispatch one pronunciation template to its dedicated extractor.

    The check order matters: language-specific templates ("ja-pron",
    "th-pron", "zh-pron", "ko-IPA", ...) must be matched before the
    generic "*-ipa"/"*-pron" suffix fallback.
    """
    name = t_node.template_name
    lower = name.lower()
    if name in ("ja-pron", "ja-IPA"):
        extract_ja_pron_template(wxr, base_data, t_node)
    elif name == "th-pron":
        extract_th_pron_template(wxr, base_data, t_node)
    elif name == "lo-pron":
        extract_lo_pron_template(wxr, base_data, t_node)
    elif name == "zh-pron":
        extract_zh_pron_template(wxr, base_data, t_node)
    elif lower == "ko-ipa":
        extract_ko_ipa_template(wxr, base_data, t_node)
    elif lower == "ipa" or lower.endswith(("-ipa", "-pron")):
        # generic fallback for any remaining IPA/pronunciation template
        extract_ipa_template(wxr, base_data, t_node)
    elif name == "X-SAMPA":
        extract_x_sampa_template(wxr, base_data, t_node)
    elif name == "enPR":
        extract_enpr_template(wxr, base_data, t_node)
    elif name in ("audio", "Audio", "เสียง"):
        extract_audio_template(wxr, base_data, t_node)
    elif name in ("rhymes", "rhyme"):
        extract_rhymes_template(wxr, base_data, t_node)
    elif name in ("homophones", "homophone", "hmp"):
        extract_homophones_template(wxr, base_data, t_node)
    elif name in ("hyphenation", "hyph"):
        extract_hyphenation_template(wxr, base_data, t_node)
    elif name in ("คำอ่านไทย", "คอท"):
        extract_approximate_th_pron(wxr, base_data, t_node)

67 

68 

def extract_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand a generic IPA/pronunciation template and extract sounds.

    List items are processed individually; every other top-level child
    is collected into a synthetic root node and handled as one item.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    leftovers = []
    for child in expanded_node.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                extract_ipa_list_item(wxr, base_data, list_item)
        else:
            leftovers.append(child)
    if leftovers:
        wrapper = WikiNode(NodeKind.ROOT, 0)
        wrapper.children = leftovers
        extract_ipa_list_item(wxr, base_data, wrapper)
    # run clean_node over the whole expansion with base_data so data it
    # records on the entry (e.g. categories) is captured, matching the
    # other extractors in this module
    clean_node(wxr, base_data, expanded_node)

87 

88 

def extract_ipa_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    """Extract Sound entries from one expanded pronunciation list item.

    Qualifier text (italic nodes or qualifier/ib spans) is accumulated
    into ``raw_tags`` and attached to every IPA/romanization span that
    follows in the same item.
    """
    raw_tags = []
    for italic_node in list_item.find_child(NodeKind.ITALIC):
        # Template:vi-ipa location data
        raw_tag = clean_node(wxr, None, italic_node)
        if raw_tag != "":
            raw_tags.append(raw_tag)
    for span_tag in list_item.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "").split()
        if "qualifier-content" in span_class or "ib-content" in span_class:
            # comma-separated qualifier labels (dialect, register, ...)
            for raw_tag in clean_node(wxr, None, span_tag).split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    raw_tags.append(raw_tag)
        elif "IPA" in span_class:
            # NOTE(review): the same raw_tags list object is passed to each
            # Sound — assumes the model copies it at construction; confirm.
            sound = Sound(
                ipa=clean_node(wxr, None, span_tag), raw_tags=raw_tags
            )
            if sound.ipa != "":
                translate_raw_tags(sound)
                base_data.sounds.append(sound)
        elif "Latn" in span_class:
            # Latin-script span → romanization rather than IPA
            sound = Sound(
                roman=clean_node(wxr, None, span_tag), raw_tags=raw_tags
            )
            if sound.roman != "":
                translate_raw_tags(sound)
                base_data.sounds.append(sound)

119 

120 

def extract_ja_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Japanese pronunciations from {{ja-pron}}/{{ja-IPA}}.

    Each ``<li>`` in the expanded template yields one Sound; accent-type
    links (Thai link text) are mapped to English tag names. An optional
    "a"/"audio" template argument adds an audio file entry.
    """
    # Thai accent-pattern link text → English tag
    JA_PRON_ACCENTS = {
        "นากาดากะ": "Nakadaka",
        "เฮบัง": "Heiban",
        "อาตามาดากะ": "Atamadaka",
        "โอดากะ": "Odaka",
    }
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded_node.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "").split()
            if "usage-label-accent" in span_class:
                raw_tag = clean_node(wxr, None, span_tag).strip("() ")
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                # kana/kanji rendering of the pronunciation
                sound.other = clean_node(wxr, None, span_tag)
        for link_node in li_tag.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text in JA_PRON_ACCENTS:
                sound.tags.append(JA_PRON_ACCENTS[link_text])
        # keep the entry only if it carries an IPA or a kana form
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            base_data.sounds.append(sound)
    # NOTE(review): assumes the "a"/"audio" argument value is a str —
    # .strip() would fail on a node value; confirm against callers.
    audio_file = t_node.template_parameters.get(
        "a", t_node.template_parameters.get("audio", "")
    ).strip()
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        base_data.sounds.append(sound)
    clean_node(wxr, base_data, expanded_node)

162 

163 

def extract_x_sampa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an X-SAMPA transcription from the template's first argument."""
    transcription = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if transcription != "":
        base_data.sounds.append(Sound(ipa=transcription, tags=["X-SAMPA"]))

173 

174 

def extract_enpr_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an enPR (AHD-style) respelling from the first argument."""
    enpr_value = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if enpr_value != "":
        base_data.sounds.append(Sound(enpr=enpr_value))

183 

184 

def extract_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an audio file from an audio template.

    Argument 2 is the file name, argument 3 an optional caption; any
    "ib-content" qualifier spans in the expansion become raw tags.
    """
    sound = Sound()
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if filename != "":
        set_sound_file_url_fields(wxr, filename, sound)
        caption = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
        if caption != "":
            sound.raw_tags.append(caption)
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        for span_node in expanded_node.find_html_recursively(
            "span", attr_name="class", attr_value="ib-content"
        ):
            # NOTE(review): tags are not strip()ped here, unlike
            # extract_ipa_list_item — possibly intentional; confirm.
            for raw_tag in clean_node(wxr, None, span_node).split(","):
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
    # record entry-level data (e.g. categories) from the raw template
    clean_node(wxr, base_data, t_node)

207 

208 

def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Thai pronunciations from the {{th-pron}} table.

    Walks the expanded HTML table row by row. Header cells either switch
    the target Sound field ("ipa", "homophone", "audio") or contribute
    raw tags that stay active for ``rowspan`` rows.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:th-pron
    @dataclass
    class TableHeader:
        raw_tags: list[str]  # labels contributed by this header cell
        rowspan: int  # remaining rows this header still applies to

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # drop headers whose rowspan is exhausted; decrement the rest
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                # "(standard) IPA" header → IPA cells follow
                if header_str.startswith("(มาตรฐาน) สัทอักษรสากล"):
                    field = "ipa"
                # "homophones" header
                elif header_str.startswith("คำพ้องเสียง"):
                    field = "homophone"
                # "audio file" header
                elif header_str == "ไฟล์เสียง":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    # header text lines hold ";"-separated labels in "{...}"
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            base_data.sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            base_data.sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            # "[a, b]" qualifier notes preceding the value
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # NOTE(review): once a "romanization" tag is
                                # seen, `field` stays "roman" for the rest of
                                # this row's spans — confirm this is intended.
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                base_data.sounds.append(sound)

    clean_node(wxr, base_data, expanded_node)

298 

299 

def extract_lo_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Lao pronunciations from {{lo-pron}}.

    Iterates each list item's children in order, switching ``field``
    when a section label (hyphenation/rhymes heading text or the IPA
    link) is encountered; subsequent spans/links are interpreted
    according to the current field.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:lo-pron
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            field = "other"
            raw_tag = ""
            for node in list_item.children:
                if isinstance(node, HTMLNode) and node.tag == "span":
                    span_class = node.attrs.get("class", "")
                    if "qualifier-content" in span_class:
                        # remembered and attached to the next IPA span
                        raw_tag = clean_node(wxr, None, node)
                    elif span_class == "IPA":
                        ipa = clean_node(wxr, None, node)
                        if ipa != "":
                            sound = Sound(ipa=ipa)
                            if raw_tag != "":
                                sound.raw_tags.append(raw_tag)
                                translate_raw_tags(sound)
                            base_data.sounds.append(sound)
                    else:
                        span_lang = node.attrs.get("lang", "")
                        if span_lang == "lo" and field == "hyphenation":
                            span_str = clean_node(wxr, None, node)
                            if span_str != "":
                                base_data.hyphenations.append(
                                    Hyphenation(parts=span_str.split("-"))
                                )
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    link_str = clean_node(wxr, None, node)
                    # link text "IPA" marks the IPA section
                    if link_str == "สัทอักษรสากล":
                        field = "ipa"
                    elif link_str != "" and field == "rhymes":
                        base_data.sounds.append(Sound(rhymes=link_str))
                elif isinstance(node, str) and node.strip().endswith(":"):
                    node = node.strip()
                    # "hyphenation:" label
                    if node == "การแบ่งพยางค์:":
                        field = "hyphenation"
                    # "rhymes:" label
                    elif node == "สัมผัส:":
                        field = "rhymes"

    clean_node(wxr, base_data, expanded_node)

346 

347 

def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand {{zh-pron}} and collect sounds from its nested lists.

    Nested lists already processed via recursion are tracked in
    ``visited`` so the recursive find does not handle them twice.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    visited = set()
    collected = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node in visited:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, list_item, [], visited)
            )
    for sound in collected:
        translate_raw_tags(sound)
    base_data.sounds.extend(collected)
    clean_node(wxr, base_data, expanded_node)

366 

367 

def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively extract sounds from one zh-pron list item.

    ``raw_tags`` carries labels inherited from ancestor list items; a
    copy is extended with this item's own labels and passed down.
    Nested lists are added to ``seen_lists`` so the caller's recursive
    scan skips them.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith(("File:", "ไฟล์:")):
                    # audio file link ("ไฟล์:" is the Thai "File:" prefix)
                    filename = link_str.removeprefix("File:").removeprefix(
                        "ไฟล์:"
                    )
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # plain link text becomes a label for this subtree
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # first <small> holds labels for the whole item;
                    # later ones qualify the most recent sound
                    if is_first_small_tag:
                        # remove <sup> tag
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    # last label "homophones" marks a homophones table
                    and current_tags[-1] == "คำพ้องเสียง"
                ):
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        extract_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds

436 

437 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron label string into individual raw tags.

    Without parentheses the text is split on ",", ":", ";" and " and "
    (dropping an "incl. " prefix). With parentheses, each innermost
    "(...)" group is split recursively and appended after the tags from
    the remaining (non-parenthesized) text.
    """
    if "(" not in raw_tag_text:
        tags = []
        for piece in re.split(r",|:|;| and ", raw_tag_text):
            piece = piece.strip().removeprefix("incl. ").strip()
            if piece != "":
                tags.append(piece)
        return tags
    inner_tags = []
    spans = []
    for m in re.finditer(r"\([^()]+\)", raw_tag_text):
        spans.append((m.start(), m.end()))
        inner_tags.extend(split_zh_pron_raw_tag(m.group()[1:-1]))
    # stitch together everything outside the matched groups
    remainder = ""
    prev_end = 0
    for begin, end in spans:
        remainder += raw_tag_text[prev_end:begin]
        prev_end = end
    remainder += raw_tag_text[prev_end:]
    if remainder != raw_tag_text:
        return split_zh_pron_raw_tag(remainder) + inner_tags
    # no group was matched (e.g. unbalanced parentheses): keep as-is
    return [remainder] + inner_tags

465 

466 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract sounds from one zh-pron value span.

    Separates romanization (lang "*-Latn"), trailing <small> labels and
    an optional "[Phonetic: ...]" suffix from the pronunciation text,
    then splits the remaining text into individual pronunciations.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            # everything after this marker is the phonetic rendering
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # span class "IPA" → ipa field, otherwise zh_pron field
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # <small> labels qualify only the last pronunciation
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds

513 

514 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string into individual pronunciations.

    Splits on ",", ";", "→" — and on "/" unless the whole string starts
    with "/" (i.e. is itself a /slash-delimited/ IPA form) — but only
    outside parentheses. Returns the stripped, non-empty segments.
    """
    # split by comma and other symbols that outside parentheses
    depth = 0  # current parenthesis nesting level
    pron_list: list[str] = []
    pron = ""
    for c in zh_pron:
        if (
            (c in (",", ";", "→") or (c == "/" and not zh_pron.startswith("/")))
            and depth == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        elif c == "(":
            depth += 1
            pron += c
        elif c == ")":
            depth -= 1
            pron += c
        else:
            # NOTE: a separator hit while pron is blank also lands here
            # and is kept in the next segment (pre-existing behavior).
            pron += c

    # Fix: the trailing segment used to be appended unstripped, unlike
    # the segments appended inside the loop; strip it for consistency.
    pron = pron.strip()
    if pron != "":
        pron_list.append(pron)
    return pron_list

540 

541 

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophones from a zh-pron homophones table.

    Keeps spans that carry a lang attribute and a Han-script class;
    "Hant"/"Hans" classes additionally tag the script variant.
    """
    script_tags = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            cls = span_tag.attrs.get("class", "")
            lang = span_tag.attrs.get("lang", "")
            text = clean_node(wxr, None, span_tag)
            if (
                text in ("", "/")
                or lang == ""
                or cls not in ("Hant", "Hans", "Hani")
            ):
                continue
            sound = Sound(homophone=text, raw_tags=raw_tags)
            if cls in script_tags:
                sound.tags.append(script_tags[cls])
            sounds.append(sound)
    return sounds

563 

564 

def extract_rhymes_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract rhyme links from an expanded rhymes template."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link in expanded.find_child(NodeKind.LINK):
        rhyme_str = clean_node(wxr, base_data, link)
        if rhyme_str != "":
            base_data.sounds.append(Sound(rhymes=rhyme_str))

575 

576 

def extract_homophones_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract homophones from a homophones template.

    Spans with the entry's lang code become homophones; "tr" spans add
    a romanization and "qualifier-content" spans add raw tags to the
    most recent homophone.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    homophones = []
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for top_span in expanded_node.find_html(
        "span", attr_name="class", attr_value="homophones"
    ):
        for span_tag in top_span.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            span_class = span_tag.attrs.get("class", "").split()
            if "tr" in span_class and len(homophones) > 0:
                # romanization applies to the preceding homophone
                homophones[-1].roman = clean_node(wxr, None, span_tag)
            elif span_lang == lang_code:
                homophone = clean_node(wxr, None, span_tag)
                if homophone != "":
                    homophones.append(Sound(homophone=homophone))
            elif "qualifier-content" in span_class and len(homophones) > 0:
                raw_tag = clean_node(wxr, None, span_tag)
                if raw_tag != "":
                    homophones[-1].raw_tags.append(raw_tag)
                    translate_raw_tags(homophones[-1])

    base_data.sounds.extend(homophones)
    # record data (e.g. categories) from any top-level links
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)

606 

607 

def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract hyphenation data (parts separated by "‧") from the template."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        text = clean_node(wxr, None, span_tag)
        parts = [part.strip() for part in text.split("‧") if part.strip() != ""]
        if len(parts) > 0:
            base_data.hyphenations.append(Hyphenation(parts=parts))

624 

625 

def extract_ko_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract Korean pronunciations from a ko-IPA template.

    <li> items with class "ko-pron__ph" yield phonetic hangeul forms;
    other items yield IPA values (with <i> qualifier labels). The
    romanization table and an optional "a"/"audio" argument are also
    processed.
    """
    sounds = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, word_entry, expanded_node)
    for ul_node in expanded_node.find_html("ul"):
        for li_node in ul_node.find_html("li"):
            if "ko-pron__ph" in li_node.attrs.get("class", ""):
                for span_node in li_node.find_html(
                    "span", attr_name="lang", attr_value="ko"
                ):
                    # "[...]" wrapped, "/"-separated phonetic hangeul
                    hangeul_str = clean_node(wxr, None, span_node).strip("[]")
                    for hangeul in hangeul_str.split("/"):
                        if hangeul != "":
                            sounds.append(
                                Sound(hangeul=hangeul, tags=["phonetic"])
                            )
            else:
                raw_tags = []
                for i_node in li_node.find_html("i"):
                    # skip the "IPA" label itself (Thai "สัทอักษรสากล")
                    for raw_tag in clean_node(wxr, None, i_node).split("/"):
                        if raw_tag not in ["", "สัทอักษรสากล"]:
                            raw_tags.append(raw_tag)
                for span_node in li_node.find_html(
                    "span", attr_name="class", attr_value="IPA"
                ):
                    ipas = clean_node(wxr, None, span_node)
                    # "~" separates free variants
                    for ipa in ipas.split("~"):
                        ipa = ipa.strip()
                        if ipa != "":
                            sound = Sound(ipa=ipa, raw_tags=raw_tags)
                            translate_raw_tags(sound)
                            sounds.append(sound)

    # romanization table: row header = system name, cell = romanization
    for table in expanded_node.find_html("table"):
        for tr in table.find_html("tr"):
            raw_tag = ""
            for th in tr.find_html("th"):
                raw_tag = clean_node(wxr, None, th)
            for td in tr.find_html("td"):
                roman = clean_node(wxr, None, td)
                if roman != "":
                    sound = Sound(roman=roman)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                        translate_raw_tags(sound)
                    sounds.append(sound)

    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    )
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        sounds.append(sound)
    word_entry.sounds.extend(sounds)

689 

690 

def extract_approximate_th_pron(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract approximate Thai readings from positional arguments 1-6,
    stopping at the first missing argument."""
    # https://th.wiktionary.org/wiki/แม่แบบ:คำอ่านไทย
    for index in range(1, 7):
        if index not in t_node.template_parameters:
            break
        reading = clean_node(wxr, None, t_node.template_parameters[index])
        if reading != "":
            base_data.sounds.append(
                Sound(other=reading, raw_tags=["เทียบเสียงภาษาไทยโดยประมาณ"])
            )