Coverage for src/wiktextract/extractor/th/sound.py: 51%

315 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..share import set_sound_file_url_fields 

15from .models import Hyphenation, Sound, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Process pronunciation templates found in a "sound" section.

    Handles templates that appear as direct children of the section level
    node as well as templates nested one level deep inside list items.
    """
    # Templates placed directly under the section heading.
    for template in level_node.find_child(NodeKind.TEMPLATE):
        extract_sound_template(wxr, base_data, template)
    # Templates wrapped in list items ("* {{...}}").
    for sound_list in level_node.find_child(NodeKind.LIST):
        for item in sound_list.find_child(NodeKind.LIST_ITEM):
            for template in item.find_child(NodeKind.TEMPLATE):
                extract_sound_template(wxr, base_data, template)

28 

29 

def extract_sound_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Dispatch a pronunciation template to its dedicated extractor.

    Templates with names not listed here are silently ignored.
    """
    handlers = {
        "IPA": extract_ipa_template,
        "X-SAMPA": extract_x_sampa_template,
        "enPR": extract_enpr_template,
        "audio": extract_audio_template,
        "th-pron": extract_th_pron_template,
        "lo-pron": extract_lo_pron_template,
        # Both Japanese template names share one extractor.
        "ja-pron": extract_ja_pron_template,
        "ja-IPA": extract_ja_pron_template,
        "zh-pron": extract_zh_pron_template,
    }
    handler = handlers.get(t_node.template_name)
    if handler is not None:
        handler(wxr, base_data, t_node)

49 

50 

def extract_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand an {{IPA}} template and collect the pronunciations it renders."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    extract_ipa_li_tag(wxr, base_data, expanded)
    # Also capture any category links emitted by the template.
    clean_node(wxr, base_data, expanded)

59 

60 

def extract_ipa_li_tag(
    wxr: WiktextractContext, base_data: WordEntry, li_tag: HTMLNode
):
    """Collect IPA and romanization spans from an expanded pronunciation node.

    A span with class "qualifier-content" sets a raw tag that is attached to
    every sound span that follows it.
    """
    pending_raw_tag = ""
    # Map the span CSS class to the Sound field its text belongs in;
    # checked in order, first match wins.
    field_for_class = {"IPA": "ipa", "Latn": "roman"}
    for span_tag in li_tag.find_html_recursively("span"):
        classes = span_tag.attrs.get("class", "").split()
        if "qualifier-content" in classes:
            pending_raw_tag = clean_node(wxr, None, span_tag)
            continue
        for css_class, field in field_for_class.items():
            if css_class in classes:
                value = clean_node(wxr, None, span_tag)
                sound = Sound()
                setattr(sound, field, value)
                if pending_raw_tag != "":
                    sound.raw_tags.append(pending_raw_tag)
                translate_raw_tags(sound)
                # Discard sounds whose text cleaned down to nothing.
                if value != "":
                    base_data.sounds.append(sound)
                break

83 

84 

def extract_ja_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand {{ja-pron}}/{{ja-IPA}} and extract sounds from each list item."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # The expanded template renders one <li> per pronunciation line.
    for li_tag in expanded.find_html_recursively("li"):
        extract_ipa_li_tag(wxr, base_data, li_tag)
    clean_node(wxr, base_data, expanded)

94 

95 

def extract_x_sampa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract the first positional argument of {{X-SAMPA}} as a tagged IPA."""
    ipa_text = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if ipa_text != "":
        base_data.sounds.append(Sound(ipa=ipa_text, tags=["X-SAMPA"]))

105 

106 

def extract_enpr_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract the first positional argument of {{enPR}} as an enPR sound."""
    enpr_text = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if enpr_text != "":
        base_data.sounds.append(Sound(enpr=enpr_text))

115 

116 

def extract_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an audio file reference from {{audio}}.

    The filename is the second positional argument; the "a" parameter may
    hold a comma-separated list of accent qualifiers.
    """
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if filename != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, filename, sound)
        accents = clean_node(
            wxr, None, t_node.template_parameters.get("a", "")
        )
        for accent in accents.split(","):
            accent = accent.strip()
            if accent != "":
                sound.raw_tags.append(accent)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
    # Capture category links emitted by the template.
    clean_node(wxr, base_data, t_node)

133 

134 

def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract pronunciations from the expanded {{th-pron}} HTML table.

    Each table row carries header cells (``th``) that decide how the data
    cells (``td``) of that row are interpreted: standard IPA, homophones,
    audio file links, or other pronunciation variants tagged with the
    header's labels. Row headers may span multiple rows via ``rowspan``.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:th-pron
    @dataclass
    class TableHeader:
        raw_tags: list[str]  # labels parsed from the header cell text
        rowspan: int  # remaining number of rows this header applies to

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Age out headers whose rowspan is exhausted; the rest carry
            # over into this row with one fewer row remaining.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                # Thai header text selects the Sound field for this row:
                # "(standard) IPA", "homophones", "audio file".
                if header_str.startswith("(มาตรฐาน) สัทอักษรสากล"):
                    field = "ipa"
                elif header_str.startswith("คำพ้องเสียง"):
                    field = "homophone"
                elif header_str == "ไฟล์เสียง":
                    field = "audio"
                elif header_str != "":
                    # Any other non-empty header is a qualifier; its
                    # semicolon-separated labels become raw tags.
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    # Audio rows contain file links.
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            base_data.sounds.append(sound)
                elif field == "homophone":
                    # Homophone rows list Thai words in lang="th" spans.
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            base_data.sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            # "[tag1, tag2]" qualifiers apply to the spans
                            # that follow in this cell.
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # If tag translation marks this as a
                                # romanization, store the text in "roman"
                                # (and for the rest of this row).
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                base_data.sounds.append(sound)

    clean_node(wxr, base_data, expanded_node)

224 

225 

def extract_lo_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Lao pronunciations from an expanded {{lo-pron}} template.

    The expanded template renders a list; within each list item a label
    ("IPA" link, "hyphenation:"/"rhymes:" text) switches the ``field``
    state, and subsequent spans/links are interpreted accordingly.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:lo-pron
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            field = "other"
            raw_tag = ""
            for node in list_item.children:
                if isinstance(node, HTMLNode) and node.tag == "span":
                    span_class = node.attrs.get("class", "")
                    if "qualifier-content" in span_class:
                        # Qualifier applies to the IPA span(s) that follow.
                        raw_tag = clean_node(wxr, None, node)
                    elif span_class == "IPA":
                        ipa = clean_node(wxr, None, node)
                        if ipa != "":
                            sound = Sound(ipa=ipa)
                            if raw_tag != "":
                                sound.raw_tags.append(raw_tag)
                                translate_raw_tags(sound)
                            base_data.sounds.append(sound)
                    else:
                        # Lao-script span holding hyphenation parts
                        # separated by "-".
                        span_lang = node.attrs.get("lang", "")
                        if span_lang == "lo" and field == "hyphenation":
                            span_str = clean_node(wxr, None, node)
                            if span_str != "":
                                base_data.hyphenations.append(
                                    Hyphenation(parts=span_str.split("-"))
                                )
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    link_str = clean_node(wxr, None, node)
                    # Thai for "IPA" — marks the start of IPA data.
                    if link_str == "สัทอักษรสากล":
                        field = "ipa"
                    elif link_str != "" and field == "rhymes":
                        base_data.sounds.append(Sound(rhymes=link_str))
                elif isinstance(node, str) and node.strip().endswith(":"):
                    node = node.strip()
                    # Thai labels: "hyphenation:" / "rhyme:".
                    if node == "การแบ่งพยางค์:":
                        field = "hyphenation"
                    elif node == "สัมผัส:":
                        field = "rhymes"

    clean_node(wxr, base_data, expanded_node)

272 

273 

def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand {{zh-pron}} and walk its nested pronunciation lists.

    Nested lists are handled recursively by ``extract_zh_pron_list_item``;
    lists already visited that way are recorded so they are not processed
    a second time by the outer recursive scan.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    visited_lists = set()
    collected = []
    for list_node in expanded.find_child_recursively(NodeKind.LIST):
        if list_node in visited_lists:
            continue  # already consumed as a child of an outer list item
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, list_item, [], visited_lists)
            )
    for sound in collected:
        translate_raw_tags(sound)
    base_data.sounds.extend(collected)
    clean_node(wxr, base_data, expanded)

292 

293 

def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively extract Sounds from one {{zh-pron}} list item.

    ``raw_tags`` are the dialect/variety labels inherited from enclosing
    list items; labels found in this item are appended to a copy and
    passed down to nested lists. Visited nested lists are added to
    ``seen_lists`` so the caller's recursive scan skips them.
    """
    current_tags = raw_tags[:]
    sounds = []
    # The first <small> in an item is a leading label (e.g. dialect name);
    # later <small> tags qualify the most recent sound instead.
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith(("File:", "ไฟล์:")):
                    # Audio file link.
                    filename = link_str.removeprefix("File:").removeprefix(
                        "ไฟล์:"
                    )
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # Other links act as labels, e.g. "(Pinyin)".
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # remove <sup> tag
                    if is_first_small_tag:
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # Trailing qualifier applies to the last sound.
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    # Thai for "homophones" — the table lists homophone words.
                    and current_tags[-1] == "คำพ้องเสียง"
                ):
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                # Nested list: recurse with this item's accumulated tags
                # and mark it visited for the caller's outer scan.
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        extract_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds

362 

363 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron qualifier string into individual raw tags.

    Text without parentheses is split on ",", ":", ";" and " and ", with
    an "incl. " prefix removed from each piece. Parenthesized groups are
    split recursively; the text outside the parentheses is processed first
    so its tags come before the grouped ones.
    """
    if "(" not in raw_tag_text:
        pieces = []
        for piece in re.split(r",|:|;| and ", raw_tag_text):
            piece = piece.strip().removeprefix("incl. ").strip()
            if piece:
                pieces.append(piece)
        return pieces

    tags = []
    group_spans = []
    # Handle each innermost "(...)" group recursively.
    for match in re.finditer(r"\([^()]+\)", raw_tag_text):
        group_spans.append((match.start(), match.end()))
        tags.extend(split_zh_pron_raw_tag(match.group()[1:-1]))
    # Stitch together everything outside the matched groups.
    outside = ""
    previous_end = 0
    for begin, finish in group_spans:
        outside += raw_tag_text[previous_end:begin]
        previous_end = finish
    outside += raw_tag_text[previous_end:]
    if outside != raw_tag_text:
        return split_zh_pron_raw_tag(outside) + tags
    # No group was matched (e.g. unbalanced parenthesis): keep as-is.
    tags.append(outside)
    return tags

391 

392 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract Sounds from one pronunciation <span> of {{zh-pron}}.

    The span may contain the pronunciation text itself, an embedded
    romanization span (``lang="*-Latn"``), trailing <small> qualifiers,
    and an optional "[Phonetic: ...]" tail. If the span's class contains
    "IPA" the text goes into ``Sound.ipa``, otherwise ``Sound.zh_pron``.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            # Romanization of the pronunciation, e.g. "(pinyin)".
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            # Everything after this marker is the phonetic realization;
            # stop scanning the remaining children.
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    # <small> qualifiers found in the span apply to the last sound only.
    if len(sounds) > 0:
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds

439 

440 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→", and "/" (the latter only when the whole
    string does not start with "/", so IPA-style "/.../" stays intact).
    Separators inside parentheses are kept. Each returned chunk is
    stripped of surrounding whitespace; empty chunks are dropped.

    Fix over the previous version: the final chunk is now stripped like
    all earlier chunks were, instead of keeping stray whitespace.
    """
    depth = 0  # current parenthesis nesting level
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/")))
            and depth == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        else:
            if c == "(":
                depth += 1
            elif c == ")":
                depth -= 1
            pron += c

    if pron.strip() != "":
        pron_list.append(pron.strip())
    return pron_list

466 

467 

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophone words from a {{zh-pron}} homophones table.

    Only spans with a non-empty ``lang`` attribute and a Chinese-script
    class ("Hant", "Hans" or "Hani") are kept; "Hant"/"Hans" additionally
    tag the sound as traditional/simplified Chinese.
    """
    script_tag = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    results = []
    for cell in table.find_html_recursively("td"):
        for span in cell.find_html("span"):
            css_class = span.attrs.get("class", "")
            lang = span.attrs.get("lang", "")
            text = clean_node(wxr, None, span)
            if (
                text in ["", "/"]
                or lang == ""
                or css_class not in ["Hant", "Hans", "Hani"]
            ):
                continue
            sound = Sound(homophone=text, raw_tags=raw_tags)
            if css_class in script_tag:
                sound.tags.append(script_tag[css_class])
            results.append(sound)
    return results