Coverage for src/wiktextract/extractor/th/sound.py: 48%

295 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..share import set_sound_file_url_fields 

15from .models import Hyphenation, Sound, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Process every pronunciation template in a sound section.

    Templates appear either as direct children of the section or nested
    inside list items; both locations are handled identically.
    """
    pron_templates = list(level_node.find_child(NodeKind.TEMPLATE))
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            pron_templates.extend(list_item.find_child(NodeKind.TEMPLATE))
    for pron_template in pron_templates:
        extract_sound_template(wxr, base_data, pron_template)

28 

29 

def extract_sound_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Dispatch a pronunciation template to its dedicated extractor.

    Unrecognized template names are silently ignored.
    """
    handlers = {
        "IPA": extract_ipa_template,
        "X-SAMPA": extract_x_sampa_template,
        "enPR": extract_enpr_template,
        "audio": extract_audio_template,
        "th-pron": extract_th_pron_template,
        "lo-pron": extract_lo_pron_template,
        "ja-pron": extract_ja_pron_template,
        "ja-IPA": extract_ja_pron_template,
        "zh-pron": extract_zh_pron_template,
    }
    handler = handlers.get(t_node.template_name)
    if handler is not None:
        handler(wxr, base_data, t_node)

49 

50 

def extract_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand an "IPA" template and harvest its IPA/romanization spans."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    extract_ipa_li_tag(wxr, base_data, expanded)
    # Second pass collects any categories the expansion produced.
    clean_node(wxr, base_data, expanded)

59 

60 

def extract_ipa_li_tag(
    wxr: WiktextractContext, base_data: WordEntry, li_tag: HTMLNode
):
    """Collect Sound entries from the ``<span>`` tags inside one node.

    A "qualifier-content" span sets a raw tag that applies to all
    pronunciation spans that follow it; "IPA" spans yield the ``ipa``
    field and "Latn" spans the ``roman`` field.
    """
    # Maps a CSS class marker to the Sound field it fills; checked in
    # order, so a span with both classes is treated as IPA (same as the
    # original if/elif chain).
    class_to_field = (("IPA", "ipa"), ("Latn", "roman"))
    raw_tag = ""
    for span_tag in li_tag.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "").split()
        if "qualifier-content" in span_class:
            # Remember the qualifier; it tags every later sound span.
            raw_tag = clean_node(wxr, None, span_tag)
            continue
        for class_marker, field in class_to_field:
            if class_marker in span_class:
                value = clean_node(wxr, None, span_tag)
                if value != "":
                    sound = Sound()
                    setattr(sound, field, value)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                    translate_raw_tags(sound)
                    base_data.sounds.append(sound)
                break

83 

84 

def extract_ja_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand "ja-pron"/"ja-IPA" and parse each ``<li>`` like an IPA line."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded.find_html_recursively("li"):
        extract_ipa_li_tag(wxr, base_data, li_tag)
    # Pick up categories emitted by the template expansion.
    clean_node(wxr, base_data, expanded)

94 

95 

def extract_x_sampa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Store the first "X-SAMPA" template argument as a tagged IPA sound."""
    ipa_text = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if ipa_text != "":
        base_data.sounds.append(Sound(ipa=ipa_text, tags=["X-SAMPA"]))

105 

106 

def extract_enpr_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Store the first "enPR" template argument as an enPR pronunciation."""
    enpr_text = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if enpr_text != "":
        base_data.sounds.append(Sound(enpr=enpr_text))

115 

116 

def extract_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Create a Sound from an "audio" template.

    The audio file name is the template's second positional argument and
    the optional "a" argument holds comma-separated accent labels.
    """
    sound = Sound()
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if filename == "":
        return
    set_sound_file_url_fields(wxr, filename, sound)
    accent_str = clean_node(
        wxr, None, t_node.template_parameters.get("a", "")
    )
    for accent in accent_str.split(","):
        accent = accent.strip()
        if accent != "":
            sound.raw_tags.append(accent)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
    # Collect categories from the template itself.
    clean_node(wxr, base_data, t_node)

133 

134 

@dataclass
class TableHeader:
    # Header cell text; later attached as a raw tag to sounds in the
    # rows this header spans.
    text: str
    # Remaining number of table rows this header cell still spans;
    # decremented once per row as the table is walked.
    rowspan: int

139 

140 

def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Parse the expanded "th-pron" table into Sound entries.

    Each table row's header decides which Sound field the row's data
    cells fill ("ipa", "homophone", "audio", or "other"); other header
    cells become raw tags carried across the rows they span.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:th-pron
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Age out headers whose rowspan is exhausted; surviving
            # headers still apply to this row.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                # "(มาตรฐาน) สัทอักษรสากล" = "(standard) IPA"
                if header_str.startswith("(มาตรฐาน) สัทอักษรสากล"):
                    field = "ipa"
                # "คำพ้องเสียง" = "homophones"
                elif header_str.startswith("คำพ้องเสียง"):
                    field = "homophone"
                # "ไฟล์เสียง" = "audio file"
                elif header_str == "ไฟล์เสียง":
                    field = "audio"
                elif header_str != "":
                    # Any other non-empty header becomes a raw tag for
                    # the rows it spans.
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    row_headers.append(TableHeader(header_str, rowspan))

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    # Audio cells contain file links.
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            base_data.sounds.append(sound)
                elif field == "homophone":
                    # Homophones are Thai-language spans.
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            base_data.sounds.append(Sound(homophone=word))
                else:
                    # "ipa" or "other": the cell text fills that field
                    # directly, tagged with the active row headers.
                    data = clean_node(wxr, None, td_tag)
                    if data != "":
                        sound = Sound()
                        setattr(sound, field, data)
                        for header in row_headers:
                            sound.raw_tags.append(header.text)
                        translate_raw_tags(sound)
                        base_data.sounds.append(sound)

    clean_node(wxr, base_data, expanded_node)

199 

200 

def extract_lo_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Parse the expanded "lo-pron" (Lao pronunciation) template.

    Walks each list item's children, tracking the current target field
    ("ipa", "hyphenation", "rhymes") as label links/strings are seen,
    and converts matching spans/links into sounds or hyphenations.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:lo-pron
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            field = "other"
            raw_tag = ""
            for node in list_item.children:
                if isinstance(node, HTMLNode) and node.tag == "span":
                    span_class = node.attrs.get("class", "")
                    if "qualifier-content" in span_class:
                        # Qualifier applies to the following IPA span.
                        raw_tag = clean_node(wxr, None, node)
                    elif span_class == "IPA":
                        ipa = clean_node(wxr, None, node)
                        if ipa != "":
                            sound = Sound(ipa=ipa)
                            # NOTE(review): translate_raw_tags runs only
                            # when a qualifier was seen — possibly
                            # intentional since there is nothing to
                            # translate otherwise; confirm.
                            if raw_tag != "":
                                sound.raw_tags.append(raw_tag)
                                translate_raw_tags(sound)
                            base_data.sounds.append(sound)
                    else:
                        span_lang = node.attrs.get("lang", "")
                        # Lao-script span after the hyphenation label
                        # holds syllables joined by "-".
                        if span_lang == "lo" and field == "hyphenation":
                            span_str = clean_node(wxr, None, node)
                            if span_str != "":
                                base_data.hyphenations.append(
                                    Hyphenation(parts=span_str.split("-"))
                                )
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    link_str = clean_node(wxr, None, node)
                    # "สัทอักษรสากล" = "IPA" label link
                    if link_str == "สัทอักษรสากล":
                        field = "ipa"
                    elif link_str != "" and field == "rhymes":
                        base_data.sounds.append(Sound(rhymes=link_str))
                elif isinstance(node, str) and node.strip().endswith(":"):
                    # Plain-text labels switch the current field.
                    node = node.strip()
                    # "การแบ่งพยางค์:" = "syllabification:"
                    if node == "การแบ่งพยางค์:":
                        field = "hyphenation"
                    # "สัมผัส:" = "rhymes:"
                    elif node == "สัมผัส:":
                        field = "rhymes"

    clean_node(wxr, base_data, expanded_node)

247 

248 

def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand a "zh-pron" template and collect sounds from its lists.

    Nested lists handled during recursion are recorded in a "seen" set
    so the recursive find doesn't process them a second time.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    visited_lists = set()
    collected = []
    for list_node in expanded.find_child_recursively(NodeKind.LIST):
        if list_node in visited_lists:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, list_item, [], visited_lists)
            )
    for sound in collected:
        translate_raw_tags(sound)
    base_data.sounds.extend(collected)
    clean_node(wxr, base_data, expanded)

267 

268 

def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively extract Sounds from one zh-pron list item.

    ``raw_tags`` accumulates dialect/variety labels down the list tree;
    each level works on its own copy so siblings don't see each other's
    tags. Visited sub-lists are added to ``seen_lists`` so the caller's
    recursive scan skips them.
    """
    current_tags = raw_tags[:]
    sounds = []
    # Only the first <small> run labels the whole item; later <small>
    # tags qualify the most recent sound instead.
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith(("File:", "ไฟล์:")):
                    # Audio file link ("ไฟล์:" is the Thai "File:"
                    # namespace prefix).
                    filename = link_str.removeprefix("File:").removeprefix(
                        "ไฟล์:"
                    )
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # Non-file links are treated as tag text.
                    current_tags.append(node_str.strip("()"))
            # HTMLNode subclasses WikiNode, so this branch must come
            # before the generic kind checks.
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # remove <sup> tag
                    if is_first_small_tag:
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # Subsequent <small> text qualifies the last
                        # extracted sound only.
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                # "คำพ้องเสียง" = "homophones": a table directly under
                # that label holds homophone words.
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "คำพ้องเสียง"
                ):
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                # Mark the nested list as handled, then recurse with the
                # tags accumulated so far.
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        extract_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds

337 

338 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron label string into individual raw tags.

    Text without parentheses is split on ",", ":", ";" and " and ".
    Parenthesized groups are split recursively, with tags from the text
    outside the parentheses placed before the parenthesized ones.
    """
    if "(" not in raw_tag_text:
        tags = []
        for piece in re.split(r",|:|;| and ", raw_tag_text):
            piece = piece.strip().removeprefix("incl. ").strip()
            if piece != "":
                tags.append(piece)
        return tags
    inner_tags = []
    matched_spans = []
    for match in re.finditer(r"\([^()]+\)", raw_tag_text):
        matched_spans.append((match.start(), match.end()))
        inner_tags.extend(
            split_zh_pron_raw_tag(
                raw_tag_text[match.start() + 1 : match.end() - 1]
            )
        )
    # Stitch together everything outside the parenthesized groups.
    remainder = ""
    previous_end = 0
    for start, end in matched_spans:
        remainder += raw_tag_text[previous_end:start]
        previous_end = end
    remainder += raw_tag_text[previous_end:]
    if remainder != raw_tag_text:
        # Outer text first, then the parenthesized tags.
        return split_zh_pron_raw_tag(remainder) + inner_tags
    # Unbalanced "(" with no complete group: keep the text as one tag.
    return inner_tags + [remainder]

366 

367 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract Sounds from one zh-pron pronunciation ``<span>``.

    Separates embedded <small> labels, a romanization sub-span, and an
    optional trailing "[Phonetic: …]" reading from the pronunciation
    text itself, then splits the text into individual pronunciations.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            # Sub-span with a "*-Latn" lang code is the romanization.
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            # Everything after this marker is the phonetic reading;
            # stop scanning the children.
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # An "IPA"-classed span holds IPA; otherwise it is a
            # transliteration stored in zh_pron.
            # NOTE(review): the same raw_tags list object is passed to
            # every Sound — assumes the model copies it on construction;
            # confirm against the Sound model definition.
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # <small> labels inside the span qualify the last pronunciation.
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds

414 

415 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string into individual pronunciations.

    Splits on ",", ";", "→" (and "/" when the string does not start
    with "/", since a leading "/" delimits a single IPA transcription)
    occurring outside parentheses. Each returned piece is stripped of
    surrounding whitespace.
    """
    # "/" only separates alternatives when it is not the IPA delimiter.
    slash_is_separator = not zh_pron.startswith("/")
    depth = 0  # current parenthesis nesting level
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            (c in [",", ";", "→"] or (c == "/" and slash_is_separator))
            and depth == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        elif c == "(":
            depth += 1
            pron += c
        elif c == ")":
            depth -= 1
            pron += c
        else:
            pron += c

    if pron.strip() != "":
        # Fix: strip the trailing piece too, consistent with the
        # in-loop appends (the original appended it unstripped).
        pron_list.append(pron.strip())
    return pron_list

441 

442 

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract homophone words from a zh-pron homophones table.

    Only spans with a lang attribute and a Han script class ("Hant",
    "Hans", "Hani") count; "Hant"/"Hans" additionally mark the word as
    traditional/simplified Chinese.
    """
    script_tags = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            word = clean_node(wxr, None, span_tag)
            # Skip separators, empty cells and non-Han spans.
            if (
                word in ["", "/"]
                or span_lang == ""
                or span_class not in ["Hant", "Hans", "Hani"]
            ):
                continue
            sound = Sound(homophone=word, raw_tags=raw_tags)
            if span_class in script_tags:
                sound.tags.append(script_tags[span_class])
            sounds.append(sound)
    return sounds