Coverage for src/wiktextract/extractor/th/sound.py: 57%

397 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..share import set_sound_file_url_fields 

15from .models import Hyphenation, Sound, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Collect pronunciation data from every template in a sound section."""
    # Templates placed directly under the section header.
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name != "zh-forms":
            extract_sound_template(wxr, base_data, t_node)
        else:
            # Imported lazily to avoid a circular module dependency.
            from .page import extract_zh_forms

            extract_zh_forms(wxr, base_data, t_node)
    # Templates nested inside list items.
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for t_node in list_item.find_child(NodeKind.TEMPLATE):
                extract_sound_template(wxr, base_data, t_node)

33 

34 

def extract_sound_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Dispatch a pronunciation template to its dedicated extractor.

    Specific language templates (``ja-pron``, ``th-pron``, …) are matched
    before the generic ``*-IPA``/``*-pron`` fallback, so order matters.
    """
    name = t_node.template_name
    lower_name = name.lower()
    if name in ("ja-pron", "ja-IPA"):
        extract_ja_pron_template(wxr, base_data, t_node)
    elif name == "th-pron":
        extract_th_pron_template(wxr, base_data, t_node)
    elif name == "lo-pron":
        extract_lo_pron_template(wxr, base_data, t_node)
    elif name == "zh-pron":
        extract_zh_pron_template(wxr, base_data, t_node)
    elif lower_name == "ipa" or lower_name.endswith(("-ipa", "-pron")):
        extract_ipa_template(wxr, base_data, t_node)
    elif name == "X-SAMPA":
        extract_x_sampa_template(wxr, base_data, t_node)
    elif name == "enPR":
        extract_enpr_template(wxr, base_data, t_node)
    elif name in ("audio", "Audio", "เสียง"):
        extract_audio_template(wxr, base_data, t_node)
    elif name in ("rhymes", "rhyme"):
        extract_rhymes_template(wxr, base_data, t_node)
    elif name in ("homophones", "homophone", "hmp"):
        extract_homophones_template(wxr, base_data, t_node)
    elif name in ("hyphenation", "hyph"):
        extract_hyphenation_template(wxr, base_data, t_node)

63 

64 

def extract_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand a generic IPA-style template and parse each pronunciation line."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    leftovers = []
    for child in expanded_node.children:
        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            leftovers.append(child)
            continue
        for list_item in child.find_child(NodeKind.LIST_ITEM):
            extract_ipa_list_item(wxr, base_data, list_item)
    if leftovers:
        # Wrap stray top-level nodes so they can be parsed like a list item.
        wrapper = WikiNode(NodeKind.ROOT, 0)
        wrapper.children = leftovers
        extract_ipa_list_item(wxr, base_data, wrapper)
    clean_node(wxr, base_data, expanded_node)

83 

84 

def extract_ipa_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    """Parse one pronunciation line: qualifier tags plus IPA/romanization."""
    raw_tags = []
    # Italic text holds location data (e.g. Template:vi-ipa).
    for italic_node in list_item.find_child(NodeKind.ITALIC):
        italic_str = clean_node(wxr, None, italic_node)
        if italic_str != "":
            raw_tags.append(italic_str)
    for span_tag in list_item.find_html_recursively("span"):
        classes = span_tag.attrs.get("class", "").split()
        if "qualifier-content" in classes or "ib-content" in classes:
            # Comma-separated qualifier labels apply to following spans.
            for piece in clean_node(wxr, None, span_tag).split(","):
                piece = piece.strip()
                if piece != "":
                    raw_tags.append(piece)
        elif "IPA" in classes:
            ipa_sound = Sound(
                ipa=clean_node(wxr, None, span_tag), raw_tags=raw_tags
            )
            if ipa_sound.ipa != "":
                translate_raw_tags(ipa_sound)
                base_data.sounds.append(ipa_sound)
        elif "Latn" in classes:
            roman_sound = Sound(
                roman=clean_node(wxr, None, span_tag), raw_tags=raw_tags
            )
            if roman_sound.roman != "":
                translate_raw_tags(roman_sound)
                base_data.sounds.append(roman_sound)

115 

116 

def extract_ja_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Japanese pronunciation data from ``ja-pron``/``ja-IPA``.

    Each ``<li>`` of the expanded template becomes one ``Sound`` carrying
    accent labels, IPA, romanization and the Japanese form; an ``a`` or
    ``audio`` template parameter adds a sound-file entry.
    """
    # Thai accent-name link text -> pitch-accent tag.
    JA_PRON_ACCENTS = {
        "นากาดากะ": "Nakadaka",
        "เฮบัง": "Heiban",
        "อาตามาดากะ": "Atamadaka",
        "โอดากะ": "Odaka",
    }
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded_node.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "").split()
            if "usage-label-accent" in span_class:
                # Accent label rendered as "(label)" — strip the parentheses.
                raw_tag = clean_node(wxr, None, span_tag).strip("() ")
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                # Kana/kanji form of the pronunciation.
                sound.other = clean_node(wxr, None, span_tag)
        # Links to accent-type pages translate to pitch-accent tags.
        for link_node in li_tag.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text in JA_PRON_ACCENTS:
                sound.tags.append(JA_PRON_ACCENTS[link_text])
        # Keep only entries that actually have pronunciation content.
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            base_data.sounds.append(sound)
    # NOTE(review): assumes the "a"/"audio" parameter value is a plain
    # string — verify template argument values are never node lists here.
    audio_file = t_node.template_parameters.get(
        "a", t_node.template_parameters.get("audio", "")
    ).strip()
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        base_data.sounds.append(sound)
    # Collect categories etc. emitted by the expanded template.
    clean_node(wxr, base_data, expanded_node)

158 

159 

def extract_x_sampa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an X-SAMPA notation pronunciation from the template."""
    x_sampa = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if x_sampa != "":
        base_data.sounds.append(Sound(ipa=x_sampa, tags=["X-SAMPA"]))

169 

170 

def extract_enpr_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an enPR (AHD respelling) pronunciation from the template."""
    enpr_str = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if enpr_str != "":
        base_data.sounds.append(Sound(enpr=enpr_str))

179 

180 

def extract_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an audio template: arg 2 is the file, "a" holds labels."""
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if filename == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    # The "a" parameter carries comma-separated accent/dialect labels.
    label_str = clean_node(wxr, None, t_node.template_parameters.get("a", ""))
    for label in label_str.split(","):
        label = label.strip()
        if label != "":
            sound.raw_tags.append(label)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
    clean_node(wxr, base_data, t_node)

197 

198 

def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Thai pronunciation data from the ``th-pron`` template table.

    The expanded template renders an HTML table whose row headers carry
    labels (raw tags, possibly spanning several rows) and whose data cells
    hold IPA, romanization, homophones or audio files.  Results are
    appended to ``base_data.sounds``.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:th-pron
    @dataclass
    class TableHeader:
        raw_tags: list[str]  # labels parsed from the header cell text
        rowspan: int  # remaining number of rows this header applies to

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"  # Sound attribute the data cells are stored into
            # Keep only headers from earlier rows that still span this row.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                # "(มาตรฐาน) สัทอักษรสากล" = "(standard) IPA"
                if header_str.startswith("(มาตรฐาน) สัทอักษรสากล"):
                    field = "ipa"
                # "คำพ้องเสียง" = "homophones"
                elif header_str.startswith("คำพ้องเสียง"):
                    field = "homophone"
                # "ไฟล์เสียง" = "audio file"
                elif header_str == "ไฟล์เสียง":
                    field = "audio"
                elif header_str != "":
                    # Any other header is a label cell; remember it for as
                    # many rows as its "rowspan" attribute covers.
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    # Audio cells contain file links.
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            base_data.sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            base_data.sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            # "<small>[tag, tag]</small>" qualifier lists.
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                # Inherit the labels of the row headers.
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # NOTE(review): once a tag translates to
                                # "romanization", `field` stays "roman" for the
                                # remaining spans of this row — presumably
                                # intentional; verify against template output.
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                base_data.sounds.append(sound)

    clean_node(wxr, base_data, expanded_node)

288 

289 

def extract_lo_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Lao pronunciation data from the ``lo-pron`` template.

    The expanded template is a list; link/label prefixes inside each list
    item determine whether it carries IPA, hyphenation or rhyme data.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:lo-pron
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            field = "other"  # what kind of data this list item carries
            raw_tag = ""  # qualifier text applied to following IPA spans
            for node in list_item.children:
                if isinstance(node, HTMLNode) and node.tag == "span":
                    span_class = node.attrs.get("class", "")
                    if "qualifier-content" in span_class:
                        raw_tag = clean_node(wxr, None, node)
                    elif span_class == "IPA":
                        ipa = clean_node(wxr, None, node)
                        if ipa != "":
                            sound = Sound(ipa=ipa)
                            if raw_tag != "":
                                sound.raw_tags.append(raw_tag)
                                translate_raw_tags(sound)
                            base_data.sounds.append(sound)
                    else:
                        # Lao-language span: hyphenation parts joined by "-".
                        span_lang = node.attrs.get("lang", "")
                        if span_lang == "lo" and field == "hyphenation":
                            span_str = clean_node(wxr, None, node)
                            if span_str != "":
                                base_data.hyphenations.append(
                                    Hyphenation(parts=span_str.split("-"))
                                )
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    link_str = clean_node(wxr, None, node)
                    # "สัทอักษรสากล" = "IPA"
                    if link_str == "สัทอักษรสากล":
                        field = "ipa"
                    elif link_str != "" and field == "rhymes":
                        base_data.sounds.append(Sound(rhymes=link_str))
                elif isinstance(node, str) and node.strip().endswith(":"):
                    node = node.strip()
                    # "การแบ่งพยางค์:" = "syllabification:"
                    if node == "การแบ่งพยางค์:":
                        field = "hyphenation"
                    # "สัมผัส:" = "rhymes:"
                    elif node == "สัมผัส:":
                        field = "rhymes"

    clean_node(wxr, base_data, expanded_node)

336 

337 

def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand ``zh-pron`` and walk its nested lists for pronunciation data."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    seen_lists = set()
    collected = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node in seen_lists:
            # Already handled while recursing into a parent list item.
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, list_item, [], seen_lists)
            )
    for sound in collected:
        translate_raw_tags(sound)
    base_data.sounds.extend(collected)
    clean_node(wxr, base_data, expanded_node)

356 

357 

def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively parse one list item of an expanded ``zh-pron`` template.

    ``raw_tags`` holds labels inherited from parent list items; nested
    lists are added to ``seen_lists`` so the caller does not process them
    a second time.  Returns the ``Sound`` objects found in this subtree.
    """
    # Copy so tags collected here do not leak back into sibling items.
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith(("File:", "ไฟล์:")):
                    # Audio file link ("ไฟล์" = "file").
                    filename = link_str.removeprefix("File:").removeprefix(
                        "ไฟล์:"
                    )
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # Other link text is treated as a label.
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # remove <sup> tag
                    if is_first_small_tag:
                        # The leading <small> text labels the whole item.
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # Later <small> text qualifies the preceding sound.
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "คำพ้องเสียง"
                ):
                    # Table following the "คำพ้องเสียง" ("homophones") label.
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                # Descend into the nested list and mark it as handled.
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        extract_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds

426 

427 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron label string into individual raw tags.

    Labels are separated by commas, colons, semicolons or " and ";
    parenthesized groups are split recursively, with tags from outside
    the parentheses placed before the tags found inside them.
    """
    if "(" not in raw_tag_text:
        # Flat case: split on the delimiters and drop empty pieces.
        tags = []
        for piece in re.split(r",|:|;| and ", raw_tag_text):
            piece = piece.strip().removeprefix("incl. ").strip()
            if piece:
                tags.append(piece)
        return tags
    # Recurse into every "(...)" group, remembering its position.
    tags = []
    spans = []
    for m in re.finditer(r"\([^()]+\)", raw_tag_text):
        spans.append((m.start(), m.end()))
        tags.extend(
            split_zh_pron_raw_tag(raw_tag_text[m.start() + 1 : m.end() - 1])
        )
    # Stitch together the text that was outside all the groups.
    remainder_parts = []
    prev_end = 0
    for start, end in spans:
        remainder_parts.append(raw_tag_text[prev_end:start])
        prev_end = end
    remainder_parts.append(raw_tag_text[prev_end:])
    remainder = "".join(remainder_parts)
    if remainder != raw_tag_text:
        return split_zh_pron_raw_tag(remainder) + tags
    # No group matched (e.g. unbalanced parenthesis): keep text as-is.
    tags.append(remainder)
    return tags

455 

456 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Parse one pronunciation ``<span>`` of a ``zh-pron`` list item.

    Returns ``Sound`` objects: IPA when the span has class "IPA",
    otherwise ``zh_pron`` values, optionally with a romanization and a
    trailing "[Phonetic: …]" variant.
    """
    sounds = []
    small_tags = []  # qualifier tags from an inner <small>; added to last sound
    pron_nodes = []  # child nodes forming the pronunciation text itself
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            # Inner span with a "*-Latn" lang code holds the romanization.
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            # Everything after "[Phonetic:" is the phonetic variant.
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # The <small> qualifier applies to the last pronunciation only.
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds

503 

504 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on delimiters outside parentheses.

    Delimiters are ",", ";", "→" and "/" — except that "/" is not a
    delimiter when the whole string starts with "/" (IPA written between
    slashes).  Each returned segment is stripped of surrounding
    whitespace; delimiters inside "(...)" are kept literally.
    """
    depth = 0  # current parenthesis nesting level
    pron_list = []
    pron = ""
    for c in zh_pron:
        is_delim = c in (",", ";", "→") or (
            c == "/" and not zh_pron.startswith("/")
        )
        if is_delim and depth == 0 and len(pron.strip()) > 0:
            pron_list.append(pron.strip())
            pron = ""
        else:
            if c == "(":
                depth += 1
            elif c == ")":
                depth -= 1
            pron += c

    if pron.strip() != "":
        # Fix: strip the final segment like the in-loop segments, so the
        # last element no longer carries stray surrounding whitespace.
        pron_list.append(pron.strip())
    return pron_list

530 

531 

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophone words from a zh-pron homophones table."""
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            span_str = clean_node(wxr, None, span_tag)
            is_word = (
                span_str not in ("", "/")
                and span_lang != ""
                and span_class in ("Hant", "Hans", "Hani")
            )
            if not is_word:
                continue
            sound = Sound(homophone=span_str, raw_tags=raw_tags)
            # The script class tells traditional from simplified forms.
            if span_class == "Hant":
                sound.tags.append("Traditional-Chinese")
            elif span_class == "Hans":
                sound.tags.append("Simplified-Chinese")
            sounds.append(sound)
    return sounds

553 

554 

def extract_rhymes_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract rhyme links from an expanded rhymes template."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link_node in expanded_node.find_child(NodeKind.LINK):
        rhyme_str = clean_node(wxr, base_data, link_node)
        if rhyme_str != "":
            base_data.sounds.append(Sound(rhymes=rhyme_str))

565 

566 

def extract_homophones_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract homophones with their romanization and qualifier tags."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    homophones = []
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for top_span in expanded_node.find_html(
        "span", attr_name="class", attr_value="homophones"
    ):
        for child_span in top_span.find_html("span"):
            child_lang = child_span.attrs.get("lang", "")
            child_classes = child_span.attrs.get("class", "").split()
            has_words = len(homophones) > 0
            if "tr" in child_classes and has_words:
                # Romanization belongs to the homophone right before it.
                homophones[-1].roman = clean_node(wxr, None, child_span)
            elif child_lang == lang_code:
                word = clean_node(wxr, None, child_span)
                if word != "":
                    homophones.append(Sound(homophone=word))
            elif "qualifier-content" in child_classes and has_words:
                qualifier = clean_node(wxr, None, child_span)
                if qualifier != "":
                    homophones[-1].raw_tags.append(qualifier)
                    translate_raw_tags(homophones[-1])

    base_data.sounds.extend(homophones)
    # Collect category links emitted by the template.
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)

596 

597 

def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract syllable breaks; parts are separated by "‧"."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        h_str = clean_node(wxr, None, span_tag)
        parts = [part.strip() for part in h_str.split("‧") if part.strip()]
        if len(parts) > 0:
            base_data.hyphenations.append(Hyphenation(parts=parts))