Coverage for src / wiktextract / extractor / zh / pronunciation.py: 66%

454 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from ..share import set_sound_file_url_fields 

14from .models import Hyphenation, Sound, WordEntry 

15from .tags import translate_raw_tags 

16 

17 

def extract_pronunciation_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: WikiNode
) -> None:
    """Fill ``base_data`` with sounds/categories from a pronunciation section.

    Top-level templates are dispatched first; then every list item under
    the section is processed for pronunciation data.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name == "zh-forms":
            # imported lazily to avoid a circular import with .page
            from .page import process_zh_forms

            process_zh_forms(wxr, base_data, template)
        else:
            sounds, cats = process_pron_template(wxr, base_data, template)
            base_data.sounds.extend(sounds)
            base_data.categories.extend(cats)
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        sounds, cats = process_pron_item_list_item(wxr, base_data, item)
        base_data.sounds.extend(sounds)
        base_data.categories.extend(cats)

36 

37 

def process_pron_item_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item_node: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Process one pronunciation list item; return (sounds, categories).

    A single ``shared`` tag list is threaded through all templates of the
    item so that {{a}}/{{accent}} labels apply to the templates after them.
    """
    shared: list[str] = []
    all_sounds: list[Sound] = []
    all_cats: list[str] = []
    for template in list_item_node.find_child(NodeKind.TEMPLATE):
        if template.template_name.lower() in ("hyph", "hyphenation"):
            # hyphenation templates write into base_data directly
            extract_hyphenation_template(wxr, base_data, template)
            continue
        sounds, cats = process_pron_template(wxr, base_data, template, shared)
        all_sounds.extend(sounds)
        all_cats.extend(cats)
    return all_sounds, all_cats

54 

55 

def process_pron_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
    raw_tags: list[str] | None = None,
) -> tuple[list[Sound], list[str]]:
    """Dispatch a pronunciation template to the matching extractor.

    Args:
        wxr: extraction context.
        base_data: entry being built; some extractors write into it.
        template_node: the template to process.
        raw_tags: shared tag list for the current list item; the
            {{a}}/{{accent}} branch appends to it so that templates that
            follow in the same item pick the tags up.

    Returns:
        (sounds, categories) extracted from this template.
    """
    # BUG FIX: the old signature used a mutable default (raw_tags=[]); the
    # {{a}}/{{accent}} branch mutates raw_tags, so tags appended in one call
    # leaked into every later call that relied on the default.  Use a None
    # sentinel so each call gets a fresh list.
    if raw_tags is None:
        raw_tags = []
    template_name = template_node.template_name.lower()
    sounds = []
    categories = []
    new_sounds = []
    new_cats = []
    if template_name == "zh-pron":
        new_sounds, new_cats = process_zh_pron_template(wxr, template_node)
    elif template_name in ["rhymes", "rhyme"]:
        new_sounds, new_cats = extract_rhymes_template(wxr, template_node)
    elif template_name in ["homophones", "homophone", "hmp"]:
        new_sounds, new_cats = extract_homophones_template(wxr, template_node)
    elif template_name in ["a", "accent"]:
        # https://zh.wiktionary.org/wiki/Template:Accent
        # accent labels only add tags for the following templates
        raw_tags.append(clean_node(wxr, None, template_node).strip("()"))
    elif template_name in ["audio", "音"]:
        sounds.extend(process_audio_template(wxr, template_node, raw_tags))
    elif template_name == "ipa" or template_name.endswith("-ipa"):
        new_sounds, new_cats = extract_ipa_template(
            wxr, template_node, raw_tags
        )
    elif template_name == "enpr":
        sounds.extend(process_enpr_template(wxr, template_node, raw_tags))
    elif template_name == "ja-pron":
        new_sounds, new_cats = extract_ja_pron_template(wxr, template_node)
    elif template_name == "th-pron":
        new_sounds, new_cats = extract_th_pron_template(wxr, template_node)
    elif template_name.endswith("-pr"):
        new_sounds, new_cats = extract_pl_pr_template(
            wxr, base_data, template_node
        )
    sounds.extend(new_sounds)
    categories.extend(new_cats)
    return sounds, categories

95 

96 

def process_zh_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand {{zh-pron}} and collect its sounds and category links.

    https://zh.wiktionary.org/wiki/Template:Zh-pron
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    visited: set[WikiNode] = set()
    sounds: list[Sound] = []
    cats: dict[str, list[str]] = {}
    for list_node in expanded.find_child_recursively(NodeKind.LIST):
        # nested lists already handled recursively are skipped here
        if list_node in visited:
            continue
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(
                process_zh_pron_list_item(wxr, item, [], visited)
            )
    clean_node(wxr, cats, expanded)
    for sound in sounds:
        translate_raw_tags(sound)
    return sounds, cats.get("categories", [])

117 

118 

def process_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively extract sounds from one expanded {{zh-pron}} list item.

    ``raw_tags`` are labels inherited from parent list items; a copy is
    extended with labels found here and passed down to nested lists.
    Nested lists are recorded in ``seen_lists`` so the caller's top-level
    scan does not process them a second time.
    """
    current_tags = raw_tags[:]  # copy so sibling items are not affected
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith("File:"):
                    # audio file link
                    filename = link_str.removeprefix("File:")
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # non-file link text acts as a label for this item
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # remove "幫助"(help) <sup> tag
                    if is_first_small_tag:
                        # leading <small> holds labels for the whole item
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # later <small> elements qualify the preceding sound
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "同音詞"
                ):
                    # table right after a "同音詞" (homophones) label holds
                    # homophone words
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )

            elif node.kind == NodeKind.LIST:
                # mark as handled, then recurse with the accumulated tags
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        process_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds

186 

187 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a {{zh-pron}} label string into individual raw tags.

    Labels are separated by ASCII/fullwidth commas, colons, semicolons,
    "、" or a non-final "和"; a leading "包括" ("including") is dropped.
    Parenthesized groups (ASCII or fullwidth) are split recursively with
    their parentheses removed; the text outside the parentheses is split
    the same way and its tags are placed before the parenthesized ones.
    """
    raw_tags = []
    if "(" not in raw_tag_text and "(" not in raw_tag_text:
        for raw_tag in re.split(r",|,|:|、|;|;|和(?!$)", raw_tag_text):
            raw_tag = raw_tag.strip().removeprefix("包括").strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
    else:
        processed_offsets = []
        # BUG FIX: the old code matched bare-text runs as well and sliced
        # [start+1:end-1] on them too, chopping off their first and last
        # characters (e.g. "吳語 (上海)" lost "吳語").  Only parenthesized
        # chunks are handled here; both ASCII and fullwidth parentheses
        # are recognized.  Remaining text is re-split below.
        for match in re.finditer(r"[((][^()()]*[))]", raw_tag_text):
            processed_offsets.append((match.start(), match.end()))
            raw_tags.extend(split_zh_pron_raw_tag(match.group()[1:-1]))
        not_processed = ""
        last_end = 0
        for start, end in processed_offsets:
            not_processed += raw_tag_text[last_end:start]
            last_end = end
        not_processed += raw_tag_text[last_end:]
        if not_processed != raw_tag_text:
            raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags
        else:
            # unbalanced parentheses: keep the text as a single tag
            raw_tags.append(not_processed)
    return raw_tags

215 

216 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract pronunciations from one <span> of expanded {{zh-pron}}.

    Children are sorted into: <small> label text, a romanization sub-span
    (lang="*-Latn"), an "actual pronunciation" marker string, and the
    pronunciation text itself.  The span's "IPA" class decides whether the
    values go into ``Sound.ipa`` or ``Sound.zh_pron``.
    """
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[實際讀音:":
            # "[實際讀音:" ("actual pronunciation:") — everything after this
            # marker is one phonetic reading; stop scanning children
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # <small> labels qualify the last pronunciation of the span
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["實際讀音"],
            )
        )
    return sounds

263 

264 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string into individual pronunciations.

    Splits on ",", ";", "→" — and on "/" unless the whole string starts
    with "/" (i.e. is a bare IPA span) — but only when the separator is
    outside ASCII/fullwidth parentheses.  Every returned chunk is
    whitespace-stripped; empty chunks are dropped.
    """
    # split by comma and other symbols that outside parentheses
    parentheses = 0  # current nesting depth
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/")))
            and parentheses == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        elif c in ["(", "("]:
            parentheses += 1
            pron += c
        elif c in [")", ")"]:
            parentheses -= 1
            pron += c
        else:
            pron += c

    if pron.strip() != "":
        # BUG FIX: the final chunk was appended unstripped, unlike every
        # chunk split off above; strip it for consistency.
        pron_list.append(pron.strip())
    return pron_list

290 

291 

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophone words from the homophones table of {{zh-pron}}."""
    script_to_tag = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            word = clean_node(wxr, None, span_tag)
            if word in ("", "/"):
                continue
            if span_tag.attrs.get("lang", "") == "":
                continue
            script = span_tag.attrs.get("class", "")
            if script not in ("Hant", "Hans", "Hani"):
                continue
            sound = Sound(homophone=word, raw_tags=raw_tags)
            if script in script_to_tag:
                sound.tags.append(script_to_tag[script])
            sounds.append(sound)
    return sounds

313 

314 

def extract_homophones_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract homophones and categories from {{homophones}}.

    https://zh.wiktionary.org/wiki/Template:homophones
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds: list[Sound] = []
    cats: dict[str, list[str]] = {}
    # first positional arg is the language code of the homophone words
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for top_span in expanded.find_html(
        "span", attr_name="class", attr_value="homophones"
    ):
        for span_tag in top_span.find_html("span"):
            lang = span_tag.attrs.get("lang", "")
            classes = span_tag.attrs.get("class", "").split()
            if "Latn" in classes and len(sounds) > 0:
                # romanization of the previous homophone
                sounds[-1].roman = clean_node(wxr, None, span_tag)
            elif lang == lang_code:
                word = clean_node(wxr, None, span_tag)
                if word != "":
                    sounds.append(Sound(homophone=word))
            elif "qualifier-content" in classes and len(sounds) > 0:
                qualifier = clean_node(wxr, None, span_tag)
                if qualifier != "":
                    sounds[-1].raw_tags.append(qualifier)
                    translate_raw_tags(sounds[-1])
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, cats, link_node)
    return sounds, cats.get("categories", [])

345 

346 

def process_audio_template(
    wxr: WiktextractContext, template_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Build a single Sound with audio file URLs from {{audio}}.

    https://zh.wiktionary.org/wiki/Template:Audio
    """
    sound = Sound()
    # second positional arg: the sound file name
    filename = clean_node(
        wxr, None, template_node.template_parameters.get(2, "")
    )
    set_sound_file_url_fields(wxr, filename, sound)
    # third positional arg: an optional caption, kept as a raw tag
    caption = clean_node(
        wxr, None, template_node.template_parameters.get(3, "")
    )
    if caption != "":
        sound.raw_tags.append(caption)
    sound.raw_tags.extend(raw_tags)
    return [sound]

363 

364 

def extract_ipa_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
) -> tuple[list[Sound], list[str]]:
    """Extract IPA sounds and categories from an {{IPA}}-family template.

    https://zh.wiktionary.org/wiki/Template:IPA
    """
    cats: dict[str, list[str]] = {}
    sounds: list[Sound] = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, cats, expanded)
    top_level = []  # children outside any list, handled as one pseudo item
    for child in expanded.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                sounds.extend(extract_ipa_list_item(wxr, list_item, raw_tags))
        else:
            top_level.append(child)
    if len(top_level) > 0:
        pseudo_item = WikiNode(NodeKind.ROOT, 0)
        pseudo_item.children = top_level
        sounds.extend(extract_ipa_list_item(wxr, pseudo_item, raw_tags))
    return sounds, cats.get("categories", [])

389 

390 

def extract_ipa_list_item(
    wxr: WiktextractContext, list_item: WikiNode, shared_raw_tags: list[str]
) -> list[Sound]:
    """Extract IPA/romanization sounds from one expanded {{IPA}} list item.

    Qualifier spans before the first colon apply to every sound in the
    item (``shared_raw_tags``); qualifiers after a colon (``raw_tags``)
    only apply until the next IPA/Latn span, which consumes and clears
    them.
    """
    sounds = []
    shared_raw_tags = shared_raw_tags[:]  # copy: don't mutate caller's list
    raw_tags = []
    after_colon = False
    for node in list_item.children:
        if isinstance(node, str) and (":" in node or ":" in node):
            after_colon = True
        elif isinstance(node, HTMLNode) and node.tag == "span":
            span_class = node.attrs.get("class", "").split()
            if (
                "qualifier-content" in span_class
                or "ib-content" in span_class
                or "usage-label-accent" in span_class
            ):
                for raw_tag in (
                    clean_node(wxr, None, node).strip("() ").split(",")
                ):
                    raw_tag = raw_tag.strip()
                    if raw_tag != "":
                        if after_colon:
                            raw_tags.append(raw_tag)
                        else:
                            shared_raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound = Sound(
                    ipa=clean_node(wxr, None, node),
                    raw_tags=shared_raw_tags + raw_tags,
                )
                if sound.ipa != "":
                    translate_raw_tags(sound)
                    sounds.append(sound)
                # per-pronunciation qualifiers only cover one span
                raw_tags.clear()
            elif "Latn" in span_class:
                sound = Sound(
                    roman=clean_node(wxr, None, node),
                    raw_tags=shared_raw_tags + raw_tags,
                )
                if sound.roman != "":
                    translate_raw_tags(sound)
                    sounds.append(sound)
                raw_tags.clear()
    return sounds

436 

437 

def process_enpr_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Extract enPR pronunciations from positional args 1-3 of {{enPR}}.

    https://zh.wiktionary.org/wiki/Template:enPR
    """
    sounds = []
    index = 1
    # stop at the first missing positional parameter (at most three)
    while index <= 3 and index in template_node.template_parameters:
        enpr_value = clean_node(
            wxr, None, template_node.template_parameters.get(index)
        )
        sounds.append(Sound(enpr=enpr_value, raw_tags=raw_tags))
        index += 1
    return sounds

456 

457 

def extract_ja_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract Japanese pronunciations from {{ja-pron}}.

    Each rendered <li> yields one Sound (accent label, IPA, romanization,
    kana); accent-type links are mapped to English tags.  The "a"/"audio"
    parameter adds an audio-file Sound.  Returns (sounds, categories).
    """
    JA_PRON_ACCENTS = {
        "中高型": "Nakadaka",
        "平板型": "Heiban",
        "頭高型": "Atamadaka",
        "尾高型": "Odaka",
    }
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    sounds = []
    for li_tag in expanded_node.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "").split()
            if "usage-label-accent" in span_class:
                raw_tag = clean_node(wxr, None, span_tag).strip("() ")
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        for link_node in li_tag.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text in JA_PRON_ACCENTS:
                sound.tags.append(JA_PRON_ACCENTS[link_text])
        # keep only entries that carry an actual pronunciation
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            sounds.append(sound)

    # BUG FIX: template parameter values may be wikitext nodes rather than
    # plain strings; the old code called .strip() on the raw value, which
    # raises for non-str values.  Clean the value first, as done for every
    # other parameter read in this module.
    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    ).strip()
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        sounds.append(sound)
    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])

503 

504 

def extract_th_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract Thai pronunciations from the rendered {{th-pron}} table.

    Row header cells decide which Sound field the row's data cells fill
    (IPA, homophone, audio file, or a generic "other" value); other
    headers contribute raw tags and may span several rows via "rowspan".
    Returns (sounds, categories).
    """
    @dataclass
    class TableHeader:
        # labels parsed from a header cell
        raw_tags: list[str]
        # number of remaining rows this header still applies to
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # keep only headers whose rowspan still covers this row
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(標準泰語) IPA"):
                    # "(Standard Thai) IPA" row
                    field = "ipa"
                elif header_str.startswith("同音詞"):
                    # "homophones" row
                    field = "homophone"
                elif header_str == "音頻":
                    # "audio" row
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                # bracketed <small> text lists raw tags
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                # otherwise <small> holds the romanization
                                # of the previous sound
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # NOTE(review): once a "romanization" tag is
                                # translated, `field` stays "roman" for the
                                # remaining cells of this row — presumably
                                # intended; confirm against real pages.
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                sounds.append(sound)

    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])

598 

599 

def extract_rhymes_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand a {{rhymes}} template and collect its rhyme links."""
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    return extract_rhymes_list_item(wxr, expanded)

607 

608 

def extract_rhymes_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Collect rhymes (and their category links) from a list item."""
    cats: dict[str, list[str]] = {}
    sounds = [
        Sound(rhymes=rhyme)
        for link_node in list_item.find_child(NodeKind.LINK)
        if (rhyme := clean_node(wxr, cats, link_node)) != ""
    ]
    return sounds, cats.get("categories", [])

619 

620 

def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract hyphenations from {{hyph}}/{{hyphenation}} into base_data."""
    # first positional arg is the language code of the hyphenated word
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    extract_hyphenation_list_item(wxr, base_data, expanded, lang_code)

629 

630 

def extract_hyphenation_list_item(
    wxr: WiktextractContext,
    base_data: WordEntry,
    list_item: WikiNode,
    lang_code: str,
):
    """Add Hyphenation entries from spans whose lang matches ``lang_code``."""
    for span_tag in list_item.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        text = clean_node(wxr, None, span_tag)
        # syllables are joined with "‧"; drop empty/whitespace-only parts
        parts = [part.strip() for part in text.split("‧") if part.strip()]
        if len(parts) > 0:
            base_data.hyphenations.append(Hyphenation(parts=parts))

646 

647 

def extract_pl_pr_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract sounds from a "*-pr" (e.g. {{pl-pr}}) pronunciation template.

    List items rendered as an audio table or containing an IPA span are
    handled directly; remaining items are dispatched on the label before
    the first colon ("韻部" = rhymes, "音節化" = syllabification).
    Returns (sounds, categories); hyphenations go into ``base_data``.
    """
    sounds = []
    cats = {}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        skip_list = False
        for html_node in list_item.find_child(NodeKind.HTML):
            if html_node.tag == "table":
                # audio table rows
                sounds.extend(extract_pl_pr_sound_table(wxr, html_node))
                skip_list = True
                break
            elif (
                html_node.tag == "span"
                and "IPA" in html_node.attrs.get("class", "").split()
            ):
                sounds.extend(extract_ipa_list_item(wxr, list_item, []))
                skip_list = True
                break
        if skip_list:
            continue
        for index, node in enumerate(list_item.children):
            if isinstance(node, str) and (":" in node or ":" in node):
                # text before the (ASCII or fullwidth) colon names the
                # kind of data in this list item
                m = re.search(r":|:", node)
                list_type = clean_node(
                    wxr, None, list_item.children[:index] + [node[: m.start()]]
                )
                if list_type == "韻部":
                    # "rhymes"
                    new_sounds, _ = extract_rhymes_list_item(wxr, list_item)
                    sounds.extend(new_sounds)
                    break
                elif list_type == "音節化":
                    # "syllabification"
                    extract_hyphenation_list_item(
                        wxr, base_data, list_item, "pl"
                    )
                    break

    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])

690 

691 

def extract_pl_pr_sound_table(
    wxr: WiktextractContext, table_node: HTMLNode
) -> list[Sound]:
    """Extract audio files (with optional italic labels) from a sound table."""
    sounds: list[Sound] = []
    for tr_node in table_node.find_html("tr"):
        raw_tag = ""
        for td_node in tr_node.find_html("td"):
            td_class = td_node.attrs.get("class", "").split()
            if not td_class:
                # unstyled cell carries the italic label for this row
                for i_node in td_node.find_html("i"):
                    raw_tag = clean_node(wxr, None, i_node)
            elif "audiofile" in td_class:
                for link_node in td_node.find_child(NodeKind.LINK):
                    if len(link_node.largs) == 0 or len(link_node.largs[0]) == 0:
                        continue
                    file_name = clean_node(
                        wxr, None, link_node.largs[0][0]
                    ).removeprefix("File:")
                    if file_name == "":
                        continue
                    sound = Sound()
                    set_sound_file_url_fields(wxr, file_name, sound)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                        translate_raw_tags(sound)
                    sounds.append(sound)
    return sounds