Coverage for src/wiktextract/extractor/en/pronunciation.py: 82%

775 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-05-29 08:54 +0000

1import hashlib 

2import re 

3import urllib 

4from copy import deepcopy 

5from dataclasses import dataclass 

6from typing import Iterator, Literal, NamedTuple 

7 

8from wikitextprocessor import ( 

9 HTMLNode, 

10 LevelNode, 

11 NodeKind, 

12 TemplateNode, 

13 WikiNode, 

14) 

15 

16from ...clean import clean_value 

17from ...datautils import data_append, data_extend, split_at_comma_semi 

18from ...page import LEVEL_KINDS, clean_node, is_panel_template 

19from ...tags import valid_tags 

20from ...wxr_context import WiktextractContext 

21from ..share import create_audio_url_dict 

22from .form_descriptions import ( 

23 classify_desc, 

24 decode_tags, 

25 parse_pronunciation_tags, 

26) 

27from .parts_of_speech import part_of_speech_map 

28from .type_utils import Hyphenation, SoundData, TemplateArgs, WordData 

29 

# Prefixes, tags, and regexp for finding romanizations from the pronunciation
# section.
#
# Maps the literal line prefix found in the cleaned pronunciation text to a
# space-separated tag string; parse_pronunciation() looks the matched prefix
# up here and splits the value into the "tags" of a "forms" entry.
# NOTE(review): the whitespace inside these keys is significant, because the
# keys are matched literally by the regexp below -- confirm the exact
# characters against real template output before editing them.
pron_romanizations = {
    " Revised Romanization ": "romanization revised",
    " Revised Romanization (translit.) ": (
        "romanization revised transliteration"
    ),
    " McCune-Reischauer ": "McCune-Reischauer romanization",
    " McCune–Reischauer ": "McCune-Reischauer romanization",
    " Yale Romanization ": "Yale romanization",
}
# Matches one of the prefixes above at the start of any line ((?m)^),
# followed by the rest of that line:
#   group 1: the prefix (a key of pron_romanizations)
#   group 2: the remaining text on the line
# The alternatives are sorted longest first so that a prefix which is itself
# the beginning of a longer prefix cannot shadow the longer one.
pron_romanization_re = re.compile(
    "(?m)^("
    + "|".join(
        re.escape(x)
        for x in sorted(pron_romanizations.keys(), key=len, reverse=True)
    )
    + ")([^\n]+)"
)

# Pattern for the cleaned expansion of an IPA/enPR template:
#   group 1: optional leading "(...) " qualifier, including the parentheses
#   group 2: the text inside that leading qualifier
#   group 3: the label, either "IPA⁽ᵏᵉʸ⁾" or "enPR"
#   group 4: everything after "<label>: " up to the end of the text
#   group 5: group 4 without an optional trailing " (...)" qualifier
#   group 6: the optional trailing " (...)" qualifier, with parentheses
#   group 7: the text inside that trailing qualifier
IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$"
IPA_EXTRACT_RE = re.compile(IPA_EXTRACT)

52 

53 

class PronunciationPosMatch(NamedTuple):
    """Result of looking for part-of-speech names in a piece of text.

    Built by split_pronunciation_pos_text() and
    extract_pronunciation_pos_template().
    """

    # "pos" values taken from part_of_speech_map, without duplicates, in the
    # order in which they were first found.
    pos_values: list[str]
    # The pieces of the text that did not name a part of speech, joined
    # with ", "; empty when everything was recognized.
    residual: str

57 

58 

class PronunciationPosPrefix(NamedTuple):
    """A part-of-speech prefix found by extract_pos_prefix()."""

    # "pos" values named by the prefix.
    pos_values: list[str]
    # What is left of the text once the prefix has been removed (stripped);
    # empty when the text consisted of nothing but part-of-speech names.
    text: str
    # True for the bare ("Noun") and colon ("Noun: ...") forms, False for
    # the parenthesized form ("(noun) ..."). parse_pronunciation() copies
    # pos_values into its active_pos when this is True.
    is_persistent: bool

63 

64 

# Names of the templates whose arguments are inspected for part-of-speech
# names (see extract_pronunciation_pos_template()).
PRON_POS_TEMPLATE_NAMES = {
    # Templates whose positional arguments are all inspected, starting
    # with the first one.
    "i",
    "q",
    "qual",
    "qualifier",
    "sense",
    # Templates whose first positional argument may be a language code;
    # extract_pronunciation_pos_template() starts at the second argument
    # for these.
    "a",
    "accent",
    "label",
    "lb",
    "lbl",
}

77 

78 

def normalize_pronunciation_pos_label(label: str) -> str:
    """Reduce a label to the form used for part-of-speech lookups.

    The label is lower-cased, runs of whitespace are collapsed to a single
    space, a trailing parenthesized explanation is dropped, surrounding
    whitespace, parentheses and colons are stripped, and a trailing
    " sense" / " senses" is removed.

    Args:
        label: Raw label text, e.g. "Noun (barren areas)", "Verb senses:"
            or "(noun)".

    Returns:
        The normalized label, e.g. "noun" or "verb"; an empty string if
        nothing is left.
    """
    label = label.strip().lower()
    label = re.sub(r"\s+", " ", label)
    # Drop explanatory suffixes such as "noun (barren areas)" before
    # matching the label against part-of-speech names. A label that is
    # nothing but a parenthetical, such as "(noun)", has no suffix to drop:
    # removing the parenthetical would erase the whole label, so in that
    # case keep it and let the strip below unwrap the parentheses.
    without_suffix = re.sub(r"\s*\([^)]*\)\s*$", "", label).strip()
    if without_suffix:
        label = without_suffix
    label = label.strip(" \t\n\r():")
    label = re.sub(r"\s+senses?$", "", label).strip()
    return label

88 

89 

def split_pronunciation_pos_text(text: str) -> PronunciationPosMatch:
    """Separate part-of-speech names from everything else in `text`.

    The text is cut at commas, semicolons and the words "and" / "or". Each
    non-empty piece is normalized with normalize_pronunciation_pos_label()
    and looked up in part_of_speech_map.

    Returns:
        A PronunciationPosMatch whose `pos_values` holds the "pos" of every
        recognized piece (each value once, in order of appearance) and
        whose `residual` is the unrecognized pieces joined with ", ".
    """
    found_pos: list[str] = []
    leftovers: list[str] = []
    pieces = re.split(r"\s*(?:[,;]|\band\b|\bor\b)\s*", text)
    for piece in (raw.strip() for raw in pieces):
        if not piece:
            continue
        key = normalize_pronunciation_pos_label(piece)
        if key not in part_of_speech_map:
            # Not a part of speech: keep the piece as it was written.
            leftovers.append(piece)
            continue
        pos = part_of_speech_map[key]["pos"]
        if pos not in found_pos:
            found_pos.append(pos)
    return PronunciationPosMatch(found_pos, ", ".join(leftovers))

105 

106 

def set_sound_pos(sound: SoundData, pos_values: list[str] | None) -> None:
    """Store `pos_values` under the "pos" key of `sound`.

    Nothing happens when `pos_values` is None or empty. Otherwise `sound`
    receives its own copy of the list, replacing any previous "pos" value.
    """
    if pos_values:
        # Copy, so that the sound entry does not share the caller's list.
        sound["pos"] = [*pos_values]  # type: ignore[typeddict-unknown-key]

111 

112 

def parse_pronunciation_tags_with_pos(
    wxr: WiktextractContext, text: str, sound: SoundData
) -> list[str]:
    """Apply qualifier text to `sound`, treating PoS names specially.

    Part-of-speech names found in `text` are stored on `sound` with
    set_sound_pos(); whatever else remains is handed to
    parse_pronunciation_tags().

    Returns:
        The part-of-speech values found in `text` (possibly empty).
    """
    pos_values, leftover = split_pronunciation_pos_text(text)
    set_sound_pos(sound, pos_values)
    if leftover:
        parse_pronunciation_tags(wxr, leftover, sound)
    return pos_values

121 

122 

def extract_pos_prefix(text: str) -> PronunciationPosPrefix | None:
    """Detect a part-of-speech prefix at the start of `text`.

    Three shapes are tried, in this order:

    * the whole text names parts of speech and nothing else ("Noun");
      skipped when the stripped text both starts with "(" and ends
      with ")",
    * "<pos names>: rest",
    * "(<pos names>) rest".

    In every case the candidate must consist of part-of-speech names only,
    as judged by split_pronunciation_pos_text().

    Returns:
        A PronunciationPosPrefix with the PoS values, the stripped rest of
        the text ("" for the first shape) and `is_persistent` set to True
        for the first two shapes and False for the parenthesized one, or
        None when no shape applies.
    """

    def pure_pos(candidate: str) -> list[str] | None:
        # PoS values of `candidate` if it names parts of speech and
        # nothing else, otherwise None.
        found = split_pronunciation_pos_text(candidate)
        if found.residual or not found.pos_values:
            return None
        return found.pos_values

    trimmed = text.strip()
    wrapped_in_parens = trimmed.startswith("(") and trimmed.endswith(")")
    if not wrapped_in_parens and (pos_values := pure_pos(text)):
        return PronunciationPosPrefix(pos_values, "", True)

    # (pattern, is_persistent); group 1 is the candidate prefix and
    # group 2 the rest of the text.
    prefixed_shapes = (
        (r"\s*([^:()]+?)\s*:\s*(.*)$", True),
        (r"\s*\(([^()]*)\)\s*(.*)$", False),
    )
    for pattern, persistent in prefixed_shapes:
        m = re.match(pattern, text)
        if m and (pos_values := pure_pos(m.group(1))):
            return PronunciationPosPrefix(
                pos_values, m.group(2).strip(), persistent
            )

    return None

147 

148 

def extract_pronunciation_pos_template(
    wxr: WiktextractContext,
    name: str,
    ht: TemplateArgs,
    lang_code: str,
) -> PronunciationPosMatch:
    """Collect part-of-speech names from a qualifier-like template.

    Args:
        wxr: Context passed through to clean_node().
        name: Template name.
        ht: Template arguments; only integer (positional) keys are read.
        lang_code: Language code of the section being parsed.

    For "a", "accent", "lb", "lbl" and "label" the positional arguments
    from 2 upwards are examined; if there are none and argument 1 is not
    `lang_code`, argument 1 is examined instead. For every other template
    name all positional arguments from 1 upwards are examined.

    Returns:
        A PronunciationPosMatch with the PoS values found across all
        examined arguments (each once, in order) and the unrecognized
        remainders of the arguments joined with ", ".
    """
    skips_first_arg = name in {"a", "accent", "lb", "lbl", "label"}
    first_index = 2 if skips_first_arg else 1
    candidates = [
        value
        for key, value in ht.items()
        if isinstance(key, int) and key >= first_index
    ]
    if skips_first_arg and not candidates and ht.get(1) != lang_code:
        # Only one positional argument and it is not the language code, so
        # it is examined as a label.
        candidates = [ht.get(1, "")]

    collected_pos: list[str] = []
    leftovers: list[str] = []
    for candidate in candidates:
        found = split_pronunciation_pos_text(
            clean_node(wxr, None, [candidate])
        )
        for pos in found.pos_values:
            if pos not in collected_pos:
                collected_pos.append(pos)
        if found.residual:
            leftovers.append(found.residual)
    return PronunciationPosMatch(collected_pos, ", ".join(leftovers))

181 

182 

def extract_pron_template(
    wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str
) -> tuple[SoundData, list[SoundData]] | None:
    """Parse the expanded text of an IPA or enPR template into sound data.

    In post_template_fn, this is used to handle all enPR and IPA templates
    so that we can leave breadcrumbs in the text that can later be handled
    there. We return a `base_data` so that if there are two
    or more templates on the same line, like this:
      (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/
    then we can apply base_data fields to other templates, too, if needed.

    Args:
        wxr: Context used for cleaning the text and for error reporting.
        tname: Template name; "enPR" stores each pronunciation under
            "enpr", any other name stores it under "ipa".
        targs: Template arguments; only the presence of "qq" is checked.
        expanded: The expanded template text.

    Returns:
        `(base_data, sound_datas)`, where `base_data` carries the tags
        parsed from the qualifiers that apply to the whole template and
        `sound_datas` has one entry per pronunciation, each a deep copy of
        `base_data` extended with that entry's own qualifiers. None (after
        reporting an error through `wxr.wtp.error`) if the cleaned text
        does not match IPA_EXTRACT_RE or has no pronunciation body.
    """
    cleaned = clean_value(wxr, expanded)
    # print(f"extract_pron_template input: {tname=} {expanded=}-> {cleaned=}")
    m = IPA_EXTRACT_RE.match(cleaned)
    if not m:
        wxr.wtp.error(
            f"Text cannot match IPA_EXTRACT_RE regex: "
            f"{cleaned=}, {tname=}, {targs=}",
            sortid="en/pronunciation/54",
        )
        return None
    # for i, group in enumerate(m.groups()):
    #     print(i + 1, repr(group))
    # Group 2 is the text of the optional leading "(...) " qualifier.
    main_qual = m.group(2) or ""
    if "qq" in targs:
        # If the template has been given a qualifier that applies to
        # every entry, but which also happens to appear at the end
        # which can be confused with the post-qualifier of a single
        # entry in the style of "... /ipa3/ (foo) (bar)", where foo
        # might not be present so the bar looks like it only might
        # apply to `/ipa3/`
        # Group 5 is the body without the trailing " (...)"; group 7 is
        # the text inside that trailing parenthetical.
        pron_body = m.group(5)
        post_qual = m.group(7)
    else:
        # Group 4 is everything after the "IPA...: " / "enPR: " label, so
        # a trailing parenthetical stays with the last entry.
        pron_body = m.group(4)
        post_qual = ""

    if not pron_body:
        wxr.wtp.error(
            f"Regex failed to find 'body' from {cleaned=}",
            sortid="en/pronunciation/81",
        )
        return None

    base_data: SoundData = {}
    if main_qual:
        parse_pronunciation_tags(wxr, main_qual, base_data)
    if post_qual:
        parse_pronunciation_tags(wxr, post_qual, base_data)
    # This base_data is used as the base copy for all entries from this
    # template, but it is also returned so that its contents may be applied
    # to other templates on the same line.
    # print(f"{base_data=}")

    sound_datas: list[SoundData] = []

    # `parts` holds one list per comma/semicolon separated entry; each inner
    # list holds that entry's pieces in order, with parenthesized pieces
    # kept as "(...)" strings. `inside` counts currently open parentheses
    # and `current` collects the fragments of the piece being built.
    parts: list[list[str]] = [[]]
    inside = 0
    current: list[str] = []
    for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)):
        # Split the line on commas and semicolons outside of parens. This
        # gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)"
        # print(f"  {i=}, {p=}")
        comp = p.strip()
        if not p:
            continue
        if comp == "(":
            if not inside and i > 0:
                # A top-level "(" ends the piece collected so far.
                # NOTE(review): `stripped` is bound here, but the same
                # joined text is recomputed on the next line.
                if stripped := "".join(current).strip():
                    parts[-1].append("".join(current).strip())  # type:ignore[arg-type]
            # NOTE(review): for a nested "(" this restarts `current`
            # without saving what was collected since the outer "(" --
            # confirm that this is intended.
            current = [p]
            inside += 1
            continue
        if comp == ")":
            inside -= 1
            if not inside:
                # The outermost parenthesis closed: store "(...)" as a piece.
                if stripped := "".join(current).strip():
                    current.append(p)
                    parts[-1].append("".join(current).strip())  # type:ignore[arg-type]
                    current = []
            # NOTE(review): a ")" that leaves `inside` non-zero is not added
            # to `current` -- confirm that this is intended.
            continue
        if not inside and comp in (",", ";"):
            # A top-level separator finishes the current entry.
            if stripped := "".join(current).strip():
                parts[-1].append(stripped)  # type:ignore[arg-type]
                current = []
            parts.append([])
            continue
        current.append(p)
    if current:
        parts[-1].append("".join(current).strip())

    # print(f">>>>>> {parts=}")
    # Normalize every entry to [pre-qualifier, pronunciation,
    # post-qualifier], using "" for a missing qualifier.
    new_parts: list[list[str]] = []
    for entry in parts:
        if not entry:
            continue
        new_entry: list[str] = []
        # i1 is True (1 when used as a slice index) if the first piece is a
        # parenthesized qualifier, otherwise False (0).
        i1: int = entry[0].startswith("(") and entry[0].endswith(")")
        if i1:
            new_entry.append(entry[0][1:-1].strip())
        else:
            new_entry.append("")
        # i2 becomes the end of the slice holding the pronunciation: -1 to
        # leave out a parenthesized last piece, else len(entry). The last
        # piece only counts as a post-qualifier if it is not also the first.
        i2: int = (
            entry[-1].startswith("(")
            and entry[-1].endswith(")")
            and len(entry) > 1
        )
        if i2 == 0:
            i2 = len(entry)
        else:
            i2 = -1
        new_entry.append("".join(entry[i1:i2]).strip())
        if not new_entry[-1]:
            # The error is reported, but the entry is still kept with an
            # empty pronunciation.
            wxr.wtp.error(
                f"Missing IPA/enPRO sound data between qualifiers?{entry=}",
                sortid="en/pronunciation/153",
            )
        if i2 == -1:
            new_entry.append(entry[-1][1:-1].strip())
        else:
            new_entry.append("")
        new_parts.append(new_entry)

    # print(f">>>>> {new_parts=}")

    for part in new_parts:
        # Every entry starts from a private copy of the template-wide data.
        sd = deepcopy(base_data)
        if part[0]:
            parse_pronunciation_tags(wxr, part[0], sd)
        if part[2]:
            parse_pronunciation_tags(wxr, part[2], sd)
        if tname == "enPR":
            sd["enpr"] = part[1]
        else:
            sd["ipa"] = part[1]
        sound_datas.append(sd)

    # print(f"BASE_DATA: {base_data}")
    # print(f"SOUND_DATAS: {sound_datas=}")

    return base_data, sound_datas

323 

324 

325def parse_pronunciation( 

326 wxr: WiktextractContext, 

327 level_node: LevelNode, 

328 data: WordData, 

329 etym_data: WordData, 

330 have_etym: bool, 

331 base_data: WordData, 

332 lang_code: str, 

333) -> None: 

334 """Parses the pronunciation section from a language section on a 

335 page.""" 

336 if level_node.kind in LEVEL_KINDS: 336 ↛ 349line 336 didn't jump to line 349 because the condition on line 336 was always true

337 contents: list[str | WikiNode | TemplateNode] = [] 

338 for node in level_node.children: 

339 if isinstance(node, TemplateNode): 

340 if node.template_name == "th-pron": 

341 extract_th_pron_template(wxr, data, node) 

342 elif node.template_name == "zh-pron": 

343 extract_zh_pron_template(wxr, data, node) 

344 else: 

345 contents.append(node) 

346 else: 

347 contents.append(node) 

348 else: 

349 contents = [level_node] 

350 # Remove subsections, such as Usage notes. They may contain IPAchar 

351 # templates in running text, and we do not want to extract IPAs from 

352 # those. 

353 # Filter out only LEVEL_KINDS; 'or' is doing heavy lifting here 

354 # Slip through not-WikiNodes, then slip through WikiNodes that 

355 # are not LEVEL_KINDS. 

356 contents = [ 

357 x 

358 for x in contents 

359 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

360 ] 

361 if not any( 

362 isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents 

363 ): 

364 # expand all templates 

365 new_contents: list[str | WikiNode | TemplateNode] = [] 

366 for lst in contents: 

367 if isinstance(lst, TemplateNode): 

368 temp = wxr.wtp.node_to_wikitext(lst) 

369 temp = wxr.wtp.expand(temp) 

370 temp_parsed = wxr.wtp.parse(temp) 

371 new_contents.extend(temp_parsed.children) 

372 else: 

373 new_contents.append(lst) 

374 contents = new_contents 

375 

376 if have_etym and data is base_data: 376 ↛ 377line 376 didn't jump to line 377 because the condition on line 376 was never true

377 data = etym_data 

378 pron_templates: list[tuple[SoundData, list[SoundData]]] = [] 

379 pron_pos_markers: list[list[str]] = [] 

380 hyphenations: list[Hyphenation] = [] 

381 audios: list[SoundData] = [] 

382 have_panel_templates = False 

383 

384 def parse_pronunciation_template_fn( 

385 name: str, ht: TemplateArgs 

386 ) -> str | None: 

387 """Handle pronunciation and hyphenation templates""" 

388 # _template_fn handles templates *before* they are expanded; 

389 # this allows for special handling before all the work needed 

390 # for expansion is done. 

391 nonlocal have_panel_templates 

392 if is_panel_template(wxr, name): 

393 have_panel_templates = True 

394 return "" 

395 if name == "audio": 

396 filename = ht.get(2) or "" 

397 audio: SoundData = {"audio": filename.strip()} 

398 dialect = ht.get("a", "") 

399 if "aa" in ht: 399 ↛ 400line 399 didn't jump to line 400 because the condition on line 399 was never true

400 dialect += ", " + ht.get("aa", "") 

401 if dialect: 

402 dialect = dialect.replace("<", "").replace(">", "") 

403 dialect = clean_node(wxr, None, [dialect]) 

404 for part in split_at_comma_semi(dialect): 

405 if "(" not in part: 

406 parse_pronunciation_tags(wxr, part, audio) 

407 else: 

408 for ppart in re.split(r"[][()]", part): 

409 parse_pronunciation_tags(wxr, ppart, audio) 

410 desc = ht.get(3) or "" 

411 desc = clean_node(wxr, None, [desc]) 

412 if desc: 412 ↛ 413line 412 didn't jump to line 413 because the condition on line 412 was never true

413 audio["text"] = desc 

414 m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc) 

415 skip = False 

416 if m: 416 ↛ 417line 416 didn't jump to line 417 because the condition on line 416 was never true

417 par = m.group(1) 

418 cls = classify_desc(par) 

419 if cls == "tags": 

420 parse_pronunciation_tags(wxr, par, audio) 

421 else: 

422 skip = True 

423 if skip: 423 ↛ 424line 423 didn't jump to line 424 because the condition on line 423 was never true

424 return "" 

425 audios.append(audio) 

426 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

427 if name == "audio-IPA": 427 ↛ 428line 427 didn't jump to line 428 because the condition on line 427 was never true

428 filename = ht.get(2) or "" 

429 ipa = ht.get(3) or "" 

430 dial = ht.get("dial") 

431 audio = {"audio": filename.strip()} 

432 if dial: 

433 dial = clean_node(wxr, None, [dial]) 

434 audio["text"] = dial 

435 if ipa: 

436 audio["audio-ipa"] = ipa 

437 audios.append(audio) 

438 # The problem with these IPAs is that they often just describe 

439 # what's in the sound file, rather than giving the pronunciation 

440 # of the word alone. It is common for audio files to contain 

441 # multiple pronunciations or articles in the same file, and then 

442 # this IPA often describes what is in the file. 

443 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

444 if name == "audio-pron": 

445 filename = ht.get(2) or "" 

446 ipa = ht.get("ipa") or "" 

447 dial = ht.get("dial") 

448 country = ht.get("country") 

449 audio = {"audio": filename.strip()} 

450 if dial: 450 ↛ 454line 450 didn't jump to line 454 because the condition on line 450 was always true

451 dial = clean_node(wxr, None, [dial]) 

452 audio["text"] = dial 

453 parse_pronunciation_tags(wxr, dial, audio) 

454 if country: 454 ↛ 456line 454 didn't jump to line 456 because the condition on line 454 was always true

455 parse_pronunciation_tags(wxr, country, audio) 

456 if ipa: 456 ↛ 458line 456 didn't jump to line 458 because the condition on line 456 was always true

457 audio["audio-ipa"] = ipa 

458 audios.append(audio) 

459 # XXX do we really want to extract pronunciations from these? 

460 # Or are they spurious / just describing what is in the 

461 # audio file? 

462 # if ipa: 

463 # pron = {"ipa": ipa} 

464 # if dial: 

465 # parse_pronunciation_tags(wxr, dial, pron) 

466 # if country: 

467 # parse_pronunciation_tags(wxr, country, pron) 

468 # data_append(data, "sounds", pron) 

469 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

470 if name in ("hyph", "hyphenation"): 

471 # {{hyph|en|re|late|caption="Hyphenation UK:"}} 

472 # {{hyphenation|it|quiè|to||qui|è|to||quié|to||qui|é|to}} 

473 # and also nocaption=1 

474 caption = clean_node(wxr, None, ht.get("caption", "")) 

475 tagsets, _ = decode_tags(caption) 

476 # flatten the tagsets into one; it would be really weird to have 

477 # several tagsets for a hyphenation caption 

478 tags = sorted(set(tag for tagset in tagsets for tag in tagset)) 

479 # We'll just ignore any errors from tags, it's not very important 

480 # for hyphenation 

481 tags = [tag for tag in tags if not tag.startswith("error")] 

482 hyph_sequences: list[list[str]] = [[]] 

483 for text in [ 

484 t for (k, t) in ht.items() if (isinstance(k, int) and k >= 2) 

485 ]: 

486 if not text: 

487 hyph_sequences.append([]) 

488 else: 

489 hyph_sequences[-1].append(clean_node(wxr, None, text)) 

490 for seq in hyph_sequences: 

491 hyphenations.append(Hyphenation(parts=seq, tags=tags)) 

492 return "" 

493 return None 

494 

495 may_be_duplicates = False 

496 

497 def parse_pron_post_template_fn( 

498 name: str, ht: TemplateArgs, text: str 

499 ) -> str | None: 

500 # _post_template_fn handles templates *after* the work to expand 

501 # them has been done; this is exactly the same as _template_fn, 

502 # except with the additional expanded text as an input, and 

503 # possible side-effects from the expansion and recursion (like 

504 # calling other subtemplates that are handled in _template_fn. 

505 nonlocal may_be_duplicates 

506 if is_panel_template(wxr, name): 506 ↛ 507line 506 didn't jump to line 507 because the condition on line 506 was never true

507 return "" 

508 if name in PRON_POS_TEMPLATE_NAMES: 

509 pos_match = extract_pronunciation_pos_template( 

510 wxr, name, ht, lang_code 

511 ) 

512 if pos_match.pos_values: 

513 pron_pos_markers.append(pos_match.pos_values) 

514 marker = ( 

515 f"__PRON_POS_MARKER_{len(pron_pos_markers) - 1}__" 

516 ) 

517 if pos_match.residual: 517 ↛ 518line 517 didn't jump to line 518 because the condition on line 517 was never true

518 return f"{marker} ({pos_match.residual})" 

519 return marker 

520 if name in { 

521 *PRON_POS_TEMPLATE_NAMES, 

522 "l", 

523 "link", 

524 }: 

525 # Kludge: when these templates expand to /.../ or [...], 

526 # replace the expansion by something safe. This is used 

527 # to filter spurious IPA-looking expansions that aren't really 

528 # IPAs. We probably don't care about these templates in the 

529 # contexts where they expand to something containing these. 

530 v = re.sub(r'href="[^"]*"', "", text) # Ignore URLs 

531 v = re.sub(r'src="[^"]*"', "", v) 

532 if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v): 

533 # Note: replacing by empty results in Lua errors that we 

534 # would rather not have. For example, voi/Middle Vietnamese 

535 # uses {{a|{{l{{vi|...}}}}, and the {{a|...}} will fail 

536 # if {{l|...}} returns empty. 

537 return "stripped-by-parse_pron_post_template_fn" 

538 if name in ("IPA", "enPR"): 

539 # Extract the data from IPA and enPR templates (same underlying 

540 # template) and replace them in-text with magical cookie that 

541 # can be later used to refer to the data's index inside 

542 # pron_templates. 

543 if pron_t := extract_pron_template(wxr, name, ht, text): 

544 pron_templates.append(pron_t) 

545 return f"__PRON_TEMPLATE_{len(pron_templates) - 1}__" 

546 # Catch templates that generate duplicate sound data entries 

547 # here; if the text produces a big, toggleable section, the 

548 # "header" for that section might be duplicated. Add more conditions 

549 # if necessary. 

550 if text.startswith("<") and "vsToggleElement" in text: 550 ↛ 551line 550 didn't jump to line 551 because the condition on line 550 was never true

551 may_be_duplicates = True 

552 return text 

553 

554 def flattened_tree(lines: list[WikiNode | str]) -> Iterator[WikiNode | str]: 

555 assert isinstance(lines, list) 

556 for line in lines: 

557 yield from flattened_tree1(line) 

558 

559 def flattened_tree1(node: WikiNode | str) -> Iterator[WikiNode | str]: 

560 assert isinstance(node, (WikiNode, str)) 

561 if isinstance(node, str): 

562 yield node 

563 return 

564 elif node.kind == NodeKind.LIST: 

565 for item in node.children: 

566 yield from flattened_tree1(item) 

567 elif node.kind == NodeKind.LIST_ITEM: 

568 new_children = [] 

569 sublist = None 

570 for child in node.children: 

571 if isinstance(child, WikiNode) and child.kind == NodeKind.LIST: 

572 sublist = child 

573 else: 

574 new_children.append(child) 

575 node.children = new_children 

576 node.sarg = "*" 

577 yield node 

578 if sublist: 

579 yield from flattened_tree1(sublist) 

580 else: 

581 yield node 

582 

583 # XXX Do not use flattened_tree more than once here, for example for 

584 # debug printing... The underlying data is changed, and the separated 

585 # sublists disappear. 

586 

587 # Kludge for templates that generate several lines, but haven't 

588 # been caught by earlier kludges... 

589 def split_cleaned_node_on_newlines( 

590 contents: list[WikiNode | str], 

591 ) -> Iterator[str]: 

592 for litem in flattened_tree(contents): 

593 ipa_text = clean_node( 

594 wxr, 

595 data, 

596 litem, 

597 template_fn=parse_pronunciation_template_fn, 

598 post_template_fn=parse_pron_post_template_fn, 

599 ) 

600 for line in ipa_text.splitlines(): 

601 yield line 

602 

603 # have_pronunciations = False 

604 active_pos: list[str] | None = None 

605 

606 for line in split_cleaned_node_on_newlines(contents): 

607 prefix: str | None = None 

608 earlier_base_data: SoundData | None = None 

609 line_pos: list[str] | None = None 

610 current_group_sounds: list[SoundData] = [] 

611 line_has_sound = False 

612 if not line: 612 ↛ 613line 612 didn't jump to line 613 because the condition on line 612 was never true

613 continue 

614 

615 split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line) 

616 for i, text in enumerate(split_templates): 

617 if not text: 

618 continue 

619 # clean up starts at the start of the line 

620 text = re.sub(r"^\**\s*", "", text).strip() 

621 if i == 0: 

622 # At the start of a line, check for stuff like "Noun:" 

623 # for active_pos; active_pos is a temporary data field 

624 # given to each saved SoundData entry which is later 

625 # used to sort the entries into their respective PoSes. 

626 if pos_prefix := extract_pos_prefix(text): 

627 text = pos_prefix.text 

628 line_pos = pos_prefix.pos_values 

629 if pos_prefix.is_persistent: 

630 active_pos = pos_prefix.pos_values 

631 if not text: 

632 continue 

633 

634 m = re.search(r"__PRON_POS_MARKER_(\d+)__", text) 

635 while m: 

636 if current_group_sounds and re.search( 

637 r"[,;]", text[: m.start()] 

638 ): 

639 current_group_sounds = [] 

640 pos_values = pron_pos_markers[int(m.group(1))] 

641 if current_group_sounds: 

642 for sound in current_group_sounds: 

643 set_sound_pos(sound, pos_values) 

644 line_pos = pos_values 

645 text = text[: m.start()] + text[m.end() :] 

646 m = re.search(r"__PRON_POS_MARKER_(\d+)__", text) 

647 text = text.strip() 

648 if not text: 

649 continue 

650 

651 if i % 2 == 1: 

652 # re.split (with capture groups) splits the lines so that 

653 # every even entry is a captured splitter; odd lines are either 

654 # empty strings or stuff around the splitters. 

655 base_pron_data, first_prons = pron_templates[int(text)] 

656 if base_pron_data: 

657 earlier_base_data = base_pron_data 

658 # print(f"Set {earlier_base_data=}") 

659 elif earlier_base_data is not None: 

660 # merge data from an earlier iteration of this loop 

661 for pr in first_prons: 

662 if "note" in pr and "note" in earlier_base_data: 662 ↛ 663line 662 didn't jump to line 663 because the condition on line 662 was never true

663 pr["note"] += ";" + earlier_base_data.get( 

664 "note", "" 

665 ) 

666 elif "note" in earlier_base_data: 666 ↛ 667line 666 didn't jump to line 667 because the condition on line 666 was never true

667 pr["note"] = earlier_base_data["note"] 

668 if "topics" in earlier_base_data: 668 ↛ 669line 668 didn't jump to line 669 because the condition on line 668 was never true

669 data_extend( 

670 pr, "topics", earlier_base_data["topics"] 

671 ) 

672 if "tags" in pr and "tags" in earlier_base_data: 672 ↛ 673line 672 didn't jump to line 673 because the condition on line 672 was never true

673 pr["tags"].extend(earlier_base_data["tags"]) 

674 elif "tags" in earlier_base_data: 674 ↛ 661line 674 didn't jump to line 661 because the condition on line 674 was always true

675 pr["tags"] = sorted(set(earlier_base_data["tags"])) 

676 for pr in first_prons: 

677 set_sound_pos(pr, line_pos or active_pos) 

678 if pr not in data.get("sounds", ()): 678 ↛ 680line 678 didn't jump to line 680 because the condition on line 678 was always true

679 data_append(data, "sounds", pr) 

680 current_group_sounds.append(pr) 

681 line_has_sound = True 

682 # This bit is handled 

683 continue 

684 

685 if "IPA" in text: 

686 field: Literal[ 

687 "audio", 

688 "audio-ipa", 

689 "enpr", 

690 "form", 

691 "hangeul", 

692 "homophone", 

693 "ipa", 

694 "mp3_url", 

695 "note", 

696 "ogg_url", 

697 "other", 

698 "rhymes", 

699 "tags", 

700 "text", 

701 "topics", 

702 "zh-pron", 

703 ] = "ipa" 

704 else: 

705 # This is used for Rhymes, Homophones, etc 

706 field = "other" 

707 

708 # Check if it contains Japanese "Tokyo" pronunciation with 

709 # special syntax 

710 m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text) 

711 if m: 711 ↛ 712line 711 didn't jump to line 712 because the condition on line 711 was never true

712 pron: SoundData = {field: m.group(1)} # type: ignore[misc] 

713 set_sound_pos(pron, line_pos or active_pos) 

714 data_append(data, "sounds", pron) 

715 current_group_sounds.append(pron) 

716 line_has_sound = True 

717 # have_pronunciations = True 

718 continue 

719 

720 # Check if it contains Rhymes 

721 m = re.match(r"\s*Rhymes?: (.*)", text) 

722 if m: 

723 for ending in split_at_comma_semi(m.group(1)): 

724 ending = ending.strip() 

725 if ending: 725 ↛ 723line 725 didn't jump to line 723 because the condition on line 725 was always true

726 pron = {"rhymes": ending} 

727 set_sound_pos(pron, line_pos or active_pos) 

728 data_append(data, "sounds", pron) 

729 current_group_sounds.append(pron) 

730 line_has_sound = True 

731 # have_pronunciations = True 

732 continue 

733 

734 # Check if it contains homophones 

735 m = re.search(r"(?m)\bHomophones?: (.*)", text) 

736 if m: 

737 for w in split_at_comma_semi(m.group(1)): 

738 w = w.strip() 

739 if w: 739 ↛ 737line 739 didn't jump to line 737 because the condition on line 739 was always true

740 pron = {"homophone": w} 

741 set_sound_pos(pron, line_pos or active_pos) 

742 data_append(data, "sounds", pron) 

743 current_group_sounds.append(pron) 

744 line_has_sound = True 

745 # have_pronunciations = True 

746 continue 

747 

748 # Check if it contains Phonetic hangeul 

749 m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text) 

750 if m: 750 ↛ 751line 750 didn't jump to line 751 because the condition on line 750 was never true

751 seen = set() 

752 for w in m.group(1).split("/"): 

753 w = w.strip() 

754 if w and w not in seen: 

755 seen.add(w) 

756 pron = {"hangeul": w} 

757 set_sound_pos(pron, line_pos or active_pos) 

758 data_append(data, "sounds", pron) 

759 current_group_sounds.append(pron) 

760 line_has_sound = True 

761 # have_pronunciations = True 

762 

763 # This regex-based hyphenation detection left as backup 

764 m = re.search(r"\b(Syllabification|Hyphenation): *([^\n.]*)", text) 

765 if m: 

766 data_append(data, "hyphenation", m.group(2)) 

767 commaseparated = m.group(2).split(",") 

768 if len(commaseparated) > 1: 768 ↛ 779line 768 didn't jump to line 779 because the condition on line 768 was always true

769 for h in commaseparated: 

770 # That second characters looks like a dash but it's 

771 # actually unicode decimal code 8231, hyphenation dash 

772 # Add more delimiters here if needed. 

773 parts = re.split(r"-|‧", h.strip()) 

774 data_append( 

775 data, "hyphenations", Hyphenation(parts=parts) 

776 ) 

777 ... 

778 else: 

779 data_append( 

780 data, 

781 "hyphenations", 

782 Hyphenation(parts=m.group(2).split(sep="-")), 

783 ) 

784 # have_pronunciations = True 

785 

786 # See if it contains a word prefix restricting which forms the 

787 # pronunciation applies to (see amica/Latin) and/or parenthesized 

788 # tags. 

789 m = re.match( 

790 r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text 

791 ) 

792 if m: 

793 prefix = m.group(2) or "" 

794 tagstext = m.group(3) 

795 text = text[m.end() :] 

796 else: 

797 m = re.match(r"^[*#\s]*([-\w]+):\s+", text) 

798 if m: 

799 prefix = m.group(1) 

800 tagstext = "" 

801 text = text[m.end() :] 

802 else: 

803 # Spanish has tags before pronunciations, eg. aceite/Spanish 

804 m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text) 

805 if m: 805 ↛ 806line 805 didn't jump to line 806 because the condition on line 805 was never true

806 tagstext = m.group(1) 

807 text = m.group(2) 

808 else: 

809 # No prefix. In this case, we inherit prefix 

810 # from previous entry. This particularly 

811 # applies for nested Audio files. 

812 tagstext = "" 

813 if tagstext: 

814 earlier_base_data = {} 

815 parse_pronunciation_tags_with_pos( 

816 wxr, tagstext, earlier_base_data 

817 ) 

818 

819 # Find romanizations from the pronunciation section (routinely 

820 # produced for Korean by {{ko-IPA}}) 

821 for m in re.finditer(pron_romanization_re, text): 821 ↛ 822line 821 didn't jump to line 822 because the loop on line 821 never started

822 prefix = m.group(1) 

823 w = m.group(2).strip() 

824 tag = pron_romanizations[prefix] 

825 form = {"form": w, "tags": tag.split()} 

826 data_append(data, "forms", form) 

827 

828 # Find IPA pronunciations 

829 for m in re.finditer( 

830 r"(?m)/[^][\n/,]+?/" r"|" r"\[[^]\n0-9,/][^],/]*?\]", text 

831 ): 

832 v = m.group(0) 

833 # The regexp above can match file links. Skip them. 

834 if v.startswith("[[File:"): 834 ↛ 835line 834 didn't jump to line 835 because the condition on line 834 was never true

835 continue 

836 if v == "/wiki.local/": 836 ↛ 837line 836 didn't jump to line 837 because the condition on line 836 was never true

837 continue 

838 if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text: 838 ↛ 839line 838 didn't jump to line 839 because the condition on line 838 was never true

839 m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text) 

840 assert m 

841 idx = int(m.group(1)) 

842 if idx >= len(audios): 

843 continue 

844 if not audios[idx].get("audio-ipa"): 

845 audios[idx]["audio-ipa"] = v 

846 if prefix: 

847 audios[idx]["form"] = prefix 

848 else: 

849 if earlier_base_data: 

850 pron = deepcopy(earlier_base_data) 

851 pron[field] = v 

852 else: 

853 pron = {field: v} # type: ignore[misc] 

854 if prefix: 

855 pron["form"] = prefix 

856 if "pos" not in pron: 856 ↛ 858line 856 didn't jump to line 858 because the condition on line 856 was always true

857 set_sound_pos(pron, line_pos or active_pos) 

858 if may_be_duplicates is True: 858 ↛ 859line 858 didn't jump to line 859 because the condition on line 858 was never true

859 ok = True 

860 for comp_sound in data.get("sounds", []): 

861 # Python has dict comparison since 3.8 

862 if pron == comp_sound: 

863 ok = False 

864 break 

865 if ok: 

866 data_append(data, "sounds", pron) 

867 else: 

868 data_append(data, "sounds", pron) 

869 current_group_sounds.append(pron) 

870 line_has_sound = True 

871 # have_pronunciations = True 

872 if current_group_sounds and re.search(r"[,;]", text): 

873 current_group_sounds = [] 

874 if line_pos and not line_has_sound: 

875 active_pos = line_pos 

876 

877 # XXX what about {{hyphenation|...}}, {{hyph|...}} 

878 # and those used to be stored under "hyphenation" 

879 

880 # Add data that was collected in template_fn 

881 for audio in audios: 

882 if "audio" in audio: 882 ↛ 938line 882 didn't jump to line 938 because the condition on line 882 was always true

883 # Compute audio file URLs 

884 fn = audio["audio"] 

885 # Strip certain characters, e.g., left-to-right mark 

886 fn = re.sub(r"[\u200f\u200e]", "", fn) 

887 fn = fn.strip() 

888 fn = urllib.parse.unquote(fn) 

889 # First character is usually uppercased 

890 if re.match(r"^[a-z][a-z]+", fn): 

891 fn = fn[0].upper() + fn[1:] 

892 if fn in wxr.config.redirects: 892 ↛ 893line 892 didn't jump to line 893 because the condition on line 892 was never true

893 fn = wxr.config.redirects[fn] 

894 # File extension is lowercased 

895 # XXX some words seem to need this, some don't seem to 

896 # have this??? what is the exact rule? 

897 # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn) 

898 # Spaces are converted to underscores 

899 fn = re.sub(r"\s+", "_", fn) 

900 # Compute hash digest part 

901 h = hashlib.md5() 

902 hname = fn.encode("utf-8") 

903 h.update(hname) 

904 digest = h.hexdigest() 

905 # Quote filename for URL 

906 qfn = urllib.parse.quote(fn) 

907 # For safety when writing files 

908 qfn = qfn.replace("/", "__slash__") 

909 if re.search(r"(?i)\.(ogg|oga)$", fn): 

910 ogg = ( 

911 "https://upload.wikimedia.org/wikipedia/" 

912 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

913 ) 

914 else: 

915 ogg = ( 

916 "https://upload.wikimedia.org/wikipedia/" 

917 "commons/transcoded/" 

918 "{}/{}/{}/{}.ogg".format( 

919 digest[:1], digest[:2], qfn, qfn 

920 ) 

921 ) 

922 if re.search(r"(?i)\.(mp3)$", fn): 922 ↛ 923line 922 didn't jump to line 923 because the condition on line 922 was never true

923 mp3 = ( 

924 "https://upload.wikimedia.org/wikipedia/" 

925 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

926 ) 

927 else: 

928 mp3 = ( 

929 "https://upload.wikimedia.org/wikipedia/" 

930 "commons/transcoded/" 

931 "{}/{}/{}/{}.mp3".format( 

932 digest[:1], digest[:2], qfn, qfn 

933 ) 

934 ) 

935 audio["ogg_url"] = ogg 

936 audio["mp3_url"] = mp3 

937 set_sound_pos(audio, line_pos or active_pos) 

938 if audio not in data.get("sounds", ()): 

939 data_append(data, "sounds", audio) 

940 

941 # if audios: 

942 # have_pronunciations = True 

943 audios = [] 

944 

945 data_extend(data, "hyphenations", hyphenations) 

946 hyphenations = [] 

947 

948 ## I have commented out the otherwise unused have_pronunciation 

949 ## toggles; uncomment them to use this debug print 

950 # if not have_pronunciations and not have_panel_templates: 

951 # wxr.wtp.debug("no pronunciations found from pronunciation section", 

952 # sortid="pronunciations/533") 

953 

954 

def extract_th_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    """Extract sound entries from the table produced by ``{{th-pron}}``.

    The template is expanded and every ``<table>`` of the result is walked
    row by row.  ``<th>`` cells either select what the row's ``<td>`` cells
    contain (IPA, homophones, audio) or contribute raw tags to the sounds
    of the rows they span; ``<td>`` cells yield the sounds themselves.  The
    collected sounds are appended to ``word_entry["sounds"]``.

    Args:
        wxr: extraction context used to expand and clean wikitext.
        word_entry: word data that receives the extracted sounds.
        t_node: the ``th-pron`` template node.
    """
    # https://en.wiktionary.org/wiki/Template:th-pron
    @dataclass
    class TableHeader:
        # Raw tags parsed from the header cell text.
        raw_tags: list[str]
        # Number of rows (including the current one) the header still covers.
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Keep only the headers that still span this row, consuming one
            # row of their remaining rowspan.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(standard) IPA"):
                    field = "ipa"
                elif header_str.startswith("Homophones"):
                    field = "homophone"
                elif header_str == "Audio":
                    field = "audio"
                elif header_str != "":
                    # Any other header is a tag header; a non-numeric
                    # rowspan attribute falls back to a single row.
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = create_audio_url_dict(filename)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append({"homophone": word})
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                # "[tag, tag]" annotates the spans that
                                # follow in this cell.
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                # Any other <small> text is stored as the
                                # romanization of the previous sound.
                                sounds[-1]["roman"] = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = {}
                                for raw_tag in raw_tags:
                                    if raw_tag in valid_tags:
                                        data_append(sound, "tags", raw_tag)
                                    else:
                                        data_append(sound, "raw_tags", raw_tag)
                                for header in row_headers:
                                    for raw_tag in header.raw_tags:
                                        if raw_tag.lower() in valid_tags:
                                            data_append(
                                                sound, "tags", raw_tag.lower()
                                            )
                                        else:
                                            data_append(
                                                sound, "raw_tags", raw_tag
                                            )
                                # Choose the key per sound instead of
                                # overwriting the row-level ``field``, so a
                                # romanization span does not change how the
                                # remaining spans/cells of the row are
                                # stored.
                                sound_field = field
                                if "romanization" in sound.get("tags", []):
                                    sound_field = "roman"
                                sound[sound_field] = node_str
                                sounds.append(sound)

    # NOTE(review): presumably called with word_entry so that data produced
    # by the expansion (e.g. categories) is recorded on it - confirm against
    # clean_node.
    clean_node(wxr, word_entry, expanded_node)
    data_extend(word_entry, "sounds", sounds)

1059 

1060 

def extract_zh_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    """Extract sounds from a ``{{zh-pron}}`` template into *word_entry*.

    The template is expanded, every list found in the expansion is walked
    item by item with ``extract_zh_pron_list_item`` and the resulting sounds
    are appended to ``word_entry["sounds"]``.
    """
    # https://en.wiktionary.org/wiki/Template:zh-pron
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(template_wikitext, expand_all=True)
    # Nested lists are registered here by extract_zh_pron_list_item while it
    # recurses, so they are not processed a second time as top-level lists.
    visited_lists = set()
    collected = []
    for pron_list in root.find_child_recursively(NodeKind.LIST):
        if pron_list in visited_lists:
            continue
        for item in pron_list.find_child(NodeKind.LIST_ITEM):
            collected += extract_zh_pron_list_item(
                wxr, item, [], visited_lists
            )
    # NOTE(review): presumably called with word_entry so that data produced
    # by the expansion (e.g. categories) is recorded on it - confirm against
    # clean_node.
    clean_node(wxr, word_entry, root)
    data_extend(word_entry, "sounds", collected)

1078 

1079 

def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[SoundData]:
    """Extract sounds from one list item of an expanded zh-pron template.

    Args:
        wxr: extraction context used to clean nodes.
        list_item: the list item whose children are scanned.
        raw_tags: raw tags inherited from the enclosing list items; the
            list is copied, not modified.
        seen_lists: set that receives every nested list handled here.

    Returns:
        The sounds found in this item and in its nested lists.
    """
    tags_so_far = raw_tags[:]
    results = []
    awaiting_first_small = True
    for child in list_item.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            target = clean_node(wxr, None, child.largs)
            label = clean_node(wxr, None, child)
            if target.startswith("File:"):
                # File links become audio sounds carrying the current tags.
                audio = create_audio_url_dict(target.removeprefix("File:"))
                audio["raw_tags"] = tags_so_far[:]
                translate_zh_pron_raw_tags(audio)
                results.append(audio)
            elif label != "":
                # Any other link contributes its text as a raw tag.
                tags_so_far.append(label)
        elif isinstance(child, HTMLNode):
            tag_name = child.tag
            if tag_name == "small":
                if awaiting_first_small:
                    # The first <small> adds tags for the whole item; <sup>
                    # children are left out of its text.
                    without_sup = [
                        part
                        for part in child.children
                        if not (isinstance(part, HTMLNode) and part.tag == "sup")
                    ]
                    tags_so_far.extend(
                        split_zh_pron_raw_tag(
                            clean_node(wxr, None, without_sup)
                        )
                    )
                elif results:
                    # Later <small> elements qualify the most recent sound.
                    extra_tags = split_zh_pron_raw_tag(
                        clean_node(wxr, None, child)
                    )
                    data_extend(results[-1], "raw_tags", extra_tags)
                    translate_zh_pron_raw_tags(results[-1])
                awaiting_first_small = False
            elif tag_name == "span":
                results += extract_zh_pron_span(wxr, child, tags_so_far)
            elif (
                tag_name == "table"
                and tags_so_far
                and tags_so_far[-1] == "Homophones"
            ):
                results += extract_zh_pron_homophone_table(
                    wxr, child, tags_so_far
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            seen_lists.add(child)
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                results += extract_zh_pron_list_item(
                    wxr, nested_item, tags_so_far, seen_lists
                )

    return results

1141 

1142 

def extract_zh_pron_homophone_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    """Collect homophone sounds from a zh-pron homophone table.

    A ``<span>`` found in a ``<td>`` cell yields a ``{"homophone": ...}``
    sound when its text is neither empty nor "/", its ``lang`` attribute is
    non-empty and its ``class`` is "Hant", "Hans" or "Hani".  Each sound
    gets a copy of *raw_tags*, which is then run through
    ``translate_zh_pron_raw_tags``.
    """
    # Accepted script classes and the tag each one adds ("Hani" adds none).
    script_tags = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
        "Hani": None,
    }
    homophones = []
    for cell in table.find_html_recursively("td"):
        for span in cell.find_html("span"):
            css_class = span.attrs.get("class", "")
            lang = span.attrs.get("lang", "")
            text = clean_node(wxr, None, span)
            if text in ["", "/"] or lang == "" or css_class not in script_tags:
                continue
            entry = {"homophone": text, "raw_tags": raw_tags[:]}
            script_tag = script_tags[css_class]
            if script_tag is not None:
                data_append(entry, "tags", script_tag)
            translate_zh_pron_raw_tags(entry)
            homophones.append(entry)

    return homophones

1166 

1167 

def translate_zh_pron_raw_tags(sound: SoundData):
    """Move the translatable ``raw_tags`` of *sound* into its ``tags``.

    Each raw tag is looked up in ``ZH_PRON_TAGS`` (whose values are either
    one tag string or a list of tags) and then in ``valid_tags``; matches
    are added to ``sound["tags"]`` without duplicating tags already there.
    Raw tags that match neither are kept, de-duplicated, in
    ``sound["raw_tags"]``; the key is removed when nothing remains.

    The sound is modified in place and nothing is returned.
    """
    from .zh_pron_tags import ZH_PRON_TAGS

    untranslated = []
    for raw_tag in sound.get("raw_tags", []):
        if raw_tag in ZH_PRON_TAGS:
            tr_tag = ZH_PRON_TAGS[raw_tag]
            if isinstance(tr_tag, str):
                new_tags = [tr_tag]
            elif isinstance(tr_tag, list):
                new_tags = tr_tag
            else:
                new_tags = []
            # Check every tag individually: testing whether the whole list
            # is an element of the existing tags can never find a duplicate.
            for tag in new_tags:
                if tag not in sound.get("tags", []):
                    data_append(sound, "tags", tag)
        elif raw_tag in valid_tags:
            if raw_tag not in sound.get("tags", []):
                data_append(sound, "tags", raw_tag)
        elif raw_tag not in untranslated:
            untranslated.append(raw_tag)

    if len(untranslated) > 0:
        sound["raw_tags"] = untranslated
    elif "raw_tags" in sound:
        del sound["raw_tags"]

1191 

1192 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron tag string into individual raw tags.

    Text without parentheses is split at ",", ":", ";" and " and "; each
    piece is stripped, loses a leading "incl. " and is dropped when empty.
    When parentheses are present, the innermost parenthesized groups are
    split recursively and their tags are placed after the tags obtained from
    the text that remains once those groups are removed.  Text that contains
    "(" but no matching group is returned unchanged as a single tag.
    """
    if "(" not in raw_tag_text:
        pieces = (
            piece.strip().removeprefix("incl. ").strip()
            for piece in re.split(r",|:|;| and ", raw_tag_text)
        )
        return [piece for piece in pieces if piece != ""]

    group_pattern = r"\([^()]+\)"
    inner_tags = []
    for group in re.finditer(group_pattern, raw_tag_text):
        # Drop the surrounding parentheses before splitting the content.
        inner_tags.extend(split_zh_pron_raw_tag(group.group(0)[1:-1]))
    outer_text = re.sub(group_pattern, "", raw_tag_text)
    if outer_text == raw_tag_text:
        # Nothing was removed (e.g. unbalanced or empty parentheses); keep
        # the text as is instead of recursing forever.
        return inner_tags + [outer_text]
    return split_zh_pron_raw_tag(outer_text) + inner_tags

1221 

1222 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    """Extract sounds from one pronunciation ``<span>`` of zh-pron output.

    Children of the span are sorted into: a ``<small>`` element (extra raw
    tags for the last sound), a nested ``<span>`` whose ``lang`` contains
    "-Latn" (romanization), the text after a "[Phonetic:" marker (a separate
    "Phonetic" sound) and everything else (the pronunciation text, which is
    split with ``split_zh_pron``).  Sounds use the "ipa" key when the span's
    ``class`` contains "IPA" and "zh_pron" otherwise.

    Returns:
        The sounds found, after ``translate_zh_pron_raw_tags`` has been
        applied to each of them.
    """
    extra_tags = []
    text_nodes = []
    romanization = ""
    phonetic_text = ""
    for position, child in enumerate(span_tag.children):
        child_is_html = isinstance(child, HTMLNode)
        if child_is_html and child.tag == "small":
            extra_tags = split_zh_pron_raw_tag(clean_node(wxr, None, child))
        elif (
            child_is_html
            and child.tag == "span"
            and "-Latn" in child.attrs.get("lang", "")
        ):
            romanization = clean_node(wxr, None, child).strip("() ")
        elif isinstance(child, str) and child.strip() == "[Phonetic:":
            # Everything after the marker is the phonetic pronunciation.
            remainder = span_tag.children[position + 1 :]
            phonetic_text = clean_node(wxr, None, remainder).strip("] ")
            break
        else:
            text_nodes.append(child)

    pron_key = "ipa" if "IPA" in span_tag.attrs.get("class", "") else "zh_pron"
    results = []
    for piece in split_zh_pron(clean_node(wxr, None, text_nodes)):
        piece = piece.strip("[]: ")
        if not piece:
            continue
        entry = {pron_key: piece, "raw_tags": raw_tags[:]}
        if romanization != "":
            entry["roman"] = romanization
        results.append(entry)

    if results:
        # Tags from <small> only apply to the last pronunciation.
        data_extend(results[-1], "raw_tags", extra_tags)
    if phonetic_text != "":
        phonetic_entry = {
            "zh_pron": phonetic_text,
            "raw_tags": raw_tags[:] + ["Phonetic"],
        }
        if romanization != "":
            phonetic_entry["roman"] = romanization
        results.append(phonetic_entry)
    for entry in results:
        translate_zh_pron_raw_tags(entry)
    return results

1270 

1271 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a zh-pron pronunciation string into separate pronunciations.

    The text is split at ",", ";" and "→" that are outside parentheses, and
    also at "/" unless the whole text starts with "/" (so a transcription
    written between slashes is kept in one piece).  Every returned piece is
    stripped of surrounding whitespace and empty pieces are dropped, so a
    separator never ends up inside a neighbouring piece.

    Args:
        zh_pron: the cleaned pronunciation text.

    Returns:
        The non-empty pronunciations in their original order.
    """
    separators = {",", ";", "→"}
    if not zh_pron.startswith("/"):
        separators.add("/")

    depth = 0  # current parenthesis nesting level
    pron_list = []
    chars = []
    for c in zh_pron:
        if depth == 0 and c in separators:
            pron = "".join(chars).strip()
            if pron:
                pron_list.append(pron)
            # Always start a fresh piece so that repeated separators are
            # discarded instead of being glued to the next pronunciation.
            chars = []
            continue
        if c == "(":
            depth += 1
        elif c == ")":
            depth -= 1
        chars.append(c)

    pron = "".join(chars).strip()
    if pron:
        pron_list.append(pron)
    return pron_list