Coverage for src/wiktextract/extractor/en/pronunciation.py: 80%

668 statements  

coverage.py v7.13.5, created at 2026-05-04 10:35 +0000

1import hashlib 

2import re 

3import urllib 

4from copy import deepcopy 

5from dataclasses import dataclass 

6from typing import Iterator, Literal 

7 

8from wikitextprocessor import ( 

9 HTMLNode, 

10 LevelNode, 

11 NodeKind, 

12 TemplateNode, 

13 WikiNode, 

14) 

15 

16from ...clean import clean_value 

17from ...datautils import data_append, data_extend, split_at_comma_semi 

18from ...page import LEVEL_KINDS, clean_node, is_panel_template 

19from ...tags import valid_tags 

20from ...wxr_context import WiktextractContext 

21from ..share import create_audio_url_dict 

22from .form_descriptions import ( 

23 classify_desc, 

24 decode_tags, 

25 parse_pronunciation_tags, 

26) 

27from .parts_of_speech import part_of_speech_map 

28from .type_utils import Hyphenation, SoundData, TemplateArgs, WordData 

29 

30# Prefixes, tags, and regexp for finding romanizations from the pronunciation 

31# section 

32pron_romanizations = { 

33 " Revised Romanization ": "romanization revised", 

34 " Revised Romanization (translit.) ": "romanization revised transliteration", 

35 " McCune-Reischauer ": "McCune-Reischauer romanization", 

36 " McCune–Reischauer ": "McCune-Reischauer romanization", 

37 " Yale Romanization ": "Yale romanization", 

38} 

39pron_romanization_re = re.compile( 

40 "(?m)^(" 

41 + "|".join( 

42 re.escape(x) 

43 for x in sorted(pron_romanizations.keys(), key=len, reverse=True) 

44 ) 

45 + ")([^\n]+)" 

46) 
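# Hedged sketch of what this regex is meant to catch (example line invented,
# not copied from any template output): a cleaned {{ko-IPA}} table can leave
# lines such as " Revised Romanization gugeo"; group(1) would then be the
# label " Revised Romanization " and group(2) the romanized form, which the
# caller maps to tags via pron_romanizations.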

47 

48IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$" 

49IPA_EXTRACT_RE = re.compile(IPA_EXTRACT) 
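# Illustrative note (example line invented): a cleaned line such as
#   "(UK) IPA⁽ᵏᵉʸ⁾: /fɒks/ (rare)"
# should match with group(2) == "UK" (leading qualifier), group(4) the whole
# body "/fɒks/ (rare)", group(5) the body without the trailing qualifier,
# and group(7) == "rare"; extract_pron_template() below picks among these
# groups depending on whether the template carried a "qq" argument.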

50 

51 

52def extract_pron_template( 

53 wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str 

54) -> tuple[SoundData, list[SoundData]] | None: 

55 """In post_template_fn, this is used to handle all enPR and IPA templates 

56 so that we can leave breadcrumbs in the text that can later be handled 

57 there. We return a `base_data` so that if there are two 

58 or more templates on the same line, like this: 

59 (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/ 

60 then we can apply base_data fields to other templates, too, if needed. 

61 """ 
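# Hedged restatement of the return shape: the docstring's example line is
# expected to yield (base_data carrying the line-wide tags,
# [one SoundData per comma-separated pronunciation]), each SoundData
# holding either an "enpr" or an "ipa" key plus its own qualifiers.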

62 cleaned = clean_value(wxr, expanded) 

63 # print(f"extract_pron_template input: {tname=} {expanded=}-> {cleaned=}") 

64 m = IPA_EXTRACT_RE.match(cleaned) 

65 if not m: 

66 wxr.wtp.error( 

67 f"Text cannot match IPA_EXTRACT_RE regex: " 

68 f"{cleaned=}, {tname=}, {targs=}", 

69 sortid="en/pronunciation/54", 

70 ) 

71 return None 

72 # for i, group in enumerate(m.groups()): 

73 # print(i + 1, repr(group)) 

74 main_qual = m.group(2) or "" 

75 if "qq" in targs: 

76 # The template may have been given a qualifier ("qq") that applies to 

77 # every entry, but which also happens to appear at the end of the line, 

78 # where it can be confused with the post-qualifier of a single 

79 # entry in the style of "... /ipa3/ (foo) (bar)": foo might 

80 # not be present, so the bar looks like it might only 

81 # apply to `/ipa3/` 

82 pron_body = m.group(5) 

83 post_qual = m.group(7) 

84 else: 

85 pron_body = m.group(4) 

86 post_qual = "" 

87 

88 if not pron_body: 

89 wxr.wtp.error( 

90 f"Regex failed to find 'body' from {cleaned=}", 

91 sortid="en/pronunciation/81", 

92 ) 

93 return None 

94 

95 base_data: SoundData = {} 

96 if main_qual: 

97 parse_pronunciation_tags(wxr, main_qual, base_data) 

98 if post_qual: 

99 parse_pronunciation_tags(wxr, post_qual, base_data) 

100 # This base_data is used as the base copy for all entries from this 

101 # template, but it is also returned so that its contents may be applied 

102 # to other templates on the same line. 

103 # print(f"{base_data=}") 

104 

105 sound_datas: list[SoundData] = [] 

106 

107 parts: list[list[str]] = [[]] 

108 inside = 0 

109 current: list[str] = [] 

110 for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)): 

111 # Split the line on commas and semicolons outside of parens. This 

112 # gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)" 

113 # print(f" {i=}, {p=}") 

114 comp = p.strip() 

115 if not p: 

116 continue 

117 if comp == "(": 

118 if not inside and i > 0: 

119 if stripped := "".join(current).strip(): 

120 parts[-1].append("".join(current).strip()) # type:ignore[arg-type] 

121 current = [p] 

122 inside += 1 

123 continue 

124 if comp == ")": 

125 inside -= 1 

126 if not inside: 

127 if stripped := "".join(current).strip(): 

128 current.append(p) 

129 parts[-1].append("".join(current).strip()) # type:ignore[arg-type] 

130 current = [] 

131 continue 

132 if not inside and comp in (",", ";"): 

133 if stripped := "".join(current).strip(): 

134 parts[-1].append(stripped) # type:ignore[arg-type] 

135 current = [] 

136 parts.append([]) 

137 continue 

138 current.append(p) 

139 if current: 

140 parts[-1].append("".join(current).strip()) 
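# Hypothetical walk-through (input invented): for
# pron_body == "(UK) /fɒks/, /fɑks/ (US)" the loop above should leave
# parts == [['(UK)', '/fɒks/'], ['/fɑks/', '(US)']]; the loop below then
# normalizes each entry into [main-qualifier, body, post-qualifier], i.e.
# [['UK', '/fɒks/', ''], ['', '/fɑks/', 'US']].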

141 

142 # print(f">>>>>> {parts=}") 

143 new_parts: list[list[str]] = [] 

144 for entry in parts: 

145 if not entry: 

146 continue 

147 new_entry: list[str] = [] 

148 i1: int = entry[0].startswith("(") and entry[0].endswith(")") 

149 if i1: 

150 new_entry.append(entry[0][1:-1].strip()) 

151 else: 

152 new_entry.append("") 

153 i2: int = ( 

154 entry[-1].startswith("(") 

155 and entry[-1].endswith(")") 

156 and len(entry) > 1 

157 ) 

158 if i2 == 0: 

159 i2 = len(entry) 

160 else: 

161 i2 = -1 

162 new_entry.append("".join(entry[i1:i2]).strip()) 

163 if not new_entry[-1]: 

164 wxr.wtp.error( 

165 f"Missing IPA/enPR sound data between qualifiers? {entry=}", 

166 sortid="en/pronunciation/153", 

167 ) 

168 if i2 == -1: 

169 new_entry.append(entry[-1][1:-1].strip()) 

170 else: 

171 new_entry.append("") 

172 new_parts.append(new_entry) 

173 

174 # print(f">>>>> {new_parts=}") 

175 

176 for part in new_parts: 

177 sd = deepcopy(base_data) 

178 if part[0]: 

179 parse_pronunciation_tags(wxr, part[0], sd) 

180 if part[2]: 

181 parse_pronunciation_tags(wxr, part[2], sd) 

182 if tname == "enPR": 

183 sd["enpr"] = part[1] 

184 else: 

185 sd["ipa"] = part[1] 

186 sound_datas.append(sd) 

187 

188 # print(f"BASE_DATA: {base_data}") 

189 # print(f"SOUND_DATAS: {sound_datas=}") 

190 

191 return base_data, sound_datas 

192 

193 

194def parse_pronunciation( 

195 wxr: WiktextractContext, 

196 level_node: LevelNode, 

197 data: WordData, 

198 etym_data: WordData, 

199 have_etym: bool, 

200 base_data: WordData, 

201 lang_code: str, 

202) -> None: 

203 """Parses the pronunciation section from a language section on a 

204 page.""" 

205 if level_node.kind in LEVEL_KINDS: 

206 contents: list[str | WikiNode | TemplateNode] = [] 

207 for node in level_node.children: 

208 if isinstance(node, TemplateNode): 

209 if node.template_name == "th-pron": 

210 extract_th_pron_template(wxr, data, node) 

211 elif node.template_name == "zh-pron": 

212 extract_zh_pron_template(wxr, data, node) 

213 else: 

214 contents.append(node) 

215 else: 

216 contents.append(node) 

217 else: 

218 contents = [level_node] 

219 # Remove subsections, such as Usage notes. They may contain IPAchar 

220 # templates in running text, and we do not want to extract IPAs from 

221 # those. 

222 # Filter out LEVEL_KINDS nodes; the 'or' is doing the heavy lifting here: 

223 # let through non-WikiNodes, then let through WikiNodes that 

224 # are not LEVEL_KINDS. 

225 contents = [ 

226 x 

227 for x in contents 

228 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

229 ] 

230 if not any( 

231 isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents 

232 ): 

233 # expand all templates 

234 new_contents: list[str | WikiNode | TemplateNode] = [] 

235 for lst in contents: 

236 if isinstance(lst, TemplateNode): 

237 temp = wxr.wtp.node_to_wikitext(lst) 

238 temp = wxr.wtp.expand(temp) 

239 temp_parsed = wxr.wtp.parse(temp) 

240 new_contents.extend(temp_parsed.children) 

241 else: 

242 new_contents.append(lst) 

243 contents = new_contents 

244 

245 if have_etym and data is base_data: 

246 data = etym_data 

247 pron_templates: list[tuple[SoundData, list[SoundData]]] = [] 

248 hyphenations: list[Hyphenation] = [] 

249 audios: list[SoundData] = [] 

250 have_panel_templates = False 

251 

252 def parse_pronunciation_template_fn( 

253 name: str, ht: TemplateArgs 

254 ) -> str | None: 

255 """Handle pronunciation and hyphenation templates""" 

256 # _template_fn handles templates *before* they are expanded; 

257 # this allows for special handling before all the work needed 

258 # for expansion is done. 

259 nonlocal have_panel_templates 

260 if is_panel_template(wxr, name): 

261 have_panel_templates = True 

262 return "" 

263 if name == "audio": 

264 filename = ht.get(2) or "" 

265 audio: SoundData = {"audio": filename.strip()} 

266 dialect = ht.get("a", "") 

267 if "aa" in ht: 

268 dialect += ", " + ht.get("aa", "") 

269 if dialect: 

270 dialect = dialect.replace("<", "").replace(">", "") 

271 dialect = clean_node(wxr, None, [dialect]) 

272 for part in split_at_comma_semi(dialect): 

273 if "(" not in part: 

274 parse_pronunciation_tags(wxr, part, audio) 

275 else: 

276 for ppart in re.split(r"[][()]", part): 

277 parse_pronunciation_tags(wxr, ppart, audio) 

278 desc = ht.get(3) or "" 

279 desc = clean_node(wxr, None, [desc]) 

280 if desc: 

281 audio["text"] = desc 

282 m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc) 

283 skip = False 

284 if m: 

285 par = m.group(1) 

286 cls = classify_desc(par) 

287 if cls == "tags": 

288 parse_pronunciation_tags(wxr, par, audio) 

289 else: 

290 skip = True 

291 if skip: 

292 return "" 

293 audios.append(audio) 

294 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

295 if name == "audio-IPA": 

296 filename = ht.get(2) or "" 

297 ipa = ht.get(3) or "" 

298 dial = ht.get("dial") 

299 audio = {"audio": filename.strip()} 

300 if dial: 

301 dial = clean_node(wxr, None, [dial]) 

302 audio["text"] = dial 

303 if ipa: 

304 audio["audio-ipa"] = ipa 

305 audios.append(audio) 

306 # The problem with these IPAs is that they often just describe 

307 # what's in the sound file, rather than giving the pronunciation 

308 # of the word alone. It is common for audio files to contain 

309 # multiple pronunciations or articles in the same file, and then 

310 # this IPA often describes what is in the file. 

311 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

312 if name == "audio-pron": 

313 filename = ht.get(2) or "" 

314 ipa = ht.get("ipa") or "" 

315 dial = ht.get("dial") 

316 country = ht.get("country") 

317 audio = {"audio": filename.strip()} 

318 if dial: 

319 dial = clean_node(wxr, None, [dial]) 

320 audio["text"] = dial 

321 parse_pronunciation_tags(wxr, dial, audio) 

322 if country: 

323 parse_pronunciation_tags(wxr, country, audio) 

324 if ipa: 

325 audio["audio-ipa"] = ipa 

326 audios.append(audio) 

327 # XXX do we really want to extract pronunciations from these? 

328 # Or are they spurious / just describing what is in the 

329 # audio file? 

330 # if ipa: 

331 # pron = {"ipa": ipa} 

332 # if dial: 

333 # parse_pronunciation_tags(wxr, dial, pron) 

334 # if country: 

335 # parse_pronunciation_tags(wxr, country, pron) 

336 # data_append(data, "sounds", pron) 

337 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

338 if name in ("hyph", "hyphenation"): 

339 # {{hyph|en|re|late|caption="Hyphenation UK:"}} 

340 # {{hyphenation|it|quiè|to||qui|è|to||quié|to||qui|é|to}} 

341 # and also nocaption=1 

342 caption = clean_node(wxr, None, ht.get("caption", "")) 

343 tagsets, _ = decode_tags(caption) 

344 # flatten the tagsets into one; it would be really weird to have 

345 # several tagsets for a hyphenation caption 

346 tags = sorted(set(tag for tagset in tagsets for tag in tagset)) 

347 # We'll just ignore any errors from tags, it's not very important 

348 # for hyphenation 

349 tags = [tag for tag in tags if not tag.startswith("error")] 

350 hyph_sequences: list[list[str]] = [[]] 

351 for text in [ 

352 t for (k, t) in ht.items() if (isinstance(k, int) and k >= 2) 

353 ]: 

354 if not text: 

355 hyph_sequences.append([]) 

356 else: 

357 hyph_sequences[-1].append(clean_node(wxr, None, text)) 

358 for seq in hyph_sequences: 

359 hyphenations.append(Hyphenation(parts=seq, tags=tags)) 

360 return "" 
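# Hedged example (template arguments invented): in
# {{hyphenation|it|quiè|to||qui|è|to}} the empty positional argument acts as
# a separator, so the loop above should produce two Hyphenation entries,
# parts=["quiè", "to"] and parts=["qui", "è", "to"], both with the same
# caption-derived tags.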

361 return None 

362 

363 may_be_duplicates = False 

364 

365 def parse_pron_post_template_fn( 

366 name: str, ht: TemplateArgs, text: str 

367 ) -> str | None: 

368 # _post_template_fn handles templates *after* the work to expand 

369 # them has been done; this is exactly the same as _template_fn, 

370 # except with the additional expanded text as an input, and 

371 # possible side-effects from the expansion and recursion (like 

372 # calling other subtemplates that are handled in _template_fn). 

373 nonlocal may_be_duplicates 

374 if is_panel_template(wxr, name): 

375 return "" 

376 if name in { 

377 "q", 

378 "qualifier", 

379 "sense", 

380 "a", 

381 "accent", 

382 "l", 

383 "link", 

384 "lb", 

385 "lbl", 

386 "label", 

387 }: 

388 # Kludge: when these templates expand to /.../ or [...], 

389 # replace the expansion by something safe. This is used 

390 # to filter spurious IPA-looking expansions that aren't really 

391 # IPAs. We probably don't care about these templates in the 

392 # contexts where they expand to something containing these. 

393 v = re.sub(r'href="[^"]*"', "", text) # Ignore URLs 

394 v = re.sub(r'src="[^"]*"', "", v) 

395 if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v): 

396 # Note: replacing by empty results in Lua errors that we 

397 # would rather not have. For example, voi/Middle Vietnamese 

398 # uses {{a|{{l|vi|...}}}}, and the {{a|...}} will fail 

399 # if {{l|...}} returns empty. 

400 return "stripped-by-parse_pron_post_template_fn" 

401 if name in ("IPA", "enPR"): 

402 # Extract the data from IPA and enPR templates (same underlying 

403 # template) and replace them in-text with magical cookie that 

404 # can be later used to refer to the data's index inside 

405 # pron_templates. 

406 if pron_t := extract_pron_template(wxr, name, ht, text): 

407 pron_templates.append(pron_t) 

408 return f"__PRON_TEMPLATE_{len(pron_templates) - 1}__" 

409 # Catch templates that generate duplicate sound data entries 

410 # here; if the text produces a big, toggleable section, the 

411 # "header" for that section might be duplicated. Add more conditions 

412 # if necessary. 

413 if text.startswith("<") and "vsToggleElement" in text: 

414 may_be_duplicates = True 

415 return text 
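# Sketch of the cookie round-trip (indices illustrative): if a line expands
# to "* __PRON_TEMPLATE_0__, __PRON_TEMPLATE_1__", the re.split() further
# down recovers the indices 0 and 1 and looks the already-parsed data up in
# pron_templates instead of re-parsing the cleaned text.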

416 

417 def flattened_tree(lines: list[WikiNode | str]) -> Iterator[WikiNode | str]: 

418 assert isinstance(lines, list) 

419 for line in lines: 

420 yield from flattened_tree1(line) 

421 

422 def flattened_tree1(node: WikiNode | str) -> Iterator[WikiNode | str]: 

423 assert isinstance(node, (WikiNode, str)) 

424 if isinstance(node, str): 

425 yield node 

426 return 

427 elif node.kind == NodeKind.LIST: 

428 for item in node.children: 

429 yield from flattened_tree1(item) 

430 elif node.kind == NodeKind.LIST_ITEM: 

431 new_children = [] 

432 sublist = None 

433 for child in node.children: 

434 if isinstance(child, WikiNode) and child.kind == NodeKind.LIST: 

435 sublist = child 

436 else: 

437 new_children.append(child) 

438 node.children = new_children 

439 node.sarg = "*" 

440 yield node 

441 if sublist: 

442 yield from flattened_tree1(sublist) 

443 else: 

444 yield node 

445 

446 # XXX Do not use flattened_tree more than once here, for example for 

447 # debug printing... The underlying data is changed, and the separated 

448 # sublists disappear. 

449 

450 # Kludge for templates that generate several lines, but haven't 

451 # been caught by earlier kludges... 

452 def split_cleaned_node_on_newlines( 

453 contents: list[WikiNode | str], 

454 ) -> Iterator[str]: 

455 for litem in flattened_tree(contents): 

456 ipa_text = clean_node( 

457 wxr, 

458 data, 

459 litem, 

460 template_fn=parse_pronunciation_template_fn, 

461 post_template_fn=parse_pron_post_template_fn, 

462 ) 

463 for line in ipa_text.splitlines(): 

464 yield line 

465 

466 # have_pronunciations = False 

467 active_pos: str | None = None 

468 

469 for line in split_cleaned_node_on_newlines(contents): 

470 prefix: str | None = None 

471 earlier_base_data: SoundData | None = None 

472 if not line: 

473 continue 

474 

475 split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line) 

476 for i, text in enumerate(split_templates): 

477 if not text: 

478 continue 

479 # clean up stars (*) at the start of the line 

480 text = re.sub(r"^\**\s*", "", text).strip() 

481 if i == 0: 

482 # At the start of a line, check for stuff like "Noun:" 

483 # for active_pos; active_pos is a temporary data field 

484 # given to each saved SoundData entry which is later 

485 # used to sort the entries into their respective PoSes. 

486 m = re.match(r"\s*(\w+\s?\w*)\s*:?\s*", text) 

487 if m: 

488 if (m_lower := m.group(1).lower()) in part_of_speech_map: 

489 active_pos = part_of_speech_map[m_lower]["pos"] 

490 text = text[m.end() :].strip() 

491 if not text: 

492 continue 
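# Hedged example: a line starting with "Noun:" (assuming "noun" is a key in
# part_of_speech_map) switches active_pos, and every SoundData saved from
# here on is stamped with pron["pos"] so it can later be sorted into the
# matching part-of-speech entry.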

493 if i % 2 == 1: 

494 # re.split (with capture groups) splits the line so that the 

495 # captured splitters end up at odd indices; even entries are either 

496 # empty strings or the text around the splitters. 

497 base_pron_data, first_prons = pron_templates[int(text)] 

498 if base_pron_data: 

499 earlier_base_data = base_pron_data 

500 # print(f"Set {earlier_base_data=}") 

501 elif earlier_base_data is not None: 

502 # merge data from an earlier iteration of this loop 

503 for pr in first_prons: 

504 if "note" in pr and "note" in earlier_base_data: 

505 pr["note"] += ";" + earlier_base_data.get( 

506 "note", "" 

507 ) 

508 elif "note" in earlier_base_data: 

509 pr["note"] = earlier_base_data["note"] 

510 if "topics" in earlier_base_data: 

511 data_extend( 

512 pr, "topics", earlier_base_data["topics"] 

513 ) 

514 if "tags" in pr and "tags" in earlier_base_data: 

515 pr["tags"].extend(earlier_base_data["tags"]) 

516 elif "tags" in earlier_base_data: 

517 pr["tags"] = sorted(set(earlier_base_data["tags"])) 

518 for pr in first_prons: 

519 if active_pos: 

520 pr["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

521 if pr not in data.get("sounds", ()): 

522 data_append(data, "sounds", pr) 

523 # This part of the line has been handled 

524 continue 

525 

526 if "IPA" in text: 

527 field: Literal[ 

528 "audio", 

529 "audio-ipa", 

530 "enpr", 

531 "form", 

532 "hangeul", 

533 "homophone", 

534 "ipa", 

535 "mp3_url", 

536 "note", 

537 "ogg_url", 

538 "other", 

539 "rhymes", 

540 "tags", 

541 "text", 

542 "topics", 

543 "zh-pron", 

544 ] = "ipa" 

545 else: 

546 # This is used for Rhymes, Homophones, etc 

547 field = "other" 

548 

549 # Check if it contains Japanese "Tokyo" pronunciation with 

550 # special syntax 

551 pron: SoundData 

552 m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text) 

553 if m: 

554 pron = {field: m.group(1)} # type: ignore[misc] 

555 if active_pos: 

556 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

557 data_append(data, "sounds", pron) 

558 # have_pronunciations = True 

559 continue 

560 

561 # Check if it contains Rhymes 

562 m = re.match(r"\s*Rhymes?: (.*)", text) 

563 if m: 

564 for ending in split_at_comma_semi(m.group(1)): 

565 ending = ending.strip() 

566 if ending: 

567 pron = {"rhymes": ending} 

568 if active_pos: 

569 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

570 data_append(data, "sounds", pron) 

571 # have_pronunciations = True 

572 continue 

573 

574 # Check if it contains homophones 

575 m = re.search(r"(?m)\bHomophones?: (.*)", text) 

576 if m: 

577 for w in split_at_comma_semi(m.group(1)): 

578 w = w.strip() 

579 if w: 

580 pron = {"homophone": w} 

581 if active_pos: 

582 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

583 data_append(data, "sounds", pron) 

584 # have_pronunciations = True 

585 continue 

586 

587 # Check if it contains Phonetic hangeul 

588 m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text) 

589 if m: 

590 seen = set() 

591 for w in m.group(1).split("/"): 

592 w = w.strip() 

593 if w and w not in seen: 

594 seen.add(w) 

595 pron = {"hangeul": w} 

596 if active_pos: 

597 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

598 data_append(data, "sounds", pron) 

599 # have_pronunciations = True 

600 

601 # This regex-based hyphenation detection is left as a backup 

602 m = re.search(r"\b(Syllabification|Hyphenation): *([^\n.]*)", text) 

603 if m: 

604 data_append(data, "hyphenation", m.group(2)) 

605 commaseparated = m.group(2).split(",") 

606 if len(commaseparated) > 1: 

607 for h in commaseparated: 

608 # That second character looks like a dash but it's 

609 # actually Unicode code point 8231 (U+2027), the hyphenation point 

610 # Add more delimiters here if needed. 

611 parts = re.split(r"-|‧", h.strip()) 

612 data_append( 

613 data, "hyphenations", Hyphenation(parts=parts) 

614 ) 

615 ... 

616 else: 

617 data_append( 

618 data, 

619 "hyphenations", 

620 Hyphenation(parts=m.group(2).split(sep="-")), 

621 ) 

622 # have_pronunciations = True 

623 

624 # See if it contains a word prefix restricting which forms the 

625 # pronunciation applies to (see amica/Latin) and/or parenthesized 

626 # tags. 

627 m = re.match( 

628 r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text 

629 ) 

630 if m: 

631 prefix = m.group(2) or "" 

632 tagstext = m.group(3) 

633 text = text[m.end() :] 

634 else: 

635 m = re.match(r"^[*#\s]*([-\w]+):\s+", text) 

636 if m: 

637 prefix = m.group(1) 

638 tagstext = "" 

639 text = text[m.end() :] 

640 else: 

641 # Spanish has tags before pronunciations, eg. aceite/Spanish 

642 m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text) 

643 if m: 

644 tagstext = m.group(1) 

645 text = m.group(2) 

646 else: 

647 # No prefix. In this case, we inherit prefix 

648 # from previous entry. This particularly 

649 # applies to nested Audio files. 

650 tagstext = "" 

651 if tagstext: 

652 earlier_base_data = {} 

653 parse_pronunciation_tags(wxr, tagstext, earlier_base_data) 

654 

655 # Find romanizations from the pronunciation section (routinely 

656 # produced for Korean by {{ko-IPA}}) 

657 for m in re.finditer(pron_romanization_re, text): 

658 prefix = m.group(1) 

659 w = m.group(2).strip() 

660 tag = pron_romanizations[prefix] 

661 form = {"form": w, "tags": tag.split()} 

662 data_append(data, "forms", form) 

663 

664 # Find IPA pronunciations 

665 for m in re.finditer( 

666 r"(?m)/[^][\n/,]+?/" r"|" r"\[[^]\n0-9,/][^],/]*?\]", text 

667 ): 

668 v = m.group(0) 

669 # The regexp above can match file links. Skip them. 

670 if v.startswith("[[File:"): 

671 continue 

672 if v == "/wiki.local/": 

673 continue 

674 if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text: 

675 m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text) 

676 assert m 

677 idx = int(m.group(1)) 

678 if idx >= len(audios): 

679 continue 

680 if not audios[idx].get("audio-ipa"): 

681 audios[idx]["audio-ipa"] = v 

682 if prefix: 

683 audios[idx]["form"] = prefix 

684 else: 

685 if earlier_base_data: 

686 pron = deepcopy(earlier_base_data) 

687 pron[field] = v 

688 else: 

689 pron = {field: v} # type: ignore[misc] 

690 if active_pos: 

691 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

692 if prefix: 

693 pron["form"] = prefix 

694 if active_pos: 

695 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

696 if may_be_duplicates is True: 

697 ok = True 

698 for comp_sound in data.get("sounds", []): 

699 # Python has dict comparison since 3.8 

700 if pron == comp_sound: 

701 ok = False 

702 break 

703 if ok: 

704 data_append(data, "sounds", pron) 

705 else: 

706 data_append(data, "sounds", pron) 

707 # have_pronunciations = True 

708 

709 # XXX what about {{hyphenation|...}}, {{hyph|...}} 

710 # and those used to be stored under "hyphenation" 

711 

712 # Add data that was collected in template_fn 

713 for audio in audios: 

714 if "audio" in audio: 

715 # Compute audio file URLs 

716 fn = audio["audio"] 

717 # Strip certain characters, e.g., left-to-right mark 

718 fn = re.sub(r"[\u200f\u200e]", "", fn) 

719 fn = fn.strip() 

720 fn = urllib.parse.unquote(fn) 

721 # First character is usually uppercased 

722 if re.match(r"^[a-z][a-z]+", fn): 

723 fn = fn[0].upper() + fn[1:] 

724 if fn in wxr.config.redirects: 

725 fn = wxr.config.redirects[fn] 

726 # File extension is lowercased 

727 # XXX some words seem to need this, some don't seem to 

728 # have this??? what is the exact rule? 

729 # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn) 

730 # Spaces are converted to underscores 

731 fn = re.sub(r"\s+", "_", fn) 

732 # Compute hash digest part 

733 h = hashlib.md5() 

734 hname = fn.encode("utf-8") 

735 h.update(hname) 

736 digest = h.hexdigest() 

737 # Quote filename for URL 

738 qfn = urllib.parse.quote(fn) 

739 # For safety when writing files 

740 qfn = qfn.replace("/", "__slash__") 
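# Orientation note (restating the scheme the code below follows): Wikimedia
# Commons shards files by the MD5 of the underscore-normalized name, so the
# URLs end up as .../commons/<d[0]>/<d[:2]>/<name> for native .ogg/.mp3
# uploads and .../commons/transcoded/<d[0]>/<d[:2]>/<name>/<name>.ogg|.mp3
# otherwise, where d is the hex digest computed above.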

741 if re.search(r"(?i)\.(ogg|oga)$", fn): 

742 ogg = ( 

743 "https://upload.wikimedia.org/wikipedia/" 

744 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

745 ) 

746 else: 

747 ogg = ( 

748 "https://upload.wikimedia.org/wikipedia/" 

749 "commons/transcoded/" 

750 "{}/{}/{}/{}.ogg".format( 

751 digest[:1], digest[:2], qfn, qfn 

752 ) 

753 ) 

754 if re.search(r"(?i)\.(mp3)$", fn): 

755 mp3 = ( 

756 "https://upload.wikimedia.org/wikipedia/" 

757 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

758 ) 

759 else: 

760 mp3 = ( 

761 "https://upload.wikimedia.org/wikipedia/" 

762 "commons/transcoded/" 

763 "{}/{}/{}/{}.mp3".format( 

764 digest[:1], digest[:2], qfn, qfn 

765 ) 

766 ) 

767 audio["ogg_url"] = ogg 

768 audio["mp3_url"] = mp3 

769 if active_pos: 

770 audio["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

771 if audio not in data.get("sounds", ()): 

772 data_append(data, "sounds", audio) 

773 

774 # if audios: 

775 # have_pronunciations = True 

776 audios = [] 

777 

778 data_extend(data, "hyphenations", hyphenations) 

779 hyphenations = [] 

780 

781 ## I have commented out the otherwise unused have_pronunciation 

782 ## toggles; uncomment them to use this debug print 

783 # if not have_pronunciations and not have_panel_templates: 

784 # wxr.wtp.debug("no pronunciations found from pronunciation section", 

785 # sortid="pronunciations/533") 

786 

787 

788def extract_th_pron_template( 

789 wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode 

790): 

791 # https://en.wiktionary.org/wiki/Template:th-pron 

792 @dataclass 

793 class TableHeader: 

794 raw_tags: list[str] 

795 rowspan: int 

796 

797 expanded_node = wxr.wtp.parse( 

798 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

799 ) 

800 sounds = [] 

801 for table_tag in expanded_node.find_html("table"): 

802 row_headers = [] 

803 for tr_tag in table_tag.find_html("tr"): 

804 field = "other" 

805 new_headers = [] 

806 for header in row_headers: 

807 if header.rowspan > 1: 

808 header.rowspan -= 1 

809 new_headers.append(header) 

810 row_headers = new_headers 

811 for th_tag in tr_tag.find_html("th"): 

812 header_str = clean_node(wxr, None, th_tag) 

813 if header_str.startswith("(standard) IPA"): 

814 field = "ipa" 

815 elif header_str.startswith("Homophones"): 

816 field = "homophone" 

817 elif header_str == "Audio": 

818 field = "audio" 

819 elif header_str != "": 

820 rowspan = 1 

821 rowspan_str = th_tag.attrs.get("rowspan", "1") 

822 if re.fullmatch(r"\d+", rowspan_str): 

823 rowspan = int(rowspan_str) 

824 header = TableHeader([], rowspan) 

825 for line in header_str.splitlines(): 

826 for raw_tag in line.strip("{}\n ").split(";"): 

827 raw_tag = raw_tag.strip() 

828 if raw_tag != "": 

829 header.raw_tags.append(raw_tag) 

830 row_headers.append(header) 
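# Hedged illustration (header text assumed): a <th> like "Paiboon" with
# rowspan="2" becomes TableHeader(["Paiboon"], 2); the rowspan counter is
# decremented on each later row and the header is dropped once it runs out,
# so its raw tags keep applying to the rows the cell visually spans.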

831 

832 for td_tag in tr_tag.find_html("td"): 

833 if field == "audio": 

834 for link_node in td_tag.find_child(NodeKind.LINK): 

835 filename = clean_node(wxr, None, link_node.largs[0]) 

836 if filename != "": 

837 sound = create_audio_url_dict(filename) 

838 sounds.append(sound) 

839 elif field == "homophone": 

840 for span_tag in td_tag.find_html_recursively( 

841 "span", attr_name="lang", attr_value="th" 

842 ): 

843 word = clean_node(wxr, None, span_tag) 

844 if word != "": 

845 sounds.append({"homophone": word}) 

846 else: 

847 raw_tags = [] 

848 for html_node in td_tag.find_child_recursively( 

849 NodeKind.HTML 

850 ): 

851 if html_node.tag == "small": 

852 node_str = clean_node(wxr, None, html_node) 

853 if node_str.startswith("[") and node_str.endswith( 

854 "]" 

855 ): 

856 for raw_tag in node_str.strip("[]").split(","): 

857 raw_tag = raw_tag.strip() 

858 if raw_tag != "": 858 ↛ 856line 858 didn't jump to line 856 because the condition on line 858 was always true

859 raw_tags.append(raw_tag) 

860 elif len(sounds) > 0: 

861 sounds[-1]["roman"] = node_str 

862 elif html_node.tag == "span": 

863 node_str = clean_node(wxr, None, html_node) 

864 span_lang = html_node.attrs.get("lang", "") 

865 span_class = html_node.attrs.get("class", "") 

866 if node_str != "" and ( 

867 span_lang == "th" or span_class in ["IPA", "tr"] 

868 ): 

869 sound = {} 

870 for raw_tag in raw_tags: 

871 if raw_tag in valid_tags: 

872 data_append(sound, "tags", raw_tag) 

873 else: 

874 data_append(sound, "raw_tags", raw_tag) 

875 for header in row_headers: 

876 for raw_tag in header.raw_tags: 

877 if raw_tag.lower() in valid_tags: 

878 data_append( 

879 sound, "tags", raw_tag.lower() 

880 ) 

881 else: 

882 data_append( 

883 sound, "raw_tags", raw_tag 

884 ) 

885 if "romanization" in sound.get("tags", []): 

886 field = "roman" 

887 sound[field] = node_str 

888 sounds.append(sound) 

889 

890 clean_node(wxr, word_entry, expanded_node) 

891 data_extend(word_entry, "sounds", sounds) 

892 

893 

894def extract_zh_pron_template( 

895 wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode 

896): 

897 # https://en.wiktionary.org/wiki/Template:zh-pron 

898 expanded_node = wxr.wtp.parse( 

899 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

900 ) 

901 seen_lists = set() 

902 sounds = [] 

903 for list_node in expanded_node.find_child_recursively(NodeKind.LIST): 

904 if list_node not in seen_lists: 

905 for list_item in list_node.find_child(NodeKind.LIST_ITEM): 

906 sounds.extend( 

907 extract_zh_pron_list_item(wxr, list_item, [], seen_lists) 

908 ) 

909 clean_node(wxr, word_entry, expanded_node) 

910 data_extend(word_entry, "sounds", sounds) 

911 

912 

913def extract_zh_pron_list_item( 

914 wxr: WiktextractContext, 

915 list_item: WikiNode, 

916 raw_tags: list[str], 

917 seen_lists: set[WikiNode], 

918) -> list[SoundData]: 

919 current_tags = raw_tags[:] 

920 sounds = [] 

921 is_first_small_tag = True 

922 for node in list_item.children: 

923 if isinstance(node, WikiNode) and node.kind == NodeKind.LINK: 

924 link_str = clean_node(wxr, None, node.largs) 

925 node_str = clean_node(wxr, None, node) 

926 if link_str.startswith("File:"): 

927 sound = create_audio_url_dict(link_str.removeprefix("File:")) 

928 sound["raw_tags"] = current_tags[:] 

929 translate_zh_pron_raw_tags(sound) 

930 sounds.append(sound) 

931 elif node_str != "": 

932 current_tags.append(node_str) 

933 elif isinstance(node, HTMLNode): 

934 if node.tag == "small": 

935 if is_first_small_tag: 

936 raw_tag_text = clean_node( 

937 wxr, 

938 None, 

939 [ 

940 n 

941 for n in node.children 

942 if not (isinstance(n, HTMLNode) and n.tag == "sup") 

943 ], 

944 ) 

945 current_tags.extend(split_zh_pron_raw_tag(raw_tag_text)) 

946 elif len(sounds) > 0: 

947 data_extend( 

948 sounds[-1], 

949 "raw_tags", 

950 split_zh_pron_raw_tag(clean_node(wxr, None, node)), 

951 ) 

952 translate_zh_pron_raw_tags(sounds[-1]) 

953 is_first_small_tag = False 

954 elif node.tag == "span": 

955 sounds.extend(extract_zh_pron_span(wxr, node, current_tags)) 

956 elif ( 

957 node.tag == "table" 

958 and len(current_tags) > 0 

959 and current_tags[-1] == "Homophones" 

960 ): 

961 sounds.extend( 

962 extract_zh_pron_homophone_table(wxr, node, current_tags) 

963 ) 

964 elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: 

965 seen_lists.add(node) 

966 for child_list_item in node.find_child(NodeKind.LIST_ITEM): 

967 sounds.extend( 

968 extract_zh_pron_list_item( 

969 wxr, child_list_item, current_tags, seen_lists 

970 ) 

971 ) 

972 

973 return sounds 

974 

975 

976def extract_zh_pron_homophone_table( 

977 wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str] 

978) -> list[SoundData]: 

979 sounds = [] 

980 for td_tag in table.find_html_recursively("td"): 

981 for span_tag in td_tag.find_html("span"): 

982 span_class = span_tag.attrs.get("class", "") 

983 span_lang = span_tag.attrs.get("lang", "") 

984 span_str = clean_node(wxr, None, span_tag) 

985 if ( 

986 span_str not in ["", "/"] 

987 and span_lang != "" 

988 and span_class in ["Hant", "Hans", "Hani"] 

989 ): 

990 sound = {"homophone": span_str, "raw_tags": raw_tags[:]} 

991 if span_class == "Hant": 

992 data_append(sound, "tags", "Traditional-Chinese") 

993 elif span_class == "Hans": 

994 data_append(sound, "tags", "Simplified-Chinese") 

995 translate_zh_pron_raw_tags(sound) 

996 sounds.append(sound) 

997 

998 return sounds 

999 

1000 

1001def translate_zh_pron_raw_tags(sound: SoundData): 

1002 from .zh_pron_tags import ZH_PRON_TAGS 

1003 

1004 raw_tags = [] 

1005 for raw_tag in sound.get("raw_tags", []): 

1006 if raw_tag in ZH_PRON_TAGS: 

1007 tr_tag = ZH_PRON_TAGS[raw_tag] 

1008 if isinstance(tr_tag, str): 

1009 data_append(sound, "tags", tr_tag) 

1010 elif isinstance(tr_tag, list) and tr_tag not in sound.get( 

1011 "tags", [] 

1012 ): 

1013 data_extend(sound, "tags", tr_tag) 

1014 elif raw_tag in valid_tags: 

1015 if raw_tag not in sound.get("tags", []): 

1016 data_append(sound, "tags", raw_tag) 

1017 elif raw_tag not in raw_tags: 

1018 raw_tags.append(raw_tag) 

1019 

1020 if len(raw_tags) > 0: 

1021 sound["raw_tags"] = raw_tags 

1022 elif "raw_tags" in sound: 

1023 del sound["raw_tags"] 
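# Hedged example (assumes the mapping contains such an entry): a raw tag that
# ZH_PRON_TAGS maps to a tag or list of tags is moved into sound["tags"];
# anything unmapped and not in valid_tags stays in sound["raw_tags"], which
# is deleted when nothing is left over.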

1024 

1025 

1026def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]: 

1027 raw_tags = [] 

1028 if "(" not in raw_tag_text: 

1029 for raw_tag in re.split(r",|:|;| and ", raw_tag_text): 

1030 raw_tag = raw_tag.strip().removeprefix("incl. ").strip() 

1031 if raw_tag != "": 

1032 raw_tags.append(raw_tag) 

1033 else: 

1034 processed_offsets = [] 

1035 for match in re.finditer(r"\([^()]+\)", raw_tag_text): 

1036 processed_offsets.append((match.start(), match.end())) 

1037 raw_tags.extend( 

1038 split_zh_pron_raw_tag( 

1039 raw_tag_text[match.start() + 1 : match.end() - 1] 

1040 ) 

1041 ) 

1042 not_processed = "" 

1043 last_end = 0 

1044 for start, end in processed_offsets: 

1045 not_processed += raw_tag_text[last_end:start] 

1046 last_end = end 

1047 not_processed += raw_tag_text[last_end:] 

1048 if not_processed != raw_tag_text: 

1049 raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags 

1050 else: 

1051 raw_tags.append(not_processed) 

1052 

1053 return raw_tags 
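# Worked example (input invented): "Sichuanese (Chengdu, incl. Luzhou)"
# first recurses into the parenthesized part, giving ["Chengdu", "Luzhou"],
# then splits what is left outside the parentheses, so the final result is
# ["Sichuanese", "Chengdu", "Luzhou"].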

1054 

1055 

1056def extract_zh_pron_span( 

1057 wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str] 

1058) -> list[SoundData]: 

1059 sounds = [] 

1060 small_tags = [] 

1061 pron_nodes = [] 

1062 roman = "" 

1063 phonetic_pron = "" 

1064 for index, node in enumerate(span_tag.children): 

1065 if isinstance(node, HTMLNode) and node.tag == "small": 

1066 small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node)) 

1067 elif ( 

1068 isinstance(node, HTMLNode) 

1069 and node.tag == "span" 

1070 and "-Latn" in node.attrs.get("lang", "") 

1071 ): 

1072 roman = clean_node(wxr, None, node).strip("() ") 

1073 elif isinstance(node, str) and node.strip() == "[Phonetic:": 

1074 phonetic_pron = clean_node( 

1075 wxr, None, span_tag.children[index + 1 :] 

1076 ).strip("] ") 

1077 break 

1078 else: 

1079 pron_nodes.append(node) 

1080 for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)): 

1081 zh_pron = zh_pron.strip("[]: ") 

1082 if len(zh_pron) > 0: 

1083 if "IPA" in span_tag.attrs.get("class", ""): 

1084 sound = {"ipa": zh_pron, "raw_tags": raw_tags[:]} 

1085 else: 

1086 sound = {"zh_pron": zh_pron, "raw_tags": raw_tags[:]} 

1087 if roman != "": 

1088 sound["roman"] = roman 

1089 sounds.append(sound) 

1090 if len(sounds) > 0: 

1091 data_extend(sounds[-1], "raw_tags", small_tags) 

1092 if phonetic_pron != "": 

1093 sound = { 

1094 "zh_pron": phonetic_pron, 

1095 "raw_tags": raw_tags[:] + ["Phonetic"], 

1096 } 

1097 if roman != "": 

1098 sound["roman"] = roman 

1099 sounds.append(sound) 

1100 for sound in sounds: 

1101 translate_zh_pron_raw_tags(sound) 

1102 return sounds 

1103 

1104 

1105def split_zh_pron(zh_pron: str) -> list[str]: 

1106 # split by commas and other symbols that are outside parentheses 

1107 parentheses = 0 

1108 pron_list = [] 

1109 pron = "" 

1110 for c in zh_pron: 

1111 if ( 

1112 (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/"))) 

1113 and parentheses == 0 

1114 and len(pron.strip()) > 0 

1115 ): 

1116 pron_list.append(pron.strip()) 

1117 pron = "" 

1118 elif c == "(": 

1119 parentheses += 1 

1120 pron += c 

1121 elif c == ")": 

1122 parentheses -= 1 

1123 pron += c 

1124 else: 

1125 pron += c 

1126 

1127 if pron.strip() != "": 

1128 pron_list.append(pron) 

1129 return pron_list
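if __name__ == "__main__":
    # Hedged usage sketch, not part of the original module; the example input
    # is invented. Commas outside parentheses split the string, while the
    # parenthesized qualifier stays attached to its pronunciation.
    print(split_zh_pron("kan (colloquial), kàn"))
    # Expected output (modulo surrounding whitespace):
    # ['kan (colloquial)', ' kàn']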