Coverage for src/wiktextract/extractor/en/pronunciation.py: 67%

535 statements  


1import hashlib 

2import re 

3import urllib 

4from copy import deepcopy 

5from typing import Iterator, Optional, Union 

6 

7from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

8 

9from ...clean import clean_value 

10from ...datautils import data_append, data_extend, split_at_comma_semi 

11from ...page import LEVEL_KINDS, clean_node, is_panel_template 

12from ...tags import valid_tags 

13from ...wxr_context import WiktextractContext 

14from .form_descriptions import ( 

15 classify_desc, 

16 decode_tags, 

17 parse_pronunciation_tags, 

18) 

19from .parts_of_speech import part_of_speech_map 

20from .type_utils import Hyphenation, SoundData, TemplateArgs, WordData 

21from .zh_pron_tags import ZH_PRON_TAGS 

22 

23# Prefixes, tags, and regexp for finding romanizations from the pronunciation 

24# section 

25pron_romanizations = { 

26 " Revised Romanization ": "romanization revised", 

27 " Revised Romanization (translit.) ": "romanization revised transliteration", 

28 " McCune-Reischauer ": "McCune-Reischauer romanization", 

29 " McCune–Reischauer ": "McCune-Reischauer romanization", 

30 " Yale Romanization ": "Yale romanization", 

31} 

32pron_romanization_re = re.compile( 

33 "(?m)^(" 

34 + "|".join( 

35 re.escape(x) 

36 for x in sorted(pron_romanizations.keys(), key=len, reverse=True) 

37 ) 

38 + ")([^\n]+)" 

39) 

40 

41IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$" 

42IPA_EXTRACT_RE = re.compile(IPA_EXTRACT) 
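# Worked example (illustrative, not in the original module): for a cleaned
# line "(UK) IPA⁽ᵏᵉʸ⁾: /fuː/ (rare)" the regex captures
#   group(2) == "UK"             (qualifier before the label)
#   group(4) == "/fuː/ (rare)"   (body including the trailing qualifier)
#   group(5) == "/fuː/"          (body without the trailing qualifier)
#   group(7) == "rare"           (trailing parenthesized qualifier)
# extract_pron_template below uses group(5)/group(7) when the template has
# a qq= argument and group(4) otherwise.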

43 

44 

45def extract_pron_template( 

46 wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str 

47) -> Optional[tuple[SoundData, list[SoundData]]]: 

48 """In post_template_fn, this is used to handle all enPR and IPA templates 

49 so that we can leave breadcrumbs in the text that are later handled 

50 in parse_pronunciation. We return a `base_data` so that if there are two 

51 or more templates on the same line, like this: 

52 (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/ 

53 then we can apply base_data fields to other templates, too, if needed. 

54 """ 

55 cleaned = clean_value(wxr, expanded) 

56 # print(f"extract_pron_template input: {tname=} {expanded=}-> {cleaned=}") 

57 m = IPA_EXTRACT_RE.match(cleaned) 

58 if not m: 

59 wxr.wtp.error( 

60 f"Text cannot match IPA_EXTRACT_RE regex: " 

61 f"{cleaned=}, {tname=}, {targs=}", 

62 sortid="en/pronunciation/54", 

63 ) 

64 return None 

65 # for i, group in enumerate(m.groups()): 

66 # print(i + 1, repr(group)) 

67 main_qual = m.group(2) or "" 

68 if "qq" in targs: 

69 # If the template has been given a qq= qualifier that applies to 

70 # every entry, it appears at the end of the line, where it can be 

71 # confused with the post-qualifier of a single entry in the style 

72 # of "... /ipa3/ (foo) (bar)"; foo might not be present, so bar 

73 # can look like it applies only to `/ipa3/`. In that case, treat 

74 # the trailing parenthesized part as a qualifier for the whole line. 

75 pron_body = m.group(5) 

76 post_qual = m.group(7) 

77 else: 

78 pron_body = m.group(4) 

79 post_qual = "" 

80 

81 if not pron_body: 

82 wxr.wtp.error( 

83 f"Regex failed to find 'body' from {cleaned=}", 

84 sortid="en/pronunciation/81", 

85 ) 

86 return None 

87 

88 base_data: SoundData = {} 

89 if main_qual: 

90 parse_pronunciation_tags(wxr, main_qual, base_data) 

91 if post_qual: 

92 parse_pronunciation_tags(wxr, post_qual, base_data) 

93 # This base_data is used as the base copy for all entries from this 

94 # template, but it is also returned so that its contents may be applied 

95 # to other templates on the same line. 

96 # print(f"{base_data=}") 

97 

98 sound_datas: list[SoundData] = [] 

99 

100 parts: list[list[str]] = [[]] 

101 inside = 0 

102 current: list[str] = [] 

103 for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)): 

104 # Split the line on commas and semicolons outside of parens. This 

105 # gives us chunks like "(main-qualifier) /phon/ (post-qualifier, maybe)" 

106 # print(f" {i=}, {p=}") 

107 comp = p.strip() 

108 if not p: 

109 continue 

110 if comp == "(": 

111 if not inside and i > 0: 

112 if stripped := "".join(current).strip(): 

113 parts[-1].append("".join(current).strip()) # type:ignore[arg-type] 

114 current = [p] 

115 inside += 1 

116 continue 

117 if comp == ")": 

118 inside -= 1 

119 if not inside: 

120 if stripped := "".join(current).strip(): 

121 current.append(p) 

122 parts[-1].append("".join(current).strip()) # type:ignore[arg-type] 

123 current = [] 

124 continue 

125 if not inside and comp in (",", ";"): 

126 if stripped := "".join(current).strip(): 

127 parts[-1].append(stripped) # type:ignore[arg-type] 

128 current = [] 

129 parts.append([]) 

130 continue 

131 current.append(p) 

132 if current: 

133 parts[-1].append("".join(current).strip()) 

134 

135 # print(f">>>>>> {parts=}") 
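# Illustrative trace (not part of the original module): for a body like
# "/fuː/ (UK), (US) /fʊ/" the loop above produces
#     parts == [["/fuː/", "(UK)"], ["(US)", "/fʊ/"]]
# i.e. one sub-list per comma- or semicolon-separated entry, with
# parenthesized qualifiers kept as separate strings.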

136 new_parts: list[list[str]] = [] 

137 for entry in parts: 

138 if not entry: 

139 continue 

140 new_entry: list[str] = [] 

141 i1: int = entry[0].startswith("(") and entry[0].endswith(")") 

142 if i1: 

143 new_entry.append(entry[0][1:-1].strip()) 

144 else: 

145 new_entry.append("") 

146 i2: int = ( 

147 entry[-1].startswith("(") 

148 and entry[-1].endswith(")") 

149 and len(entry) > 1 

150 ) 

151 if i2 == 0: 

152 i2 = len(entry) 

153 else: 

154 i2 = -1 

155 new_entry.append("".join(entry[i1:i2]).strip()) 

156 if not new_entry[-1]: 

157 wxr.wtp.error( 

158 f"Missing IPA/enPR sound data between qualifiers? {entry=}", 

159 sortid="en/pronunciation/153", 

160 ) 

161 if i2 == -1: 

162 new_entry.append(entry[-1][1:-1].strip()) 

163 else: 

164 new_entry.append("") 

165 new_parts.append(new_entry) 

166 

167 # print(f">>>>> {new_parts=}") 
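# Continuing the illustrative trace: each entry is normalized into a
# [pre-qualifier, body, post-qualifier] triple, so here
#     new_parts == [["", "/fuː/", "UK"], ["US", "/fʊ/", ""]]
# and the loop below builds one SoundData per triple.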

168 

169 for part in new_parts: 

170 sd = deepcopy(base_data) 

171 if part[0]: 

172 parse_pronunciation_tags(wxr, part[0], sd) 

173 if part[2]: 

174 parse_pronunciation_tags(wxr, part[2], sd) 

175 if tname == "enPR": 

176 sd["enpr"] = part[1] 

177 else: 

178 sd["ipa"] = part[1] 

179 sound_datas.append(sd) 

180 

181 # print(f"BASE_DATA: {base_data}") 

182 # print(f"SOUND_DATAS: {sound_datas=}") 

183 

184 return base_data, sound_datas 

185 

186 

187def parse_pronunciation( 

188 wxr: WiktextractContext, 

189 node: WikiNode, 

190 data: WordData, 

191 etym_data: WordData, 

192 have_etym: bool, 

193 base_data: WordData, 

194 lang_code: str, 

195) -> None: 

196 """Parses the pronunciation section from a language section on a 

197 page.""" 

198 assert isinstance(node, WikiNode) 

199 if node.kind in LEVEL_KINDS: 

200 contents = node.children 

201 else: 

202 contents = [node] 

203 # Remove subsections, such as Usage notes. They may contain IPAchar 

204 # templates in running text, and we do not want to extract IPAs from 

205 # those. 

206 # Filter out LEVEL_KINDS nodes; the 'or' does the heavy lifting here: 

207 # non-WikiNodes pass through, and WikiNodes pass through only if 

208 # they are not LEVEL_KINDS. 

209 contents = [ 

210 x 

211 for x in contents 

212 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

213 ] 

214 if not any( 

215 isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents 

216 ): 

217 # expand all templates 

218 new_contents: list[Union[str, WikiNode]] = [] 

219 for lst in contents: 

220 if ( 

221 isinstance(lst, TemplateNode) 

222 and isinstance(lst.largs[0][0], str) 

223 and lst.largs[0][0].strip() != "zh-pron" 

224 ): 

225 temp = wxr.wtp.node_to_wikitext(lst) 

226 temp = wxr.wtp.expand(temp) 

227 temp_parsed = wxr.wtp.parse(temp) 

228 new_contents.extend(temp_parsed.children) 

229 else: 

230 new_contents.append(lst) 

231 contents = new_contents 

232 

233 if have_etym and data is base_data: 

234 data = etym_data 

235 pron_templates: list[tuple[SoundData, list[SoundData]]] = [] 

236 hyphenations: list[Hyphenation] = [] 

237 audios = [] 

238 have_panel_templates = False 

239 

240 def parse_pronunciation_template_fn( 

241 name: str, ht: TemplateArgs 

242 ) -> Optional[str]: 

243 """Handle pronunciation and hyphenation templates""" 

244 # _template_fn handles templates *before* they are expanded; 

245 # this allows for special handling before all the work needed 

246 # for expansion is done. 

247 nonlocal have_panel_templates 

248 if is_panel_template(wxr, name): 

249 have_panel_templates = True 

250 return "" 

251 if name == "audio": 

252 filename = ht.get(2) or "" 

253 desc = ht.get(3) or "" 

254 desc = clean_node(wxr, None, [desc]) 

255 audio: SoundData = {"audio": filename.strip()} 

256 if desc: 

257 audio["text"] = desc 

258 m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc) 

259 skip = False 

260 if m: 

261 par = m.group(1) 

262 cls = classify_desc(par) 

263 if cls == "tags": 

264 parse_pronunciation_tags(wxr, par, audio) 

265 else: 

266 skip = True 

267 if skip: 

268 return "" 

269 audios.append(audio) 

270 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

271 if name == "audio-IPA": 

272 filename = ht.get(2) or "" 

273 ipa = ht.get(3) or "" 

274 dial = ht.get("dial") 

275 audio = {"audio": filename.strip()} 

276 if dial: 

277 dial = clean_node(wxr, None, [dial]) 

278 audio["text"] = dial 

279 if ipa: 

280 audio["audio-ipa"] = ipa 

281 audios.append(audio) 

282 # The problem with these IPAs is that they often just describe 

283 # what's in the sound file, rather than giving the pronunciation 

284 # of the word alone. It is common for audio files to contain 

285 # multiple pronunciations or articles in the same file, and then 

286 # this IPA often describes what is in the file. 

287 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

288 if name == "audio-pron": 

289 filename = ht.get(2) or "" 

290 ipa = ht.get("ipa") or "" 

291 dial = ht.get("dial") 

292 country = ht.get("country") 

293 audio = {"audio": filename.strip()} 

294 if dial: 

295 dial = clean_node(wxr, None, [dial]) 

296 audio["text"] = dial 

297 parse_pronunciation_tags(wxr, dial, audio) 

298 if country: 

299 parse_pronunciation_tags(wxr, country, audio) 

300 if ipa: 

301 audio["audio-ipa"] = ipa 

302 audios.append(audio) 

303 # XXX do we really want to extract pronunciations from these? 

304 # Or are they spurious / just describing what is in the 

305 # audio file? 

306 # if ipa: 

307 # pron = {"ipa": ipa} 

308 # if dial: 

309 # parse_pronunciation_tags(wxr, dial, pron) 

310 # if country: 

311 # parse_pronunciation_tags(wxr, country, pron) 

312 # data_append(data, "sounds", pron) 

313 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

314 if name in ("hyph", "hyphenation"): 

315 # {{hyph|en|re|late|caption="Hyphenation UK:"}} 

316 # {{hyphenation|it|quiè|to||qui|è|to||quié|to||qui|é|to}} 

317 # and also nocaption=1 

318 caption = clean_node(wxr, None, ht.get("caption", "")) 

319 tagsets, _ = decode_tags(caption) 

320 # flatten the tagsets into one; it would be really weird to have 

321 # several tagsets for a hyphenation caption 

322 tags = list(set(tag for tagset in tagsets for tag in tagset)) 

323 # We'll just ignore any errors from tags; it's not very important 

324 # for hyphenation 

325 tags = [tag for tag in tags if not tag.startswith("error")] 

326 hyph_sequences: list[list[str]] = [[]] 

327 for text in [ 

328 t for (k, t) in ht.items() if (isinstance(k, int) and k >= 2) 

329 ]: 

330 if not text: 

331 hyph_sequences.append([]) 

332 else: 

333 hyph_sequences[-1].append(clean_node(wxr, None, text)) 

334 for seq in hyph_sequences: 

335 hyphenations.append(Hyphenation(parts=seq, tags=tags)) 

336 return "" 
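# Illustrative example (not part of the original module): for
# {{hyphenation|it|quiè|to||qui|è|to}} the empty positional argument
# starts a new sequence, so hyph_sequences becomes
#     [["quiè", "to"], ["qui", "è", "to"]]
# and two Hyphenation entries are appended, both carrying whatever tags
# were decoded from caption=.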

337 return None 

338 

339 def parse_pron_post_template_fn( 

340 name: str, ht: TemplateArgs, text: str 

341 ) -> Optional[str]: 

342 # _post_template_fn handles templates *after* the work to expand 

343 # them has been done; this is exactly the same as _template_fn, 

344 # except with the additional expanded text as an input, and 

345 # possible side-effects from the expansion and recursion (like 

346 # calling other subtemplates that are handled in _template_fn). 

347 if is_panel_template(wxr, name): 

348 return "" 

349 if name in { 

350 "q", 

351 "qualifier", 

352 "sense", 

353 "a", 

354 "accent", 

355 "l", 

356 "link", 

357 "lb", 

358 "lbl", 

359 "label", 

360 }: 

361 # Kludge: when these templates expand to /.../ or [...], 

362 # replace the expansion by something safe. This is used 

363 # to filter spurious IPA-looking expansions that aren't really 

364 # IPAs. We probably don't care about these templates in the 

365 # contexts where they expand to something containing these. 

366 v = re.sub(r'href="[^"]*"', "", text) # Ignore URLs 

367 v = re.sub(r'src="[^"]*"', "", v) 

368 if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v): 

369 # Note: replacing by empty results in Lua errors that we 

370 # would rather not have. For example, voi/Middle Vietnamese 

371 # uses {{a|{{l|vi|...}}}}, and the {{a|...}} will fail 

372 # if {{l|...}} returns empty. 

373 return "stripped-by-parse_pron_post_template_fn" 

374 if name in ("IPA", "enPR"): 

375 # Extract the data from IPA and enPR templates (same underlying 

376 # template) and replace them in-text with magical cookie that 

377 # can be later used to refer to the data's index inside 

378 # pron_templates. 

379 if pron_t := extract_pron_template(wxr, name, ht, text): 

380 pron_templates.append(pron_t) 

381 return f"__PRON_TEMPLATE_{len(pron_templates)-1}__" 

382 return text 

383 

384 def parse_expanded_zh_pron( 

385 node: WikiNode, 

386 parent_hdrs: list[str], 

387 specific_hdrs: list[str], 

388 unknown_header_tags: set[str], 

389 ) -> None: 

390 def generate_pron( 

391 v, new_parent_hdrs: list[str], new_specific_hdrs: list[str] 

392 ) -> Optional[SoundData]: 

393 pron: SoundData = {} 

394 pron["tags"] = [] 

395 pron["zh-pron"] = v.strip() 

396 for hdr in new_parent_hdrs + new_specific_hdrs: 

397 hdr = hdr.strip() 

398 valid_hdr = re.sub(r"\s+", "-", hdr) 

399 if hdr in ZH_PRON_TAGS: 

400 for tag in ZH_PRON_TAGS[hdr]: 

401 if tag not in pron["tags"]: 

402 pron["tags"].append(tag) 

403 elif valid_hdr in valid_tags: 

404 if valid_hdr not in pron["tags"]: 

405 pron["tags"].append(valid_hdr) 

406 else: 

407 unknown_header_tags.add(hdr) 

408 # convert into normal IPA format if has the IPA flag 

409 if "IPA" in pron["tags"]: 

410 pron["ipa"] = v 

411 del pron["zh-pron"] 

412 pron["tags"].remove("IPA") 

413 # convert into IPA but retain the Sinological-IPA tag 

414 elif "Sinological-IPA" in pron["tags"]: 

415 pron["ipa"] = v 

416 del pron["zh-pron"] 

417 

418 if not (pron.get("zh-pron") or pron.get("ipa")): 

419 return None 

420 return pron 
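# Hedged sketch of generate_pron (hypothetical mapping; the contents of
# ZH_PRON_TAGS are not shown in this file): assuming the headers
# "Cantonese" and "IPA" both resolve to tags of the same name,
#     generate_pron("/kɐn²²/", ["Cantonese", "IPA"], [])
# would return {"ipa": "/kɐn²²/", "tags": ["Cantonese"]}; the "IPA" tag is
# consumed and the value is moved from "zh-pron" to "ipa".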

421 

422 if isinstance(node, list): 

423 for item in node: 

424 parse_expanded_zh_pron( 

425 item, parent_hdrs, specific_hdrs, unknown_header_tags 

426 ) 

427 return 

428 if not isinstance(node, WikiNode): 

429 return 

430 if node.kind != NodeKind.LIST: 

431 for item in node.children: 

432 parse_expanded_zh_pron( 

433 item, parent_hdrs, specific_hdrs, unknown_header_tags 

434 ) 

435 return 

436 for item in node.children: 

437 assert isinstance(item, WikiNode) 

438 assert item.kind == NodeKind.LIST_ITEM 

439 base_item = list( 

440 x 

441 for x in item.children 

442 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

443 ) 

444 text = clean_node(wxr, None, base_item) 

445 # print(f"{parent_hdrs} zhpron: {text}") # XXX remove me 

446 text = re.sub(r"(?s)\(Note:.*?\)", "", text) 

447 # Kludge to clean up text like 

448 # '(Standard Chinese, erhua-ed) (旋兒/旋儿)' where 

449 # the hanzi are examples 

450 hanzi_m = re.match(r"\s*(\([^()]*\))\s*\(([^()]*)\)\s*$", text) 

451 if hanzi_m: 

452 if re.search("[\u4e00-\u9fff]", hanzi_m.group(2)): 

453 text = hanzi_m.group(1) 

454 new_parent_hdrs = list(parent_hdrs) 

455 new_specific_hdrs = list(specific_hdrs) 

456 # look no further, here be dragons... 

457 

458 if ": " in text or ":" in text: 

459 parts = re.split(r": |:", text) 

460 m = re.match( 

461 r"\s*\((([^():]+)\s*(:|:)?\s*([^():]*))\)\s*$", text 

462 ) 

463 # Matches lines with stuff like "(Hokkien: Xiamen, Quanzhou)"; 

464 # the first group goes into new_parent_hdrs, the rest into new_specific_hdrs 

465 if m: 

466 new_parent_hdrs.append(m.group(2).strip()) 

467 for hdr in m.group(4).split(","): 

468 new_specific_hdrs.append(hdr.strip()) 

469 else: 

470 # if "Zhangzhou" in text: 

471 # print("\nFOUND IN:", text, "\n") 

472 # print("PARTS: ", repr(parts)) 

473 # print(f" PARTS: {parts}") 

474 extra_tags = parts[0] 

475 # Kludge to handle how (Hokkien: Locations) and 

476 # IPA (Specific Location) interact; this is why 

477 # specific_hdrs was introduced to the soup: parent_hdrs 

478 # holds the actual hierarchical higher-level tags 

479 # (Min'nan, Hokkien, etc.), which should always be 

480 # present, while specific_hdrs holds the list of 

481 # miscellaneous sublocations and subdialects that 

482 # can be overridden by more specific stuff 

483 # later. 

484 m = re.match(r"\s*IPA\s*\((.*)\)\s*$", extra_tags) 

485 if m: 

486 new_parent_hdrs.append("IPA") 

487 new_specific_hdrs = [ 

488 s.strip() for s in m.group(1).split(",") 

489 ] 

490 extra_tags = extra_tags[m.end() :] 

491 

492 m = re.match(r"\s*\([^()]*,[^()]*\)\s*$", extra_tags) 

493 if m: 

494 extra_tags = extra_tags.strip()[1:-1] # remove parens 

495 new_parent_hdrs.extend( 

496 s.strip() for s in extra_tags.split(",") 

497 ) 

498 elif extra_tags: 

499 new_parent_hdrs.append(extra_tags) 

500 

501 v = ":".join(parts[1:]) 

502 

503 # check for phrases 

504 if ("," in (wxr.wtp.title or "")) and len( 

505 v.split(" ") 

506 ) + v.count(",") == len(wxr.wtp.title or ""): 

507 # This just captures exact matches where we have 

508 # the pronunciation of the whole phrase and nothing 

509 # else. We split on spaces, and because we do not 

510 # split next to a comma we add the count of commas 

511 # so that it syncs up with the unicode string 

512 # length of the original hanzi, where the comma is 

513 # a separate character (unlike in the split list, 

514 # where it is part of a space-separated string, 

515 # like "teo⁴,"). 
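# Worked example (illustrative): for a hypothetical title "一二三,四五"
# (6 characters including the comma) and v == "yi er san, si wu",
# len(v.split(" ")) is 5 and v.count(",") is 1, so 5 + 1 == 6 and the
# whole value is kept as a single pronunciation.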

516 vals = [v] 

517 pron = generate_pron( 

518 v, new_parent_hdrs, new_specific_hdrs 

519 ) 

520 

521 if pron: 

522 pron["tags"] = list(sorted(pron["tags"])) 

523 if pron not in data.get("sounds", ()): 

524 data_append(data, "sounds", pron) 

525 elif "→" in v: 

526 vals = re.split("→", v) 

527 for v in vals: 

528 pron = generate_pron( 

529 v, new_parent_hdrs, new_specific_hdrs 

530 ) 

531 if pron: 

532 m = re.match( 

533 r"([^()]+)\s*\(toneless" 

534 r" final syllable variant\)\s*", 

535 v, 

536 ) 

537 if m: 

538 pron["zh-pron"] = m.group(1).strip() 

539 pron["tags"].append( 

540 "toneless-final-syllable-variant" 

541 ) 

542 

543 pron["tags"] = list(sorted(pron["tags"])) 

544 if pron not in data.get("sounds", ()): 

545 data_append(data, "sounds", pron) 

546 else: 

547 # split alternative pronunciations separated 

548 # by "," or " / " 

549 vals = re.split(r"\s*,\s*|\s+/\s+", v) 

550 new_vals = [] 

551 for v2 in vals: 

552 if v2.startswith("/") and v2.endswith("/"): 

553 # items like /kɛiŋ²¹³⁻⁵³ ^((Ø-))ŋuaŋ³³/ 

554 new_vals.append(v2) 

555 else: 

556 # split in parentheses otherwise 

557 new_vals.extend(re.split(r"[()]", v2)) 

558 vals = new_vals 

559 for v in vals: 

560 pron = generate_pron( 

561 v, new_parent_hdrs, new_specific_hdrs 

562 ) 

563 if pron: 

564 pron["tags"] = list(sorted(pron["tags"])) 

565 if pron not in data.get("sounds", ()): 

566 data_append(data, "sounds", pron) 

567 else: 

568 new_parent_hdrs.append(text) 

569 

570 for x in item.children: 

571 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST: 

572 parse_expanded_zh_pron( 

573 x, new_parent_hdrs, specific_hdrs, unknown_header_tags 

574 ) 

575 

576 def parse_chinese_pron( 

577 contents: Union[list[Union[WikiNode, str]], WikiNode, str], 

578 unknown_header_tags: set[str], 

579 ) -> None: 

580 if isinstance(contents, list): 

581 for item in contents: 

582 parse_chinese_pron(item, unknown_header_tags) 

583 return 

584 if not isinstance(contents, WikiNode): 

585 return 

586 if contents.kind != NodeKind.TEMPLATE: 

587 for item in contents.children: 

588 parse_chinese_pron(item, unknown_header_tags) 

589 return 

590 if ( 

591 len(contents.largs[0]) == 1 

592 and isinstance(contents.largs[0][0], str) 

593 and contents.largs[0][0].strip() == "zh-pron" 

594 ): 

595 src = wxr.wtp.node_to_wikitext(contents) 

596 expanded = wxr.wtp.expand(src, templates_to_expand={"zh-pron"}) 

597 parsed = wxr.wtp.parse(expanded) 

598 parse_expanded_zh_pron(parsed, [], [], unknown_header_tags) 

599 else: 

600 for item in contents.children: 

601 parse_chinese_pron(item, unknown_header_tags) 

602 return 

603 

604 if lang_code == "zh": 

605 unknown_header_tags: set[str] = set() 

606 parse_chinese_pron(contents, unknown_header_tags) 

607 for hdr in unknown_header_tags: 

608 wxr.wtp.debug( 

609 f"Zh-pron header not found in zh_pron_tags or tags: " 

610 f"{repr(hdr)}", 

611 sortid="pronunciations/296/20230324", 

612 ) 

613 

614 def flattened_tree( 

615 lines: list[Union[WikiNode, str]], 

616 ) -> Iterator[Union[WikiNode, str]]: 

617 assert isinstance(lines, list) 

618 for line in lines: 

619 yield from flattened_tree1(line) 

620 

621 def flattened_tree1( 

622 node: Union[WikiNode, str], 

623 ) -> Iterator[Union[WikiNode, str]]: 

624 assert isinstance(node, (WikiNode, str)) 

625 if isinstance(node, str): 

626 yield node 

627 return 

628 elif node.kind == NodeKind.LIST: 

629 for item in node.children: 

630 yield from flattened_tree1(item) 

631 elif node.kind == NodeKind.LIST_ITEM: 

632 new_children = [] 

633 sublist = None 

634 for child in node.children: 

635 if isinstance(child, WikiNode) and child.kind == NodeKind.LIST: 

636 sublist = child 

637 else: 

638 new_children.append(child) 

639 node.children = new_children 

640 node.sarg = "*" 

641 yield node 

642 if sublist: 

643 yield from flattened_tree1(sublist) 

644 else: 

645 yield node 
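# Illustrative behaviour (not part of the original module): a nested list
#     * Audio (US):
#     ** {{audio|en|...}}
# is yielded as two consecutive LIST_ITEM nodes; the sublist is detached
# from its parent item and yielded right after it, so each item can later
# be cleaned and processed as its own line.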

646 

647 # XXX Do not use flattened_tree more than once here, for example for 

648 # debug printing... The underlying data is changed, and the separated 

649 # sublists disappear. 

650 

651 # Kludge for templates that generate several lines, but haven't 

652 # been caught by earlier kludges... 

653 def split_cleaned_node_on_newlines( 

654 contents: list[Union[WikiNode, str]], 

655 ) -> Iterator[str]: 

656 for litem in flattened_tree(contents): 

657 ipa_text = clean_node( 

658 wxr, 

659 data, 

660 litem, 

661 template_fn=parse_pronunciation_template_fn, 

662 post_template_fn=parse_pron_post_template_fn, 

663 ) 

664 for line in ipa_text.splitlines(): 

665 yield line 

666 

667 # have_pronunciations = False 

668 active_pos: Optional[str] = None 

669 

670 for line in split_cleaned_node_on_newlines(contents): 

671 # print(f"{line=}") 

672 prefix: Optional[str] = None 

673 earlier_base_data: Optional[SoundData] = None 

674 if not line: 

675 continue 

676 

677 split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line) 

678 for i, text in enumerate(split_templates): 

679 if not text: 

680 continue 

681 # clean up list stars and whitespace at the start of the line 

682 text = re.sub(r"^\**\s*", "", text).strip() 

683 if i == 0: 

684 # At the start of a line, check for stuff like "Noun:" 

685 # for active_pos; active_pos is a temporary data field 

686 # given to each saved SoundData entry which is later 

687 # used to sort the entries into their respective PoSes. 

688 m = re.match(r"\s*(\w+\s?\w*)\s*:?\s*", text) 

689 if m: 

690 if (m_lower := m.group(1).lower()) in part_of_speech_map: 

691 active_pos = part_of_speech_map[m_lower]["pos"] 

692 text = text[m.end() :].strip() 

693 if not text: 

694 continue 
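# Illustrative example (assuming part_of_speech_map maps "noun" to the
# pos "noun"; the map itself is not shown in this file): a line starting
# with "Noun:" sets active_pos to "noun" and the prefix is stripped
# before the rest of the line is processed.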

695 if i % 2 == 1: 

696 # re.split (with capture groups) splits the line so that every 

697 # odd entry is a captured splitter (an index into pron_templates); 

698 # even entries are empty strings or the text around the splitters. 
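# For example (illustrative), a line "__PRON_TEMPLATE_0__, __PRON_TEMPLATE_1__"
# splits into ["", "0", ", ", "1", ""], so entries 1 and 3 are indices
# into pron_templates.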

699 base_pron_data, first_prons = pron_templates[int(text)] 

700 if base_pron_data: 

701 earlier_base_data = base_pron_data 

702 # print(f"Set {earlier_base_data=}") 

703 elif earlier_base_data is not None: 

704 # merge data from an earlier iteration of this loop 

705 for pr in first_prons: 

706 if "note" in pr and "note" in earlier_base_data: 

707 pr["note"] += ";" + earlier_base_data.get( 

708 "note", "" 

709 ) 

710 elif "note" in earlier_base_data: 

711 pr["note"] = earlier_base_data["note"] 

712 if "topics" in earlier_base_data: 

713 data_extend( 

714 pr, "topics", earlier_base_data["topics"] 

715 ) 

716 if "tags" in pr and "tags" in earlier_base_data: 

717 pr["tags"].extend(earlier_base_data["tags"]) 

718 elif "tags" in earlier_base_data: 

719 pr["tags"] = sorted(set(earlier_base_data["tags"])) 

720 for pr in first_prons: 

721 if active_pos: 

722 pr["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

723 if pr not in data.get("sounds", ()): 

724 data_append(data, "sounds", pr) 

725 # This bit is handled 

726 continue 

727 

728 if "IPA" in text: 

729 field = "ipa" 

730 else: 

731 # This is used for Rhymes, Homophones, etc 

732 field = "other" 

733 

734 # Check if it contains Japanese "Tokyo" pronunciation with 

735 # special syntax 

736 m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text) 

737 if m: 

738 pron: SoundData = {field: m.group(1)} # type: ignore[misc] 

739 if active_pos: 

740 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

741 data_append(data, "sounds", pron) 

742 # have_pronunciations = True 

743 continue 

744 

745 # Check if it contains Rhymes 

746 m = re.match(r"\s*Rhymes?: (.*)", text) 

747 if m: 

748 for ending in split_at_comma_semi(m.group(1)): 

749 ending = ending.strip() 

750 if ending: 

751 pron = {"rhymes": ending} 

752 if active_pos: 

753 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

754 data_append(data, "sounds", pron) 

755 # have_pronunciations = True 

756 continue 

757 

758 # Check if it contains homophones 

759 m = re.search(r"(?m)\bHomophones?: (.*)", text) 

760 if m: 

761 for w in split_at_comma_semi(m.group(1)): 

762 w = w.strip() 

763 if w: 

764 pron = {"homophone": w} 

765 if active_pos: 

766 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

767 data_append(data, "sounds", pron) 

768 # have_pronunciations = True 

769 continue 

770 

771 # Check if it contains Phonetic hangeul 

772 m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text) 

773 if m: 

774 seen = set() 

775 for w in m.group(1).split("/"): 

776 w = w.strip() 

777 if w and w not in seen: 

778 seen.add(w) 

779 pron = {"hangeul": w} 

780 if active_pos: 

781 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

782 data_append(data, "sounds", pron) 

783 # have_pronunciations = True 

784 

785 # This regex-based hyphenation detection left as backup 

786 m = re.search(r"\b(Syllabification|Hyphenation): *([^\n.]*)", text) 

787 if m: 

788 data_append(data, "hyphenation", m.group(2)) 

789 commaseparated = m.group(2).split(",") 

790 if len(commaseparated) > 1: 

791 for h in commaseparated: 

792 # That second character looks like a dash but it's 

793 # actually unicode decimal code 8231 (U+2027), the hyphenation point. 

794 # Add more delimiters here if needed. 

795 parts = re.split(r"-|‧", h.strip()) 

796 data_append( 

797 data, "hyphenations", Hyphenation(parts=parts) 

798 ) 

799 ... 

800 else: 

801 data_append( 

802 data, 

803 "hyphenations", 

804 Hyphenation(parts=m.group(2).split(sep="-")), 

805 ) 

806 # have_pronunciations = True 
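# Illustrative example (not part of the original module): for text
# containing "Hyphenation: pro‧nun‧ci‧a‧tion, pro-nun-ci-a-tion" the
# comma-separated branch above produces two Hyphenation entries, each
# with parts == ["pro", "nun", "ci", "a", "tion"].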

807 

808 # See if it contains a word prefix restricting which forms the 

809 # pronunciation applies to (see amica/Latin) and/or parenthesized 

810 # tags. 

811 m = re.match( 

812 r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text 

813 ) 

814 if m: 

815 prefix = m.group(2) or "" 

816 tagstext = m.group(3) 

817 text = text[m.end() :] 

818 else: 

819 m = re.match(r"^[*#\s]*([-\w]+):\s+", text) 

820 if m: 

821 prefix = m.group(1) 

822 tagstext = "" 

823 text = text[m.end() :] 

824 else: 

825 # Spanish has tags before pronunciations, eg. aceite/Spanish 

826 m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text) 

827 if m: 

828 tagstext = m.group(1) 

829 text = m.group(2) 

830 else: 

831 # No prefix. In this case, we inherit prefix 

832 # from previous entry. This particularly 

833 # applies for nested Audio files. 

834 tagstext = "" 

835 if tagstext: 

836 earlier_base_data = {} 

837 parse_pronunciation_tags(wxr, tagstext, earlier_base_data) 

838 

839 # Find romanizations from the pronunciation section (routinely 

840 # produced for Korean by {{ko-IPA}}) 

841 for m in re.finditer(pron_romanization_re, text): 

842 prefix = m.group(1) 

843 w = m.group(2).strip() 

844 tag = pron_romanizations[prefix] 

845 form = {"form": w, "tags": tag.split()} 

846 data_append(data, "forms", form) 

847 

848 # Find IPA pronunciations 

849 for m in re.finditer( 

850 r"(?m)/[^][\n/,]+?/" r"|" r"\[[^]\n0-9,/][^],/]*?\]", text 

851 ): 

852 v = m.group(0) 

853 # The regexp above can match file links. Skip them. 

854 if v.startswith("[[File:"): 

855 continue 

856 if v == "/wiki.local/": 

857 continue 

858 if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text: 

859 m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text) 

860 assert m 

861 idx = int(m.group(1)) 

862 if idx >= len(audios): 

863 continue 

864 if not audios[idx].get("audio-ipa"): 

865 audios[idx]["audio-ipa"] = v 

866 if prefix: 

867 audios[idx]["form"] = prefix 

868 else: 

869 if earlier_base_data: 

870 pron = deepcopy(earlier_base_data) 

871 pron[field] = v 

872 else: 

873 pron = {field: v} # type: ignore[misc] 

874 if active_pos: 

875 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

876 if prefix: 

877 pron["form"] = prefix 

878 if active_pos: 

879 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

880 data_append(data, "sounds", pron) 

881 # have_pronunciations = True 
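# Illustrative matches for the pattern above (not part of the original
# module): it picks up slash-delimited transcriptions like "/fuː/" and
# bracketed ones like "[fʊ]"; "[1]" is rejected by the pattern (leading
# digits are excluded) and "[[File:..." matches are skipped explicitly
# in the loop.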

882 

883 # XXX what about {{hyphenation|...}}, {{hyph|...}} 

884 # and those used to be stored under "hyphenation" 

885 

886 # Add data that was collected in template_fn 

887 for audio in audios: 

888 if "audio" in audio: 

889 # Compute audio file URLs 

890 fn = audio["audio"] 

891 # Strip certain characters, e.g., left-to-right mark 

892 fn = re.sub(r"[\u200f\u200e]", "", fn) 

893 fn = fn.strip() 

894 fn = urllib.parse.unquote(fn) 

895 # First character is usually uppercased 

896 if re.match(r"^[a-z][a-z]+", fn): 

897 fn = fn[0].upper() + fn[1:] 

898 if fn in wxr.config.redirects: 

899 fn = wxr.config.redirects[fn] 

900 # File extension is lowercased 

901 # XXX some words seem to need this, some don't seem to 

902 # have this??? what is the exact rule? 

903 # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn) 

904 # Spaces are converted to underscores 

905 fn = re.sub(r"\s+", "_", fn) 

906 # Compute hash digest part 

907 h = hashlib.md5() 

908 hname = fn.encode("utf-8") 

909 h.update(hname) 

910 digest = h.hexdigest() 

911 # Quote filename for URL 

912 qfn = urllib.parse.quote(fn) 

913 # For safety when writing files 

914 qfn = qfn.replace("/", "__slash__") 

915 if re.search(r"(?i)\.(ogg|oga)$", fn): 

916 ogg = ( 

917 "https://upload.wikimedia.org/wikipedia/" 

918 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

919 ) 

920 else: 

921 ogg = ( 

922 "https://upload.wikimedia.org/wikipedia/" 

923 "commons/transcoded/" 

924 "{}/{}/{}/{}.ogg".format( 

925 digest[:1], digest[:2], qfn, qfn 

926 ) 

927 ) 

928 if re.search(r"(?i)\.(mp3)$", fn): 

929 mp3 = ( 

930 "https://upload.wikimedia.org/wikipedia/" 

931 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

932 ) 

933 else: 

934 mp3 = ( 

935 "https://upload.wikimedia.org/wikipedia/" 

936 "commons/transcoded/" 

937 "{}/{}/{}/{}.mp3".format( 

938 digest[:1], digest[:2], qfn, qfn 

939 ) 

940 ) 

941 audio["ogg_url"] = ogg 

942 audio["mp3_url"] = mp3 
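# Illustrative result (hypothetical digest value): for fn == "En-us-word.ogg",
# if the md5 hex digest started with "ab", the URLs would be
#     ogg_url: https://upload.wikimedia.org/wikipedia/commons/a/ab/En-us-word.ogg
#     mp3_url: https://upload.wikimedia.org/wikipedia/commons/transcoded/a/ab/En-us-word.ogg/En-us-word.ogg.mp3
# Uploads that are not already .ogg/.oga get a transcoded .ogg URL in the
# same way.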

943 if active_pos: 

944 audio["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

945 if audio not in data.get("sounds", ()): 

946 data_append(data, "sounds", audio) 

947 

948 # if audios: 

949 # have_pronunciations = True 

950 audios = [] 

951 

952 data_extend(data, "hyphenations", hyphenations) 

953 hyphenations = [] 

954 

955 ## I have commented out the otherwise unused have_pronunciations 

956 ## toggles; uncomment them to use this debug print 

957 # if not have_pronunciations and not have_panel_templates: 

958 # wxr.wtp.debug("no pronunciations found from pronunciation section", 

959 # sortid="pronunciations/533")