Coverage for src/wiktextract/extractor/en/pronunciation.py: 65%

512 statements  

coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import hashlib 

2import re 

3import urllib 

4from copy import deepcopy 

5from typing import Iterator, Optional, Union 

6 

7from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

8 

9from ...clean import clean_value 

10from ...datautils import data_append, data_extend, split_at_comma_semi 

11from ...page import LEVEL_KINDS, clean_node, is_panel_template 

12from ...tags import valid_tags 

13from ...wxr_context import WiktextractContext 

14from .form_descriptions import classify_desc, parse_pronunciation_tags 

15from .parts_of_speech import part_of_speech_map 

16from .type_utils import SoundData, TemplateArgs, WordData 

17from .zh_pron_tags import ZH_PRON_TAGS 

18 

19# Prefixes, tags, and regexp for finding romanizations from the pronunciation

20# section 

21pron_romanizations = { 

22 " Revised Romanization ": "romanization revised", 

23 " Revised Romanization (translit.) ": "romanization revised transliteration", 

24 " McCune-Reischauer ": "McCune-Reischauer romanization", 

25 " McCune–Reischauer ": "McCune-Reischauer romanization", 

26 " Yale Romanization ": "Yale romanization", 

27} 

28pron_romanization_re = re.compile( 

29 "(?m)^(" 

30 + "|".join( 

31 re.escape(x) 

32 for x in sorted(pron_romanizations.keys(), key=len, reverse=True) 

33 ) 

34 + ")([^\n]+)" 

35) 
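
# Illustrative sketch (not part of the original module): how the prefix
# table and regex above cooperate on a hypothetical line of expanded
# {{ko-IPA}} output.
#
#     >>> m = pron_romanization_re.search(" McCune-Reischauer annyeong\n")
#     >>> m.group(2)
#     'annyeong'
#     >>> pron_romanizations[m.group(1)]
#     'McCune-Reischauer romanization'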

36 

37IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$" 

38IPA_EXTRACT_RE = re.compile(IPA_EXTRACT) 
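
# Illustrative sketch (hypothetical input): the groups IPA_EXTRACT_RE
# captures from a cleaned pronunciation line; group 2 is the line-wide
# qualifier, group 5 the body (without the trailing qualifier), and
# group 7 a trailing post-qualifier.
#
#     >>> m = IPA_EXTRACT_RE.match("(UK) IPA⁽ᵏᵉʸ⁾: /fuː/ (rare)")
#     >>> m.group(2), m.group(5), m.group(7)
#     ('UK', '/fuː/', 'rare')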

39 

40 

41def extract_pron_template( 

42 wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str 

43) -> Optional[tuple[SoundData, list[SoundData]]]: 

44 """In post_template_fn, this is used to handle all enPR and IPA templates 

45 so that we can leave breadcrumbs in the text that can later be handled 

46 there. We return a `base_data` so that if there are two 

47 or more templates on the same line, like this: 

48 (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/ 

49 then we can apply base_data fields to other templates, too, if needed. 

50 """ 

51 cleaned = clean_value(wxr, expanded) 

52 # print(f"extract_pron_template input: {tname=} {expanded=}-> {cleaned=}") 

53 m = IPA_EXTRACT_RE.match(cleaned) 

54 if not m: 

55 wxr.wtp.error( 

56 f"Text cannot match IPA_EXTRACT_RE regex: " 

57 f"{cleaned=}, {tname=}, {targs=}", 

58 sortid="en/pronunciation/54", 

59 ) 

60 return None 

61 # for i, group in enumerate(m.groups()): 

62 # print(i + 1, repr(group)) 

63 main_qual = m.group(2) or "" 

64 if "qq" in targs: 

65 # The template has been given a qualifier that applies to

66 # every entry, but it also happens to appear at the end of

67 # the line, where it can be confused with the post-qualifier

68 # of a single entry in the style of "... /ipa3/ (foo) (bar)";

69 # foo might be absent, so bar looks like it might apply

70 # only to `/ipa3/`.

71 pron_body = m.group(5) 

72 post_qual = m.group(7) 

73 else: 

74 pron_body = m.group(4) 

75 post_qual = "" 

76 

77 if not pron_body:  77 ↛ 78 (condition never true)

78 wxr.wtp.error( 

79 f"Regex failed to find 'body' from {cleaned=}", 

80 sortid="en/pronunciation/81", 

81 ) 

82 return None 

83 

84 base_data: SoundData = {} 

85 if main_qual: 

86 parse_pronunciation_tags(wxr, main_qual, base_data) 

87 if post_qual: 

88 parse_pronunciation_tags(wxr, post_qual, base_data) 

89 # This base_data is used as the base copy for all entries from this 

90 # template, but it is also returned so that its contents may be applied 

91 # to other templates on the same line. 

92 # print(f"{base_data=}") 

93 

94 sound_datas: list[SoundData] = [] 

95 

96 parts: list[list[str]] = [[]] 

97 inside = 0 

98 current: list[str] = [] 

99 for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)): 

100 # Split the line on commas and semicolons outside of parens. 

101 # This gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)" 

102 # print(f" {i=}, {p=}") 

103 comp = p.strip() 

104 if not p: 

105 continue 

106 if comp == "(": 

107 if not inside and i > 0:  107 ↛ 110 (condition always true)

108 if stripped := "".join(current).strip(): 

109 parts[-1].append("".join(current).strip()) # type:ignore[arg-type] 

110 current = [p] 

111 inside += 1 

112 continue 

113 if comp == ")": 

114 inside -= 1 

115 if not inside:  115 ↛ 120 (condition always true)

116 if stripped := "".join(current).strip():  116 ↛ 120 (condition always true)

117 current.append(p) 

118 parts[-1].append("".join(current).strip()) # type:ignore[arg-type] 

119 current = [] 

120 continue 

121 if not inside and comp in (",", ";"): 

122 if stripped := "".join(current).strip(): 

123 parts[-1].append(stripped) # type:ignore[arg-type] 

124 current = [] 

125 parts.append([]) 

126 continue 

127 current.append(p) 

128 if current: 

129 parts[-1].append("".join(current).strip()) 
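
# Illustrative trace (hypothetical pron_body): after the splitting loop
# above, "(US) /fuː/, /bɑː/ (rare)" has been divided into
#     parts == [["(US)", "/fuː/"], ["/bɑː/", "(rare)"]]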

130 

131 # print(f">>>>>> {parts=}") 

132 new_parts: list[list[str]] = [] 

133 for entry in parts: 

134 if not entry:  134 ↛ 135 (condition never true)

135 continue 

136 new_entry: list[str] = [] 

137 i1: int = entry[0].startswith("(") and entry[0].endswith(")") 

138 if i1: 

139 new_entry.append(entry[0][1:-1].strip()) 

140 else: 

141 new_entry.append("") 

142 i2: int = ( 

143 entry[-1].startswith("(") 

144 and entry[-1].endswith(")") 

145 and len(entry) > 1 

146 ) 

147 if i2 == 0: 

148 i2 = len(entry) 

149 else: 

150 i2 = -1 

151 new_entry.append("".join(entry[i1:i2]).strip()) 

152 if not new_entry[-1]:  152 ↛ 153 (condition never true)

153 wxr.wtp.error( 

154 f"Missing IPA/enPR sound data between qualifiers? {entry=}",

155 sortid="en/pronunciation/153", 

156 ) 

157 if i2 == -1: 

158 new_entry.append(entry[-1][1:-1].strip()) 

159 else: 

160 new_entry.append("") 

161 new_parts.append(new_entry) 
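
# Continuing the trace above: each entry is normalized into a
# [main-qualifier, body, post-qualifier] triple, so
#     new_parts == [["US", "/fuː/", ""], ["", "/bɑː/", "rare"]]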

162 

163 # print(f">>>>> {new_parts=}") 

164 

165 for part in new_parts: 

166 sd = deepcopy(base_data) 

167 if part[0]: 

168 parse_pronunciation_tags(wxr, part[0], sd) 

169 if part[2]: 

170 parse_pronunciation_tags(wxr, part[2], sd) 

171 if tname == "enPR": 

172 sd["enpr"] = part[1] 

173 else: 

174 sd["ipa"] = part[1] 

175 sound_datas.append(sd) 

176 

177 # print(f"BASE_DATA: {base_data}") 

178 # print(f"SOUND_DATAS: {sound_datas=}") 

179 

180 return base_data, sound_datas 
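
# End-to-end sketch of extract_pron_template on the running example
# (hedged: the exact tag strings are assumptions; in reality
# parse_pronunciation_tags decides what "US" and "rare" map to):
#
#     >>> extract_pron_template(
#     ...     wxr, "IPA", {}, "(US) IPA⁽ᵏᵉʸ⁾: /fuː/, /bɑː/ (rare)"
#     ... )  # doctest: +SKIP
#     ({'tags': ['US']},
#      [{'tags': ['US'], 'ipa': '/fuː/'},
#       {'tags': ['US', 'rare'], 'ipa': '/bɑː/'}])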

181 

182 

183def parse_pronunciation( 

184 wxr: WiktextractContext, 

185 node: WikiNode, 

186 data: WordData, 

187 etym_data: WordData, 

188 have_etym: bool, 

189 base_data: WordData, 

190 lang_code: str, 

191) -> None: 

192 """Parses the pronunciation section from a language section on a 

193 page.""" 

194 assert isinstance(node, WikiNode) 

195 if node.kind in LEVEL_KINDS:  195 ↛ 198 (condition always true)

196 contents = node.children 

197 else: 

198 contents = [node] 

199 # Remove subsections, such as Usage notes. They may contain IPAchar 

200 # templates in running text, and we do not want to extract IPAs from 

201 # those. 

202 # Filter out only LEVEL_KINDS; 'or' is doing heavy lifting here 

203 # Slip through not-WikiNodes, then slip through WikiNodes that 

204 # are not LEVEL_KINDS. 

205 contents = [ 

206 x 

207 for x in contents 

208 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

209 ] 

210 if not any( 

211 isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents 

212 ): 

213 # expand all templates 

214 new_contents: list[Union[str, WikiNode]] = [] 

215 for lst in contents: 

216 if ( 

217 isinstance(lst, TemplateNode) 

218 and isinstance(lst.largs[0][0], str) 

219 and lst.largs[0][0].strip() != "zh-pron" 

220 ): 

221 temp = wxr.wtp.node_to_wikitext(lst) 

222 temp = wxr.wtp.expand(temp) 

223 temp_parsed = wxr.wtp.parse(temp) 

224 new_contents.extend(temp_parsed.children) 

225 else: 

226 new_contents.append(lst) 

227 contents = new_contents 

228 

229 if have_etym and data is base_data:  229 ↛ 230 (condition never true)

230 data = etym_data 

231 pron_templates: list[tuple[SoundData, list[SoundData]]] = [] 

232 audios = [] 

233 have_panel_templates = False 

234 

235 def parse_pronunciation_template_fn( 

236 name: str, ht: TemplateArgs 

237 ) -> Optional[str]: 

238 # _template_fn handles templates *before* they are expanded; 

239 # this allows for special handling before all the work needed 

240 # for expansion is done. 

241 nonlocal have_panel_templates 

242 if is_panel_template(wxr, name): 

243 have_panel_templates = True 

244 return "" 

245 if name == "audio": 

246 filename = ht.get(2) or "" 

247 desc = ht.get(3) or "" 

248 desc = clean_node(wxr, None, [desc]) 

249 audio: SoundData = {"audio": filename.strip()} 

250 if desc:  250 ↛ 251 (condition never true)

251 audio["text"] = desc 

252 m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc) 

253 skip = False 

254 if m:  254 ↛ 255 (condition never true)

255 par = m.group(1) 

256 cls = classify_desc(par) 

257 if cls == "tags": 

258 parse_pronunciation_tags(wxr, par, audio) 

259 else: 

260 skip = True 

261 if skip:  261 ↛ 262 (condition never true)

262 return "" 

263 audios.append(audio) 

264 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 
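
# e.g. a hypothetical {{audio|en|En-us-foo.ogg}} records
# {"audio": "En-us-foo.ogg"} in `audios` and is replaced in the
# expanded text by the cookie "__AUDIO_IGNORE_THIS__0__", which the
# IPA scanner below uses to tie transcriptions back to this file.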

265 if name == "audio-IPA":  265 ↛ 266 (condition never true)

266 filename = ht.get(2) or "" 

267 ipa = ht.get(3) or "" 

268 dial = ht.get("dial") 

269 audio = {"audio": filename.strip()} 

270 if dial: 

271 dial = clean_node(wxr, None, [dial]) 

272 audio["text"] = dial 

273 if ipa: 

274 audio["audio-ipa"] = ipa 

275 audios.append(audio) 

276 # The problem with these IPAs is that they often just describe 

277 # what's in the sound file, rather than giving the pronunciation 

278 # of the word alone. It is common for audio files to contain 

279 # multiple pronunciations or articles in the same file, and then 

280 # this IPA often describes what is in the file. 

281 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

282 if name == "audio-pron": 

283 filename = ht.get(2) or "" 

284 ipa = ht.get("ipa") or "" 

285 dial = ht.get("dial") 

286 country = ht.get("country") 

287 audio = {"audio": filename.strip()} 

288 if dial:  288 ↛ 292 (condition always true)

289 dial = clean_node(wxr, None, [dial]) 

290 audio["text"] = dial 

291 parse_pronunciation_tags(wxr, dial, audio) 

292 if country:  292 ↛ 294 (condition always true)

293 parse_pronunciation_tags(wxr, country, audio) 

294 if ipa:  294 ↛ 296 (condition always true)

295 audio["audio-ipa"] = ipa 

296 audios.append(audio) 

297 # XXX do we really want to extract pronunciations from these? 

298 # Or are they spurious / just describing what is in the 

299 # audio file? 

300 # if ipa: 

301 # pron = {"ipa": ipa} 

302 # if dial: 

303 # parse_pronunciation_tags(wxr, dial, pron) 

304 # if country: 

305 # parse_pronunciation_tags(wxr, country, pron) 

306 # data_append(data, "sounds", pron) 

307 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

308 return None 

309 

310 def parse_pron_post_template_fn( 

311 name: str, ht: TemplateArgs, text: str 

312 ) -> Optional[str]: 

313 # _post_template_fn handles templates *after* the work to expand 

314 # them has been done; this is exactly the same as _template_fn, 

315 # except with the additional expanded text as an input, and 

316 # possible side-effects from the expansion and recursion (like 

317 # calling other subtemplates that are handled in _template_fn).

318 if is_panel_template(wxr, name):  318 ↛ 319 (condition never true)

319 return "" 

320 if name in { 

321 "q", 

322 "qualifier", 

323 "sense", 

324 "a", 

325 "accent", 

326 "l", 

327 "link", 

328 "lb", 

329 "lbl", 

330 "label", 

331 }: 

332 # Kludge: when these templates expand to /.../ or [...], 

333 # replace the expansion by something safe. This is used 

334 # to filter spurious IPA-looking expansions that aren't really 

335 # IPAs. We probably don't care about these templates in the 

336 # contexts where they expand to something containing these. 

337 v = re.sub(r'href="[^"]*"', "", text) # Ignore URLs 

338 v = re.sub(r'src="[^"]*"', "", v) 

339 if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v):  339 ↛ 345 (condition always true)

340 # Note: replacing by empty results in Lua errors that we 

341 # would rather not have. For example, voi/Middle Vietnamese 

342 # uses {{a|{{l{{vi|...}}}}, and the {{a|...}} will fail 

343 # if {{l|...}} returns empty. 

344 return "stripped-by-parse_pron_post_template_fn" 

345 if name in ("IPA", "enPR"): 

346 # Extract the data from IPA and enPR templates (same underlying 

347 # template) and replace them in-text with magical cookie that 

348 # can be later used to refer to the data's index inside 

349 # pron_templates. 

350 if pron_t := extract_pron_template(wxr, name, ht, text): 

351 pron_templates.append(pron_t) 

352 return f"__PRON_TEMPLATE_{len(pron_templates)-1}__" 

353 return text 
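
# Illustrative sketch of the cookie round-trip (hypothetical line):
# after this post_template_fn runs, "enPR: fōō, IPA⁽ᵏᵉʸ⁾: /fuː/" has
# become "__PRON_TEMPLATE_0__, __PRON_TEMPLATE_1__", and the line loop
# below recovers the indices with a capturing split:
#
#     >>> re.split(r"__PRON_TEMPLATE_(\d+)__",
#     ...          "__PRON_TEMPLATE_0__, __PRON_TEMPLATE_1__")
#     ['', '0', ', ', '1', '']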

354 

355 def parse_expanded_zh_pron( 

356 node: WikiNode, 

357 parent_hdrs: list[str], 

358 specific_hdrs: list[str], 

359 unknown_header_tags: set[str], 

360 ) -> None: 

361 def generate_pron( 

362 v, new_parent_hdrs: list[str], new_specific_hdrs: list[str] 

363 ) -> Optional[SoundData]: 

364 pron: SoundData = {} 

365 pron["tags"] = [] 

366 pron["zh-pron"] = v.strip() 

367 for hdr in new_parent_hdrs + new_specific_hdrs: 

368 hdr = hdr.strip() 

369 valid_hdr = re.sub(r"\s+", "-", hdr) 

370 if hdr in ZH_PRON_TAGS: 

371 for tag in ZH_PRON_TAGS[hdr]: 

372 if tag not in pron["tags"]: 

373 pron["tags"].append(tag) 

374 elif valid_hdr in valid_tags: 

375 if valid_hdr not in pron["tags"]: 

376 pron["tags"].append(valid_hdr) 

377 else: 

378 unknown_header_tags.add(hdr) 

379 # convert into normal IPA format if has the IPA flag 

380 if "IPA" in pron["tags"]: 

381 pron["ipa"] = v 

382 del pron["zh-pron"] 

383 pron["tags"].remove("IPA") 

384 # convert into IPA but retain the Sinological-IPA tag 

385 elif "Sinological-IPA" in pron["tags"]: 

386 pron["ipa"] = v 

387 del pron["zh-pron"] 

388 

389 if not (pron.get("zh-pron") or pron.get("ipa")): 

390 return None 

391 return pron 
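
# Illustrative result (assumed header names and tag mapping): with
# parent headers ["Mandarin", "Pinyin"] and v == "hǎo", generate_pron
# would return roughly {"tags": [...header-derived tags...],
# "zh-pron": "hǎo"}; under an "IPA"-tagged header the value is stored
# as "ipa" instead of "zh-pron".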

392 

393 if isinstance(node, list):  393 ↛ 394 (condition never true)

394 for item in node: 

395 parse_expanded_zh_pron( 

396 item, parent_hdrs, specific_hdrs, unknown_header_tags 

397 ) 

398 return 

399 if not isinstance(node, WikiNode):  399 ↛ 400 (condition never true)

400 return 

401 if node.kind != NodeKind.LIST:  401 ↛ 407 (condition always true)

402 for item in node.children: 

403 parse_expanded_zh_pron( 

404 item, parent_hdrs, specific_hdrs, unknown_header_tags 

405 ) 

406 return 

407 for item in node.children: 

408 assert isinstance(item, WikiNode) 

409 assert item.kind == NodeKind.LIST_ITEM 

410 base_item = list( 

411 x 

412 for x in item.children 

413 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

414 ) 

415 text = clean_node(wxr, None, base_item) 

416 # print(f"{parent_hdrs} zhpron: {text}") # XXX remove me 

417 text = re.sub(r"(?s)\(Note:.*?\)", "", text) 

418 # Kludge to clean up text like 

419 # '(Standard Chinese, erhua-ed) (旋兒/旋儿)' where 

420 # the hanzi are examples 

421 hanzi_m = re.match(r"\s*(\([^()]*\))\s*\(([^()]*)\)\s*$", text) 

422 if hanzi_m: 

423 if re.search("[\u4e00-\u9fff]", hanzi_m.group(2)): 

424 text = hanzi_m.group(1) 

425 new_parent_hdrs = list(parent_hdrs) 

426 new_specific_hdrs = list(specific_hdrs) 

427 # look no further, here be dragons... 

428 

429 if ": " in text or ":" in text: 

430 parts = re.split(r": |:", text) 

431 m = re.match( 

432 r"\s*\((([^():]+)\s*(:|:)?\s*([^():]*))\)\s*$", text 

433 ) 

434 # Matches lines with stuff like "(Hokkien: Xiamen, Quanzhou)" 

435 # thrown into new_parent_hdrs 

436 if m: 

437 new_parent_hdrs.append(m.group(2).strip()) 

438 for hdr in m.group(4).split(","): 

439 new_specific_hdrs.append(hdr.strip()) 

440 else: 

441 # if "Zhangzhou" in text: 

442 # print("\nFOUND IN:", text, "\n") 

443 # print("PARTS: ", repr(parts)) 

444 # print(f" PARTS: {parts}") 

445 extra_tags = parts[0] 

446 # Kludge to handle how (Hokkien: Locations) and 

447 # IPA (Specific Location) interact; this is why 

448 # specific_hdrs was introduced to the soup, just 

449 # to specify which are actual hierarchical higher 

450 # level tags (Min'nan, Hokkien, etc.) which should 

451 # always be present and then use specific_hdrs 

452 # for that list of misc sublocations and subdialects 

453 # that can be overridden by more specific stuff 

454 # later. 

455 m = re.match(r"\s*IPA\s*\((.*)\)\s*$", extra_tags) 

456 if m: 

457 new_parent_hdrs.append("IPA") 

458 new_specific_hdrs = [ 

459 s.strip() for s in m.group(1).split(",") 

460 ] 

461 extra_tags = extra_tags[m.end() :] 

462 

463 m = re.match(r"\s*\([^()]*,[^()]*\)\s*$", extra_tags) 

464 if m: 

465 extra_tags = extra_tags.strip()[1:-1] # remove parens 

466 new_parent_hdrs.extend( 

467 s.strip() for s in extra_tags.split(",") 

468 ) 

469 elif extra_tags: 

470 new_parent_hdrs.append(extra_tags) 

471 

472 v = ":".join(parts[1:]) 

473 

474 # check for phrases 

475 if ("," in (wxr.wtp.title or "")) and len( 

476 v.split(" ") 

477 ) + v.count(",") == len(wxr.wtp.title or ""): 

478 # This just captures exact matches where you have 

479 # the pronunciation of the whole phrase and nothing 

480 # else. Split on spaces, then because we're not 

481 # splitting next to a comma we need to add the 

482 # count of commas so that it syncs up with the

483 # unicode string length of the original hanzi, 

484 # where the comma is a separate character (unlike 

485 # in the split list, where it's part of a space- 

486 # separated string, like "teo⁴,". 

487 vals = [v] 

488 pron = generate_pron( 

489 v, new_parent_hdrs, new_specific_hdrs 

490 ) 

491 

492 if pron: 

493 pron["tags"] = list(sorted(pron["tags"])) 

494 if pron not in data.get("sounds", ()): 

495 data_append(data, "sounds", pron) 

496 elif "→" in v: 

497 vals = re.split("→", v) 

498 for v in vals: 

499 pron = generate_pron( 

500 v, new_parent_hdrs, new_specific_hdrs 

501 ) 

502 if pron: 

503 m = re.match( 

504 r"([^()]+)\s*\(toneless" 

505 r" final syllable variant\)\s*", 

506 v, 

507 ) 

508 if m: 

509 pron["zh-pron"] = m.group(1).strip() 

510 pron["tags"].append( 

511 "toneless-final-syllable-variant" 

512 ) 

513 

514 pron["tags"] = list(sorted(pron["tags"])) 

515 if pron not in data.get("sounds", ()): 

516 data_append(data, "sounds", pron) 

517 else: 

518 # split alternative pronunciations separated

519 # by "," or " / "

520 vals = re.split(r"\s*,\s*|\s+/\s+", v) 

521 new_vals = [] 

522 for v2 in vals: 

523 if v2.startswith("/") and v2.endswith("/"): 

524 # items like /kɛiŋ²¹³⁻⁵³ ^((Ø-))ŋuaŋ³³/ 

525 new_vals.append(v2) 

526 else: 

527 # split in parentheses otherwise 

528 new_vals.extend(re.split(r"[()]", v2)) 

529 vals = new_vals 

530 for v in vals: 

531 pron = generate_pron( 

532 v, new_parent_hdrs, new_specific_hdrs 

533 ) 

534 if pron: 

535 pron["tags"] = list(sorted(pron["tags"])) 

536 if pron not in data.get("sounds", ()): 

537 data_append(data, "sounds", pron) 

538 else: 

539 new_parent_hdrs.append(text) 

540 

541 for x in item.children: 

542 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST: 

543 parse_expanded_zh_pron( 

544 x, new_parent_hdrs, specific_hdrs, unknown_header_tags 

545 ) 

546 

547 def parse_chinese_pron( 

548 contents: Union[list[Union[WikiNode, str]], WikiNode, str], 

549 unknown_header_tags: set[str], 

550 ) -> None: 

551 if isinstance(contents, list): 

552 for item in contents: 

553 parse_chinese_pron(item, unknown_header_tags) 

554 return 

555 if not isinstance(contents, WikiNode): 

556 return 

557 if contents.kind != NodeKind.TEMPLATE: 

558 for item in contents.children: 

559 parse_chinese_pron(item, unknown_header_tags) 

560 return 

561 if ( 

562 len(contents.largs[0]) == 1 

563 and isinstance(contents.largs[0][0], str) 

564 and contents.largs[0][0].strip() == "zh-pron" 

565 ): 

566 src = wxr.wtp.node_to_wikitext(contents) 

567 expanded = wxr.wtp.expand(src, templates_to_expand={"zh-pron"}) 

568 parsed = wxr.wtp.parse(expanded) 

569 parse_expanded_zh_pron(parsed, [], [], unknown_header_tags) 

570 else: 

571 for item in contents.children:  571 ↛ 572 (loop never started)

572 parse_chinese_pron(item, unknown_header_tags) 

573 return 

574 

575 if lang_code == "zh": 

576 unknown_header_tags: set[str] = set() 

577 parse_chinese_pron(contents, unknown_header_tags) 

578 for hdr in unknown_header_tags:  578 ↛ 579 (loop never started)

579 wxr.wtp.debug( 

580 f"Zh-pron header not found in zh_pron_tags or tags: " 

581 f"{repr(hdr)}", 

582 sortid="pronunciations/296/20230324", 

583 ) 

584 

585 def flattened_tree( 

586 lines: list[Union[WikiNode, str]], 

587 ) -> Iterator[Union[WikiNode, str]]: 

588 assert isinstance(lines, list) 

589 for line in lines: 

590 yield from flattened_tree1(line) 

591 

592 def flattened_tree1( 

593 node: Union[WikiNode, str], 

594 ) -> Iterator[Union[WikiNode, str]]: 

595 assert isinstance(node, (WikiNode, str)) 

596 if isinstance(node, str): 

597 yield node 

598 return 

599 elif node.kind == NodeKind.LIST: 

600 for item in node.children: 

601 yield from flattened_tree1(item) 

602 elif node.kind == NodeKind.LIST_ITEM: 

603 new_children = [] 

604 sublist = None 

605 for child in node.children: 

606 if isinstance(child, WikiNode) and child.kind == NodeKind.LIST: 

607 sublist = child 

608 else: 

609 new_children.append(child) 

610 node.children = new_children 

611 node.sarg = "*" 

612 yield node 

613 if sublist: 

614 yield from flattened_tree1(sublist) 

615 else: 

616 yield node 
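
# Illustrative flattening (hypothetical wikitext): a nested list like
#     * (UK) IPA: /fuː/
#     ** Audio: ...
# yields the outer LIST_ITEM first (with its sublist detached and sarg
# reset to "*"), then the nested item, so callers see one flat stream
# of list items.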

617 

618 # XXX Do not use flattened_tree more than once here, for example for 

619 # debug printing... The underlying data is changed, and the separated 

620 # sublists disappear. 

621 

622 # Kludge for templates that generate several lines, but haven't 

623 # been caught by earlier kludges... 

624 def split_cleaned_node_on_newlines( 

625 contents: list[Union[WikiNode, str]], 

626 ) -> Iterator[str]: 

627 for litem in flattened_tree(contents): 

628 ipa_text = clean_node( 

629 wxr, 

630 data, 

631 litem, 

632 template_fn=parse_pronunciation_template_fn, 

633 post_template_fn=parse_pron_post_template_fn, 

634 ) 

635 for line in ipa_text.splitlines(): 

636 yield line 

637 

638 # have_pronunciations = False 

639 active_pos: Optional[str] = None 

640 

641 for line in split_cleaned_node_on_newlines(contents): 

642 # print(f"{line=}") 

643 prefix: Optional[str] = None 

644 earlier_base_data: Optional[SoundData] = None 

645 if not line:  645 ↛ 646 (condition never true)

646 continue 

647 

648 split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line) 

649 for i, text in enumerate(split_templates): 

650 if not text: 

651 continue 

652 # clean up asterisks at the start of the line

653 text = re.sub(r"^\**\s*", "", text).strip() 

654 if i == 0: 

655 # At the start of a line, check for stuff like "Noun:" 

656 # for active_pos; active_pos is a temporary data field 

657 # given to each saved SoundData entry which is later 

658 # used to sort the entries into their respective PoSes. 

659 m = re.match(r"\s*(\w+\s?\w*)\s*:?\s*", text) 

660 if m: 

661 if (m_lower := m.group(1).lower()) in part_of_speech_map: 

662 active_pos = part_of_speech_map[m_lower]["pos"] 

663 text = text[m.end() :].strip() 

664 if not text: 

665 continue 

666 if i % 2 == 1: 

667 # re.split (with capture groups) splits the line so that

668 # every odd-indexed entry is a captured splitter; even-indexed

669 # entries are empty strings or the text around the splitters.

670 base_pron_data, first_prons = pron_templates[int(text)] 

671 if base_pron_data: 

672 earlier_base_data = base_pron_data 

673 # print(f"Set {earlier_base_data=}") 

674 elif earlier_base_data is not None:  674 ↛ 691 (condition always true)

675 # merge data from an earlier iteration of this loop 

676 for pr in first_prons: 

677 if "note" in pr and "note" in earlier_base_data:  677 ↛ 678 (condition never true)

678 pr["note"] += ";" + earlier_base_data.get( 

679 "note", "" 

680 ) 

681 elif "note" in earlier_base_data:  681 ↛ 682 (condition never true)

682 pr["note"] = earlier_base_data["note"] 

683 if "topics" in earlier_base_data:  683 ↛ 684 (condition never true)

684 data_extend( 

685 pr, "topics", earlier_base_data["topics"] 

686 ) 

687 if "tags" in pr and "tags" in earlier_base_data:  687 ↛ 688 (condition never true)

688 pr["tags"].extend(earlier_base_data["tags"]) 

689 elif "tags" in earlier_base_data:  689 ↛ 676 (condition always true)

690 pr["tags"] = sorted(set(earlier_base_data["tags"])) 

691 for pr in first_prons: 

692 if active_pos: 

693 pr["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

694 if pr not in data.get("sounds", ()):  694 ↛ 691 (condition always true)

695 data_append(data, "sounds", pr) 

696 # This part of the line has been handled; move on.

697 continue 

698 

699 if "IPA" in text: 

700 field = "ipa" 

701 else: 

702 # This is used for Rhymes, Homophones, etc 

703 field = "other" 

704 

705 # Check if it contains Japanese "Tokyo" pronunciation with 

706 # special syntax 

707 m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text) 

708 if m:  708 ↛ 709 (condition never true)

709 pron: SoundData = {field: m.group(1)} # type: ignore[misc] 

710 if active_pos: 

711 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

712 data_append(data, "sounds", pron) 

713 # have_pronunciations = True 

714 continue 

715 

716 # Check if it contains Rhymes 

717 m = re.match(r"\s*Rhymes?: (.*)", text) 

718 if m: 

719 for ending in split_at_comma_semi(m.group(1)): 

720 ending = ending.strip() 

721 if ending:  721 ↛ 719 (condition always true)

722 pron = {"rhymes": ending} 

723 if active_pos: 

724 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

725 data_append(data, "sounds", pron) 

726 # have_pronunciations = True 

727 continue 
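
# Illustrative: a hypothetical line "Rhymes: -ɪŋ, -iːn" passes through
# the block above and appends {"rhymes": "-ɪŋ"} and {"rhymes": "-iːn"}
# (plus any active "pos") to the sound data.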

728 

729 # Check if it contains homophones 

730 m = re.search(r"(?m)\bHomophones?: (.*)", text) 

731 if m: 

732 for w in split_at_comma_semi(m.group(1)): 

733 w = w.strip() 

734 if w:  734 ↛ 732 (condition always true)

735 pron = {"homophone": w} 

736 if active_pos: 

737 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

738 data_append(data, "sounds", pron) 

739 # have_pronunciations = True 

740 continue 

741 

742 # Check if it contains Phonetic hangeul 

743 m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text) 

744 if m:  744 ↛ 745 (condition never true)

745 seen = set() 

746 for w in m.group(1).split("/"): 

747 w = w.strip() 

748 if w and w not in seen: 

749 seen.add(w) 

750 pron = {"hangeul": w} 

751 if active_pos: 

752 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

753 data_append(data, "sounds", pron) 

754 # have_pronunciations = True 

755 

756 m = re.search(r"\b(Syllabification|Hyphenation): ([^\s,]*)", text) 

757 if m:  757 ↛ 758 (condition never true)

758 data_append(data, "hyphenation", m.group(2)) 

759 # have_pronunciations = True 

760 

761 # See if it contains a word prefix restricting which forms the 

762 # pronunciation applies to (see amica/Latin) and/or parenthesized 

763 # tags. 

764 m = re.match( 

765 r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text 

766 ) 

767 if m: 

768 prefix = m.group(2) or "" 

769 tagstext = m.group(3) 

770 text = text[m.end() :] 

771 else: 

772 m = re.match(r"^[*#\s]*([-\w]+):\s+", text) 

773 if m: 

774 prefix = m.group(1) 

775 tagstext = "" 

776 text = text[m.end() :] 

777 else: 

778 # Spanish has tags before pronunciations, e.g. aceite/Spanish

779 m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text) 

780 if m:  780 ↛ 781 (condition never true)

781 tagstext = m.group(1) 

782 text = m.group(2) 

783 else: 

784 # No prefix. In this case, we inherit prefix 

785 # from previous entry. This particularly 

786 # applies for nested Audio files. 

787 tagstext = "" 

788 if tagstext: 

789 earlier_base_data = {} 

790 parse_pronunciation_tags(wxr, tagstext, earlier_base_data) 

791 

792 # Find romanizations from the pronunciation section (routinely 

793 # produced for Korean by {{ko-IPA}}) 

794 for m in re.finditer(pron_romanization_re, text):  794 ↛ 795 (loop never started)

795 prefix = m.group(1) 

796 w = m.group(2).strip() 

797 tag = pron_romanizations[prefix] 

798 form = {"form": w, "tags": tag.split()} 

799 data_append(data, "forms", form) 

800 

801 # Find IPA pronunciations 
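
# Illustrative matches for the IPA-finding regex below (hypothetical
# text; the two alternatives catch /slash/ and [bracket] notations):
#
#     >>> re.findall(r"/[^][\n/,]+?/|\[[^]\n0-9,/][^],/]*?\]",
#     ...            "IPA: /fuː/, [fuː]")
#     ['/fuː/', '[fuː]']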

802 for m in re.finditer( 

803 r"(?m)/[^][\n/,]+?/" r"|" r"\[[^]\n0-9,/][^],/]*?\]", text 

804 ): 

805 v = m.group(0) 

806 # The regexp above can match file links. Skip them. 

807 if v.startswith("[[File:"):  807 ↛ 808 (condition never true)

808 continue 

809 if v == "/wiki.local/":  809 ↛ 810 (condition never true)

810 continue 

811 if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text:  811 ↛ 812 (condition never true)

812 m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text) 

813 assert m 

814 idx = int(m.group(1)) 

815 if idx >= len(audios): 

816 continue 

817 if not audios[idx].get("audio-ipa"): 

818 audios[idx]["audio-ipa"] = v 

819 if prefix: 

820 audios[idx]["form"] = prefix 

821 else: 

822 if earlier_base_data: 

823 pron = deepcopy(earlier_base_data) 

824 pron[field] = v 

825 else: 

826 pron = {field: v} # type: ignore[misc] 

827 if active_pos: 

828 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

829 if prefix: 

830 pron["form"] = prefix 

831 if active_pos: 

832 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

833 data_append(data, "sounds", pron) 

834 # have_pronunciations = True 

835 

836 # XXX what about {{hyphenation|...}}, {{hyph|...}} 

837 # and those used to be stored under "hyphenation" 

838 

839 # Add data that was collected in template_fn 

840 for audio in audios: 

841 if "audio" in audio:  841 ↛ 898 (condition always true)

842 # Compute audio file URLs 

843 fn = audio["audio"] 

844 # Strip certain characters, e.g., left-to-right mark 

845 fn = re.sub(r"[\u200f\u200e]", "", fn) 

846 fn = fn.strip() 

847 fn = urllib.parse.unquote(fn) 

848 # First character is usually uppercased 

849 if re.match(r"^[a-z][a-z]+", fn): 

850 fn = fn[0].upper() + fn[1:] 

851 if fn in wxr.config.redirects:  851 ↛ 852 (condition never true)

852 fn = wxr.config.redirects[fn] 

853 # File extension is lowercased 

854 # XXX some words seem to need this, some don't seem to 

855 # have this??? what is the exact rule? 

856 # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn) 

857 # Spaces are converted to underscores 

858 fn = re.sub(r"\s+", "_", fn) 

859 # Compute hash digest part 

860 h = hashlib.md5() 

861 hname = fn.encode("utf-8") 

862 h.update(hname) 

863 digest = h.hexdigest() 

864 # Quote filename for URL 

865 qfn = urllib.parse.quote(fn) 

866 # For safety when writing files 

867 qfn = qfn.replace("/", "__slash__") 
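
# Sketch of the Wikimedia Commons URL scheme implemented below
# (hypothetical filename; digest shown symbolically, not computed):
#
#     fn = "Example.ogg"
#     digest = hashlib.md5(fn.encode("utf-8")).hexdigest()
#     ogg = ("https://upload.wikimedia.org/wikipedia/commons/"
#            f"{digest[:1]}/{digest[:2]}/Example.ogg")
#
# Non-Ogg originals instead get transcoded URLs under
# .../commons/transcoded/<d1>/<d1d2>/<name>/<name>.ogg (and .mp3).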

868 if re.search(r"(?i)\.(ogg|oga)$", fn): 

869 ogg = ( 

870 "https://upload.wikimedia.org/wikipedia/" 

871 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

872 ) 

873 else: 

874 ogg = ( 

875 "https://upload.wikimedia.org/wikipedia/" 

876 "commons/transcoded/" 

877 "{}/{}/{}/{}.ogg".format( 

878 digest[:1], digest[:2], qfn, qfn 

879 ) 

880 ) 

881 if re.search(r"(?i)\.(mp3)$", fn):  881 ↛ 882 (condition never true)

882 mp3 = ( 

883 "https://upload.wikimedia.org/wikipedia/" 

884 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

885 ) 

886 else: 

887 mp3 = ( 

888 "https://upload.wikimedia.org/wikipedia/" 

889 "commons/transcoded/" 

890 "{}/{}/{}/{}.mp3".format( 

891 digest[:1], digest[:2], qfn, qfn 

892 ) 

893 ) 

894 audio["ogg_url"] = ogg 

895 audio["mp3_url"] = mp3 

896 if active_pos: 

897 audio["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

898 if audio not in data.get("sounds", ()): 

899 data_append(data, "sounds", audio) 

900 # if audios: 

901 # have_pronunciations = True 

902 audios = [] 

903 

904 ## I have commented out the otherwise unused have_pronunciations

905 ## toggles; uncomment them to use this debug print 

906 # if not have_pronunciations and not have_panel_templates: 

907 # wxr.wtp.debug("no pronunciations found from pronunciation section", 

908 # sortid="pronunciations/533")