Coverage for src/wiktextract/extractor/en/pronunciation.py: 52%

510 statements  

coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import hashlib 

2import re 

3import urllib 

4from copy import deepcopy 

5from typing import Iterator, Optional, Union 

6 

7from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

8 

9from ...clean import clean_value 

10from ...datautils import data_append, data_extend, split_at_comma_semi 

11from ...page import LEVEL_KINDS, clean_node, is_panel_template 

12from ...tags import valid_tags 

13from ...wxr_context import WiktextractContext 

14from .form_descriptions import classify_desc, parse_pronunciation_tags 

15from .parts_of_speech import part_of_speech_map 

16from .type_utils import SoundData, TemplateArgs, WordData 

17from .zh_pron_tags import ZH_PRON_TAGS 

18 

19# Prefixes, tags, and regexp for finding romanizations from the pronunciation

20# section 

21pron_romanizations = { 

22 " Revised Romanization ": "romanization revised", 

23 " Revised Romanization (translit.) ": "romanization revised transliteration", 

24 " McCune-Reischauer ": "McCune-Reischauer romanization", 

25 " McCune–Reischauer ": "McCune-Reischauer romanization", 

26 " Yale Romanization ": "Yale romanization", 

27} 

28pron_romanization_re = re.compile( 

29 "(?m)^(" 

30 + "|".join( 

31 re.escape(x) 

32 for x in sorted(pron_romanizations.keys(), key=len, reverse=True) 

33 ) 

34 + ")([^\n]+)" 

35) 
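# A hedged illustration (the Korean word is invented, not taken from the
# source): an expanded {{ko-IPA}} table contains lines such as
#   " Revised Romanization gamsa"
# Note the significant leading/trailing spaces in the dict keys above; the
# regexp matches group(1) == " Revised Romanization " and group(2) == "gamsa",
# which parse_pronunciation() below turns into roughly
#   {"form": "gamsa", "tags": ["romanization", "revised"]}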

36 

37IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$" 

38IPA_EXTRACT_RE = re.compile(IPA_EXTRACT) 
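# A hedged illustration (word and qualifiers invented): for a cleaned line
#   "(UK) IPA⁽ᵏᵉʸ⁾: /fuː/ (rare)"
# IPA_EXTRACT_RE gives group(2) == "UK" (line-level qualifier),
# group(5) == "/fuː/" (the pronunciation body) and group(7) == "rare"
# (trailing qualifier); extract_pron_template() below chooses between
# groups 4/5/7 depending on whether the template has a "qq" argument.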

39 

40 

41def extract_pron_template( 

42 wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str 

43) -> Optional[tuple[SoundData, list[SoundData]]]: 

44 """In post_template_fn, this is used to handle all enPR and IPA templates 

45 so that we can leave breadcrumbs in the text that can later be handled 

46 there. We return a `base_data` so that if there are two 

47 or more templates on the same line, like this: 

48 (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/ 

49 then we can apply base_data fields to other templates, too, if needed. 

50 """ 

51 cleaned = clean_value(wxr, expanded) 

52 # print(f"extract_pron_template input: {tname=} {expanded=}-> {cleaned=}") 

53 m = IPA_EXTRACT_RE.match(cleaned) 

54 if not m:

55 wxr.wtp.error( 

56 f"Text cannot match IPA_EXTRACT_RE regex: " 

57 f"{cleaned=}, {tname=}, {targs=}", 

58 sortid="en/pronunciation/54", 

59 ) 

60 return None 

61 # for i, group in enumerate(m.groups()): 

62 # print(i + 1, repr(group)) 

63 main_qual = m.group(2) or "" 

64 if "qq" in targs: 

65 # The template has been given a "qq" qualifier that applies to

66 # every entry, but it appears at the end of the line, where it can

67 # be confused with the post-qualifier of a single entry in the

68 # style of "... /ipa3/ (foo) (bar)": foo might not be present, so

69 # bar looks like it might apply only to `/ipa3/`. In that case use

70 # the body without the trailing qualifier (group 5) plus group 7.

71 pron_body = m.group(5) 

72 post_qual = m.group(7) 

73 else: 

74 pron_body = m.group(4) 

75 post_qual = "" 

76 

77 if not pron_body:

78 wxr.wtp.error( 

79 f"Regex failed to find 'body' from {cleaned=}", 

80 sortid="en/pronunciation/81", 

81 ) 

82 return None 

83 

84 base_data: SoundData = {} 

85 if main_qual: 

86 parse_pronunciation_tags(wxr, main_qual, base_data) 

87 if post_qual: 

88 parse_pronunciation_tags(wxr, post_qual, base_data) 

89 # This base_data is used as the base copy for all entries from this 

90 # template, but it is also returned so that its contents may be applied 

91 # to other templates on the same line. 

92 # print(f"{base_data=}") 

93 

94 sound_datas: list[SoundData] = [] 

95 

96 parts: list[list[str]] = [[]] 

97 inside = 0 

98 current: list[str] = [] 

99 for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)): 

100 # Split the line on commas and semicolons outside of parens. 

101 # This gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)" 

102 # print(f" {i=}, {p=}") 

103 comp = p.strip() 

104 if not p: 

105 continue 

106 if comp == "(": 

107 if not inside and i > 0:

108 if stripped := "".join(current).strip(): 

109 parts[-1].append("".join(current).strip()) # type:ignore[arg-type] 

110 current = [p] 

111 inside += 1 

112 continue 

113 if comp == ")": 

114 inside -= 1 

115 if not inside:

116 if stripped := "".join(current).strip():

117 current.append(p) 

118 parts[-1].append("".join(current).strip()) # type:ignore[arg-type] 

119 current = [] 

120 continue 

121 if not inside and comp in (",", ";"): 

122 if stripped := "".join(current).strip(): 

123 parts[-1].append(stripped) # type:ignore[arg-type] 

124 current = [] 

125 parts.append([]) 

126 continue 

127 current.append(p) 

128 if current: 

129 parts[-1].append("".join(current).strip()) 
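# A hedged worked example (the IPA strings are invented): for
#   pron_body == "/fuː/, /fyː/ (Quebec)"
# the loop above yields
#   parts == [["/fuː/"], ["/fyː/", "(Quebec)"]]
# one sub-list per comma/semicolon-separated entry, with any parenthesized
# qualifier kept as a separate element.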

130 

131 # print(f">>>>>> {parts=}") 

132 new_parts: list[list[str]] = [] 

133 for entry in parts: 

134 if not entry:

135 continue 

136 new_entry: list[str] = [] 

137 i1: int = entry[0].startswith("(") and entry[0].endswith(")") 

138 if i1: 

139 new_entry.append(entry[0][1:-1].strip()) 

140 else: 

141 new_entry.append("") 

142 i2: int = ( 

143 entry[-1].startswith("(") 

144 and entry[-1].endswith(")") 

145 and len(entry) > 1 

146 ) 

147 if i2 == 0: 

148 i2 = len(entry) 

149 else: 

150 i2 = -1 

151 new_entry.append("".join(entry[i1:i2]).strip()) 

152 if not new_entry[-1]:

153 wxr.wtp.error( 

154 f"Missing IPA/enPR sound data between qualifiers? " f"{entry=}",

155 sortid="en/pronunciation/153", 

156 ) 

157 if i2 == -1: 

158 new_entry.append(entry[-1][1:-1].strip()) 

159 else: 

160 new_entry.append("") 

161 new_parts.append(new_entry) 

162 

163 # print(f">>>>> {new_parts=}") 

164 

165 for part in new_parts: 

166 sd = deepcopy(base_data) 

167 if part[0]: 

168 parse_pronunciation_tags(wxr, part[0], sd) 

169 if part[2]: 

170 parse_pronunciation_tags(wxr, part[2], sd) 

171 if tname == "enPR": 

172 sd["enpr"] = part[1] 

173 else: 

174 sd["ipa"] = part[1] 

175 sound_datas.append(sd) 

176 

177 # print(f"BASE_DATA: {base_data}") 

178 # print(f"SOUND_DATAS: {sound_datas=}") 

179 

180 return base_data, sound_datas 

181 

182 
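# A hedged sketch of the return value (strings invented; the exact tag names
# depend on parse_pronunciation_tags): for an expansion that cleans to
#   "(UK) IPA⁽ᵏᵉʸ⁾: /fuː/, /fyː/ (Quebec)"
# extract_pron_template() returns roughly
#   ({"tags": [...]},                         # base_data from "UK"
#    [{"ipa": "/fuː/", "tags": [...]},
#     {"ipa": "/fyː/", "tags": [...]}])       # "Quebec" added to the second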

183def parse_pronunciation( 

184 wxr: WiktextractContext, 

185 node: WikiNode, 

186 data: WordData, 

187 etym_data: WordData, 

188 have_etym: bool, 

189 base_data: WordData, 

190 lang_code: str, 

191) -> None: 

192 """Parses the pronunciation section from a language section on a 

193 page.""" 

194 assert isinstance(node, WikiNode) 

195 if node.kind in LEVEL_KINDS:

196 contents = node.children 

197 else: 

198 contents = [node] 

199 # Remove subsections, such as Usage notes. They may contain IPAchar 

200 # templates in running text, and we do not want to extract IPAs from 

201 # those. 

202 # Filter out only LEVEL_KINDS; 'or' is doing heavy lifting here 

203 # Slip through not-WikiNodes, then slip through WikiNodes that 

204 # are not LEVEL_KINDS. 

205 contents = [ 

206 x 

207 for x in contents 

208 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

209 ] 

210 if not any(

211 isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents 

212 ): 

213 # expand all templates 

214 new_contents: list[Union[str, WikiNode]] = [] 

215 for lst in contents: 

216 if ( 

217 isinstance(lst, TemplateNode) 

218 and isinstance(lst.largs[0][0], str) 

219 and lst.largs[0][0].strip() != "zh-pron" 

220 ): 

221 temp = wxr.wtp.node_to_wikitext(lst) 

222 temp = wxr.wtp.expand(temp) 

223 temp_parsed = wxr.wtp.parse(temp) 

224 new_contents.extend(temp_parsed.children) 

225 else: 

226 new_contents.append(lst) 

227 contents = new_contents 

228 

229 if have_etym and data is base_data:

230 data = etym_data 

231 pron_templates: list[tuple[SoundData, list[SoundData]]] = [] 

232 audios = [] 

233 have_panel_templates = False 

234 

235 def parse_pronunciation_template_fn( 

236 name: str, ht: TemplateArgs 

237 ) -> Optional[str]: 

238 # _template_fn handles templates *before* they are expanded; 

239 # this allows for special handling before all the work needed 

240 # for expansion is done. 

241 nonlocal have_panel_templates 

242 if is_panel_template(wxr, name):

243 have_panel_templates = True 

244 return "" 

245 if name == "audio": 

246 filename = ht.get(2) or "" 

247 desc = ht.get(3) or "" 

248 desc = clean_node(wxr, None, [desc]) 

249 audio: SoundData = {"audio": filename.strip()} 

250 if desc:

251 audio["text"] = desc 

252 m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc) 

253 skip = False 

254 if m:

255 par = m.group(1) 

256 cls = classify_desc(par) 

257 if cls == "tags": 

258 parse_pronunciation_tags(wxr, par, audio) 

259 else: 

260 skip = True 

261 if skip:

262 return "" 

263 audios.append(audio) 

264 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

265 if name == "audio-IPA":

266 filename = ht.get(2) or "" 

267 ipa = ht.get(3) or "" 

268 dial = ht.get("dial") 

269 audio = {"audio": filename.strip()} 

270 if dial: 

271 dial = clean_node(wxr, None, [dial]) 

272 audio["text"] = dial 

273 if ipa: 

274 audio["audio-ipa"] = ipa 

275 audios.append(audio) 

276 # The problem with these IPAs is that they often just describe 

277 # what's in the sound file, rather than giving the pronunciation 

278 # of the word alone. It is common for audio files to contain 

279 # multiple pronunciations or articles in the same file, and then 

280 # this IPA often describes what is in the file. 

281 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

282 if name == "audio-pron":

283 filename = ht.get(2) or "" 

284 ipa = ht.get("ipa") or "" 

285 dial = ht.get("dial") 

286 country = ht.get("country") 

287 audio = {"audio": filename.strip()} 

288 if dial: 

289 dial = clean_node(wxr, None, [dial]) 

290 audio["text"] = dial 

291 parse_pronunciation_tags(wxr, dial, audio) 

292 if country: 

293 parse_pronunciation_tags(wxr, country, audio) 

294 if ipa: 

295 audio["audio-ipa"] = ipa 

296 audios.append(audio) 

297 # XXX do we really want to extract pronunciations from these? 

298 # Or are they spurious / just describing what is in the 

299 # audio file? 

300 # if ipa: 

301 # pron = {"ipa": ipa} 

302 # if dial: 

303 # parse_pronunciation_tags(wxr, dial, pron) 

304 # if country: 

305 # parse_pronunciation_tags(wxr, country, pron) 

306 # data_append(data, "sounds", pron) 

307 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

308 return None 

309 

310 def parse_pron_post_template_fn( 

311 name: str, ht: TemplateArgs, text: str 

312 ) -> Optional[str]: 

313 # _post_template_fn handles templates *after* the work to expand 

314 # them has been done; this is exactly the same as _template_fn, 

315 # except with the additional expanded text as an input, and 

316 # possible side-effects from the expansion and recursion (like 

317 # calling other subtemplates that are handled in _template_fn).

318 if is_panel_template(wxr, name):

319 return "" 

320 if name in {

321 "q", 

322 "qualifier", 

323 "sense", 

324 "a", 

325 "accent", 

326 "l", 

327 "link", 

328 "lb", 

329 "lbl", 

330 "label", 

331 }: 

332 # Kludge: when these templates expand to /.../ or [...], 

333 # replace the expansion by something safe. This is used 

334 # to filter spurious IPA-looking expansions that aren't really 

335 # IPAs. We probably don't care about these templates in the 

336 # contexts where they expand to something containing these. 

337 v = re.sub(r'href="[^"]*"', "", text) # Ignore URLs 

338 v = re.sub(r'src="[^"]*"', "", v) 

339 if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v): 

340 # Note: replacing by empty results in Lua errors that we 

341 # would rather not have. For example, voi/Middle Vietnamese 

342 # uses {{a|{{l{{vi|...}}}}, and the {{a|...}} will fail 

343 # if {{l|...}} returns empty. 

344 return "stripped-by-parse_pron_post_template_fn" 

345 if name in ("IPA", "enPR"): 

346 # Extract the data from IPA and enPR templates (same underlying 

347 # template) and replace them in-text with magical cookie that 

348 # can be later used to refer to the data's index inside 

349 # pron_templates. 

350 if pron_t := extract_pron_template(wxr, name, ht, text):

351 pron_templates.append(pron_t) 

352 return f"__PRON_TEMPLATE_{len(pron_templates)-1}__" 

353 return text 

354 

355 def parse_expanded_zh_pron( 

356 node: WikiNode, 

357 parent_hdrs: list[str], 

358 specific_hdrs: list[str], 

359 unknown_header_tags: set[str], 

360 ) -> None: 

361 def generate_pron( 

362 v, new_parent_hdrs: list[str], new_specific_hdrs: list[str] 

363 ) -> Optional[SoundData]: 

364 pron: SoundData = {} 

365 pron["tags"] = [] 

366 pron["zh-pron"] = v.strip() 

367 for hdr in new_parent_hdrs + new_specific_hdrs: 

368 hdr = hdr.strip() 

369 valid_hdr = re.sub(r"\s+", "-", hdr) 

370 if hdr in ZH_PRON_TAGS: 

371 for tag in ZH_PRON_TAGS[hdr]: 

372 if tag not in pron["tags"]: 

373 pron["tags"].append(tag) 

374 elif valid_hdr in valid_tags: 

375 if valid_hdr not in pron["tags"]: 

376 pron["tags"].append(valid_hdr) 

377 else: 

378 unknown_header_tags.add(hdr) 

379 # convert into normal IPA format if has the IPA flag 

380 if "IPA" in pron["tags"]: 

381 pron["ipa"] = v 

382 del pron["zh-pron"] 

383 pron["tags"].remove("IPA") 

384 # convert into IPA but retain the Sinological-IPA tag 

385 elif "Sinological-IPA" in pron["tags"]: 

386 pron["ipa"] = v 

387 del pron["zh-pron"] 

388 

389 if not (pron.get("zh-pron") or pron.get("ipa")): 

390 return None 

391 return pron 

392 

393 if isinstance(node, list): 

394 for item in node: 

395 parse_expanded_zh_pron( 

396 item, parent_hdrs, specific_hdrs, unknown_header_tags 

397 ) 

398 return 

399 if not isinstance(node, WikiNode): 

400 return 

401 if node.kind != NodeKind.LIST: 

402 for item in node.children: 

403 parse_expanded_zh_pron( 

404 item, parent_hdrs, specific_hdrs, unknown_header_tags 

405 ) 

406 return 

407 for item in node.children: 

408 assert isinstance(item, WikiNode) 

409 assert item.kind == NodeKind.LIST_ITEM 

410 base_item = list( 

411 x 

412 for x in item.children 

413 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

414 ) 

415 text = clean_node(wxr, None, base_item) 

416 # print(f"{parent_hdrs} zhpron: {text}") # XXX remove me 

417 text = re.sub(r"(?s)\(Note:.*?\)", "", text) 

418 # Kludge to clean up text like 

419 # '(Standard Chinese, erhua-ed) (旋兒/旋儿)' where 

420 # the hanzi are examples 

421 hanzi_m = re.match(r"\s*(\([^()]*\))\s*\(([^()]*)\)\s*$", text) 

422 if hanzi_m: 

423 if re.search("[\u4e00-\u9fff]", hanzi_m.group(2)): 

424 text = hanzi_m.group(1) 
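# A hedged illustration using the example from the comment above: for
#   text == "(Standard Chinese, erhua-ed) (旋兒/旋儿)"
# hanzi_m.group(1) == "(Standard Chinese, erhua-ed)" and
# hanzi_m.group(2) == "旋兒/旋儿"; because group(2) contains hanzi,
# text is reduced to just "(Standard Chinese, erhua-ed)".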

425 new_parent_hdrs = list(parent_hdrs) 

426 new_specific_hdrs = list(specific_hdrs) 

427 # look no further, here be dragons... 

428 

429 if ": " in text or "：" in text:

430 parts = re.split(r": |：", text)

431 m = re.match( 

432 r"\s*\((([^():]+)\s*(:|：)?\s*([^():]*))\)\s*$", text

433 ) 

434 # Matches lines with stuff like "(Hokkien: Xiamen, Quanzhou)" 

435 # thrown into new_parent_hdrs 

436 if m: 

437 new_parent_hdrs.append(m.group(2).strip()) 

438 for hdr in m.group(4).split(","): 

439 new_specific_hdrs.append(hdr.strip()) 

440 else: 

441 # if "Zhangzhou" in text: 

442 # print("\nFOUND IN:", text, "\n") 

443 # print("PARTS: ", repr(parts)) 

444 # print(f" PARTS: {parts}") 

445 extra_tags = parts[0] 

446 # Kludge to handle how (Hokkien: Locations) and 

447 # IPA (Specific Location) interact; this is why 

448 # specific_hdrs was introduced to the soup, just 

449 # to specify which are actual hierarchical higher 

450 # level tags (Min'nan, Hokkien, etc.) which should 

451 # always be present and then use specific_hdrs 

452 # for that list of misc sublocations and subdialects 

453 # that can be overridden by more specific stuff 

454 # later. 

455 m = re.match(r"\s*IPA\s*\((.*)\)\s*$", extra_tags) 

456 if m: 

457 new_parent_hdrs.append("IPA") 

458 new_specific_hdrs = [ 

459 s.strip() for s in m.group(1).split(",") 

460 ] 

461 extra_tags = extra_tags[m.end() :] 

462 

463 m = re.match(r"\s*\([^()]*,[^()]*\)\s*$", extra_tags) 

464 if m: 

465 extra_tags = extra_tags.strip()[1:-1] # remove parens 

466 new_parent_hdrs.extend( 

467 s.strip() for s in extra_tags.split(",") 

468 ) 

469 elif extra_tags: 

470 new_parent_hdrs.append(extra_tags) 

471 

472 v = ":".join(parts[1:]) 

473 

474 # check for phrases 

475 if ("," in (wxr.wtp.title or "")) and len( 

476 v.split(" ") 

477 ) + v.count(",") == len(wxr.wtp.title or ""): 

478 # This just captures exact matches where you have 

479 # the pronunciation of the whole phrase and nothing 

480 # else. Split on spaces, then because we're not 

481 # splitting next to a comma we need to add the 

482 # count of commas so that it synchs up with the 

483 # unicode string length of the original hanzi, 

484 # where the comma is a separate character (unlike 

485 # in the split list, where it's part of a space- 

486 # separated string, like "teo⁴,").

487 vals = [v] 

488 pron = generate_pron( 

489 v, new_parent_hdrs, new_specific_hdrs 

490 ) 

491 

492 if pron: 

493 pron["tags"] = list(sorted(pron["tags"])) 

494 if pron not in data.get("sounds", ()): 

495 data_append(data, "sounds", pron) 

496 elif "→" in v: 

497 vals = re.split("→", v) 

498 for v in vals: 

499 pron = generate_pron( 

500 v, new_parent_hdrs, new_specific_hdrs 

501 ) 

502 if pron: 

503 m = re.match( 

504 r"([^()]+)\s*\(toneless" 

505 r" final syllable variant\)\s*", 

506 v, 

507 ) 

508 if m: 

509 pron["zh-pron"] = m.group(1).strip() 

510 pron["tags"].append( 

511 "toneless-final-syllable-variant" 

512 ) 

513 

514 pron["tags"] = list(sorted(pron["tags"])) 

515 if pron not in data.get("sounds", ()): 

516 data_append(data, "sounds", pron) 

517 else: 

518 # split alternative pronunciations separated

519 # by "," or " / "

520 vals = re.split(r"\s*,\s*|\s+/\s+", v) 

521 new_vals = [] 

522 for v2 in vals: 

523 if v2.startswith("/") and v2.endswith("/"): 

524 # items like /kɛiŋ²¹³⁻⁵³ ^((Ø-))ŋuaŋ³³/ 

525 new_vals.append(v2) 

526 else: 

527 # split in parentheses otherwise 

528 new_vals.extend(re.split(r"[()]", v2)) 

529 vals = new_vals 

530 for v in vals: 

531 pron = generate_pron( 

532 v, new_parent_hdrs, new_specific_hdrs 

533 ) 

534 if pron: 

535 pron["tags"] = list(sorted(pron["tags"])) 

536 if pron not in data.get("sounds", ()): 

537 data_append(data, "sounds", pron) 

538 else: 

539 new_parent_hdrs.append(text) 

540 

541 for x in item.children: 

542 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST: 

543 parse_expanded_zh_pron( 

544 x, new_parent_hdrs, specific_hdrs, unknown_header_tags 

545 ) 

546 

547 def parse_chinese_pron( 

548 contents: Union[list[Union[WikiNode, str]], WikiNode, str], 

549 unknown_header_tags: set[str], 

550 ) -> None: 

551 if isinstance(contents, list): 

552 for item in contents: 

553 parse_chinese_pron(item, unknown_header_tags) 

554 return 

555 if not isinstance(contents, WikiNode): 

556 return 

557 if contents.kind != NodeKind.TEMPLATE: 

558 for item in contents.children: 

559 parse_chinese_pron(item, unknown_header_tags) 

560 return 

561 if ( 

562 len(contents.largs[0]) == 1 

563 and isinstance(contents.largs[0][0], str) 

564 and contents.largs[0][0].strip() == "zh-pron" 

565 ): 

566 src = wxr.wtp.node_to_wikitext(contents) 

567 expanded = wxr.wtp.expand(src, templates_to_expand={"zh-pron"}) 

568 parsed = wxr.wtp.parse(expanded) 

569 parse_expanded_zh_pron(parsed, [], [], unknown_header_tags) 

570 else: 

571 for item in contents.children: 

572 parse_chinese_pron(item, unknown_header_tags) 

573 return 

574 

575 if lang_code == "zh":

576 unknown_header_tags: set[str] = set() 

577 parse_chinese_pron(contents, unknown_header_tags) 

578 for hdr in unknown_header_tags: 

579 wxr.wtp.debug( 

580 f"Zh-pron header not found in zh_pron_tags or tags: " 

581 f"{repr(hdr)}", 

582 sortid="pronunciations/296/20230324", 

583 ) 

584 

585 def flattened_tree( 

586 lines: list[Union[WikiNode, str]], 

587 ) -> Iterator[Union[WikiNode, str]]: 

588 assert isinstance(lines, list) 

589 for line in lines: 

590 yield from flattened_tree1(line) 

591 

592 def flattened_tree1( 

593 node: Union[WikiNode, str], 

594 ) -> Iterator[Union[WikiNode, str]]: 

595 assert isinstance(node, (WikiNode, str)) 

596 if isinstance(node, str): 

597 yield node 

598 return 

599 elif node.kind == NodeKind.LIST: 

600 for item in node.children: 

601 yield from flattened_tree1(item) 

602 elif node.kind == NodeKind.LIST_ITEM:

603 new_children = [] 

604 sublist = None 

605 for child in node.children: 

606 if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:

607 sublist = child 

608 else: 

609 new_children.append(child) 

610 node.children = new_children 

611 node.sarg = "*" 

612 yield node 

613 if sublist:

614 yield from flattened_tree1(sublist) 

615 else: 

616 yield node 

617 

618 # XXX Do not use flattened_tree more than once here, for example for 

619 # debug printing... The underlying data is changed, and the separated 

620 # sublists disappear. 

621 

622 # Kludge for templates that generate several lines, but haven't 

623 # been caught by earlier kludges... 

624 def split_cleaned_node_on_newlines( 

625 contents: list[Union[WikiNode, str]], 

626 ) -> Iterator[str]: 

627 for litem in flattened_tree(contents): 

628 ipa_text = clean_node( 

629 wxr, 

630 data, 

631 litem, 

632 template_fn=parse_pronunciation_template_fn, 

633 post_template_fn=parse_pron_post_template_fn, 

634 ) 

635 for line in ipa_text.splitlines(): 

636 yield line 

637 

638 # have_pronunciations = False 

639 active_pos: Optional[str] = None 

640 

641 for line in split_cleaned_node_on_newlines(contents): 

642 # print(f"{line=}") 

643 prefix: Optional[str] = None 

644 earlier_base_data: Optional[SoundData] = None 

645 if not line:

646 continue 

647 

648 split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line) 
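# A hedged illustration of the split (line contents invented):
#   re.split(r"__PRON_TEMPLATE_(\d+)__", "* __PRON_TEMPLATE_0__, Homophone: foo")
#   -> ["* ", "0", ", Homophone: foo"]
# Odd-indexed entries are the captured pron_templates indices; the rest is
# the surrounding text handled further below.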

649 for i, text in enumerate(split_templates): 

650 if not text: 

651 continue 

652 # clean up stars ("*") at the start of the line

653 text = re.sub(r"^\**\s*", "", text).strip() 

654 if i == 0: 

655 # At the start of a line, check for stuff like "Noun:" 

656 # for active_pos; active_pos is a temporary data field 

657 # given to each saved SoundData entry which is later 

658 # used to sort the entries into their respective PoSes. 

659 m = re.match(r"\s*(\w+\s?\w*)\s*:?\s*", text) 

660 if m: 

661 if (m_lower := m.group(1).lower()) in part_of_speech_map: 

662 active_pos = part_of_speech_map[m_lower]["pos"] 

663 text = text[m.end() :].strip() 

664 if not text: 

665 continue 

666 if i % 2 == 1: 

667 # re.split (with capture groups) splits the line so that every

668 # odd-indexed entry (i % 2 == 1) is a captured splitter; the other

669 # entries are empty strings or the text around the splitters.

670 base_pron_data, first_prons = pron_templates[int(text)] 

671 if base_pron_data: 

672 earlier_base_data = base_pron_data 

673 # print(f"Set {earlier_base_data=}") 

674 elif earlier_base_data is not None:

675 # merge data from an earlier iteration of this loop 

676 for pr in first_prons: 

677 if "note" in pr and "note" in earlier_base_data:

678 pr["note"] += ";" + earlier_base_data.get( 

679 "note", "" 

680 ) 

681 elif "note" in earlier_base_data:

682 pr["note"] = earlier_base_data["note"] 

683 if "topics" in earlier_base_data:

684 data_extend( 

685 pr, "topics", earlier_base_data["topics"] 

686 ) 

687 if "tags" in pr and "tags" in earlier_base_data:

688 pr["tags"].extend(earlier_base_data["tags"]) 

689 elif "tags" in earlier_base_data:

690 pr["tags"] = sorted(set(earlier_base_data["tags"])) 

691 for pr in first_prons: 

692 if active_pos: 

693 pr["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

694 if pr not in data.get("sounds", ()):

695 data_append(data, "sounds", pr) 

696 # This bit is handled 

697 continue 

698 

699 if "IPA" in text: 

700 field = "ipa" 

701 else: 

702 # This is used for Rhymes, Homophones, etc 

703 field = "other" 

704 

705 # Check if it contains Japanese "Tokyo" pronunciation with 

706 # special syntax 

707 m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text) 

708 if m:

709 pron: SoundData = {field: m.group(1)} # type: ignore[misc] 

710 if active_pos: 

711 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

712 data_append(data, "sounds", pron) 

713 # have_pronunciations = True 

714 continue 

715 

716 # Check if it contains Rhymes 

717 m = re.match(r"\s*Rhymes?: (.*)", text) 

718 if m: 

719 for ending in split_at_comma_semi(m.group(1)): 

720 ending = ending.strip() 

721 if ending:

722 pron = {"rhymes": ending} 

723 if active_pos:

724 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

725 data_append(data, "sounds", pron) 

726 # have_pronunciations = True 

727 continue 

728 

729 # Check if it contains homophones 

730 m = re.search(r"(?m)\bHomophones?: (.*)", text) 

731 if m: 

732 for w in split_at_comma_semi(m.group(1)): 

733 w = w.strip() 

734 if w:

735 pron = {"homophone": w} 

736 if active_pos:

737 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

738 data_append(data, "sounds", pron) 

739 # have_pronunciations = True 

740 continue 

741 

742 # Check if it contains Phonetic hangeul 

743 m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text) 

744 if m:

745 seen = set() 

746 for w in m.group(1).split("/"): 

747 w = w.strip() 

748 if w and w not in seen: 

749 seen.add(w) 

750 pron = {"hangeul": w} 

751 if active_pos: 

752 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

753 data_append(data, "sounds", pron) 

754 # have_pronunciations = True 

755 

756 m = re.search(r"\b(Syllabification|Hyphenation): ([^\s,]*)", text) 

757 if m:

758 data_append(data, "hyphenation", m.group(2)) 

759 # have_pronunciations = True 

760 

761 # See if it contains a word prefix restricting which forms the 

762 # pronunciation applies to (see amica/Latin) and/or parenthesized 

763 # tags. 

764 m = re.match( 

765 r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text 

766 ) 
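# A hedged illustration (the word and tags are invented, loosely modelled on
# the amica/Latin case mentioned above): for
#   text == "amīca: (Classical Latin) /aˈmiː.ka/"
# the regexp gives prefix == "amīca" (group 2) and tagstext == "Classical Latin"
# (group 3); the remaining "/aˈmiː.ka/" is left in text and handled below.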

767 if m: 

768 prefix = m.group(2) or "" 

769 tagstext = m.group(3) 

770 text = text[m.end() :] 

771 else: 

772 m = re.match(r"^[*#\s]*([-\w]+):\s+", text) 

773 if m:

774 prefix = m.group(1) 

775 tagstext = "" 

776 text = text[m.end() :] 

777 else: 

778 # Spanish has tags before pronunciations, eg. aceite/Spanish 

779 m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text) 

780 if m:

781 tagstext = m.group(1) 

782 text = m.group(2) 

783 else: 

784 # No prefix. In this case, we inherit prefix 

785 # from previous entry. This particularly 

786 # applies for nested Audio files. 

787 tagstext = "" 

788 if tagstext: 

789 earlier_base_data = {} 

790 parse_pronunciation_tags(wxr, tagstext, earlier_base_data) 

791 

792 # Find romanizations from the pronunciation section (routinely 

793 # produced for Korean by {{ko-IPA}}) 

794 for m in re.finditer(pron_romanization_re, text):

795 prefix = m.group(1) 

796 w = m.group(2).strip() 

797 tag = pron_romanizations[prefix] 

798 form = {"form": w, "tags": tag.split()} 

799 data_append(data, "forms", form) 

800 

801 # Find IPA pronunciations 
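# Hedged examples (strings invented): the pattern below matches "/fuː/" and
# "[fʊ]" but not "[1]" (bracketed digits are excluded by the first character
# class). It can still match file links like "[[File:...]]", which is why
# those are skipped explicitly below.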

802 for m in re.finditer( 

803 r"(?m)/[^][\n/,]+?/" r"|" r"\[[^]\n0-9,/][^],/]*?\]", text 

804 ): 

805 v = m.group(0) 

806 # The regexp above can match file links. Skip them. 

807 if v.startswith("[[File:"):

808 continue 

809 if v == "/wiki.local/":

810 continue 

811 if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text:

812 m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text) 

813 assert m 

814 idx = int(m.group(1)) 

815 if not audios[idx].get("audio-ipa"): 

816 audios[idx]["audio-ipa"] = v 

817 if prefix: 

818 audios[idx]["form"] = prefix 

819 else: 

820 if earlier_base_data:

821 pron = deepcopy(earlier_base_data) 

822 pron[field] = v 

823 else: 

824 pron = {field: v} # type: ignore[misc] 

825 if active_pos:

826 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

827 if prefix:

828 pron["form"] = prefix 

829 if active_pos:

830 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

831 data_append(data, "sounds", pron) 

832 # have_pronunciations = True 

833 

834 # XXX what about {{hyphenation|...}}, {{hyph|...}} 

835 # and those used to be stored under "hyphenation" 

836 

837 # Add data that was collected in template_fn 

838 for audio in audios: 

839 if "audio" in audio:

840 # Compute audio file URLs 

841 fn = audio["audio"] 

842 # Strip certain characters, e.g., left-to-right mark 

843 fn = re.sub(r"[\u200f\u200e]", "", fn) 

844 fn = fn.strip() 

845 fn = urllib.parse.unquote(fn) 

846 # First character is usually uppercased 

847 if re.match(r"^[a-z][a-z]+", fn): 

848 fn = fn[0].upper() + fn[1:] 

849 if fn in wxr.config.redirects:

850 fn = wxr.config.redirects[fn] 

851 # File extension is lowercased 

852 # XXX some words seem to need this, some don't seem to 

853 # have this??? what is the exact rule? 

854 # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn) 

855 # Spaces are converted to underscores 

856 fn = re.sub(r"\s+", "_", fn) 

857 # Compute hash digest part 

858 h = hashlib.md5() 

859 hname = fn.encode("utf-8") 

860 h.update(hname) 

861 digest = h.hexdigest() 

862 # Quote filename for URL 

863 qfn = urllib.parse.quote(fn) 

864 # For safety when writing files 

865 qfn = qfn.replace("/", "__slash__") 

866 if re.search(r"(?i)\.(ogg|oga)$", fn): 

867 ogg = ( 

868 "https://upload.wikimedia.org/wikipedia/" 

869 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

870 ) 

871 else: 

872 ogg = ( 

873 "https://upload.wikimedia.org/wikipedia/" 

874 "commons/transcoded/" 

875 "{}/{}/{}/{}.ogg".format( 

876 digest[:1], digest[:2], qfn, qfn 

877 ) 

878 ) 

879 if re.search(r"(?i)\.(mp3)$", fn):

880 mp3 = ( 

881 "https://upload.wikimedia.org/wikipedia/" 

882 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

883 ) 

884 else: 

885 mp3 = ( 

886 "https://upload.wikimedia.org/wikipedia/" 

887 "commons/transcoded/" 

888 "{}/{}/{}/{}.mp3".format( 

889 digest[:1], digest[:2], qfn, qfn 

890 ) 

891 ) 

892 audio["ogg_url"] = ogg 

893 audio["mp3_url"] = mp3 

894 if active_pos:

895 audio["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

896 if audio not in data.get("sounds", ()):

897 data_append(data, "sounds", audio) 
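# A hedged illustration of the resulting URLs (the filename is invented and
# "a"/"ab" stand for the first one and two hex digits of the MD5 digest, not
# real values): for fn == "En-us-example.ogg"
#   ogg_url -> https://upload.wikimedia.org/wikipedia/commons/a/ab/En-us-example.ogg
#   mp3_url -> https://upload.wikimedia.org/wikipedia/commons/transcoded/a/ab/
#              En-us-example.ogg/En-us-example.ogg.mp3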

898 # if audios: 

899 # have_pronunciations = True 

900 audios = [] 

901 

902 ## I have commented out the otherwise unused have_pronunciations

903 ## toggles; uncomment them to use this debug print 

904 # if not have_pronunciations and not have_panel_templates: 

905 # wxr.wtp.debug("no pronunciations found from pronunciation section", 

906 # sortid="pronunciations/533")