Coverage for src/wiktextract/extractor/en/pronunciation.py: 80%

633 statements  

coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import hashlib 

2import re 

3import urllib 

4from copy import deepcopy 

5from dataclasses import dataclass 

6from typing import Iterator 

7 

8from wikitextprocessor import ( 

9 HTMLNode, 

10 LevelNode, 

11 NodeKind, 

12 TemplateNode, 

13 WikiNode, 

14) 

15 

16from ...clean import clean_value 

17from ...datautils import data_append, data_extend, split_at_comma_semi 

18from ...page import LEVEL_KINDS, clean_node, is_panel_template 

19from ...tags import valid_tags 

20from ...wxr_context import WiktextractContext 

21from ..share import create_audio_url_dict 

22from .form_descriptions import ( 

23 classify_desc, 

24 decode_tags, 

25 parse_pronunciation_tags, 

26) 

27from .parts_of_speech import part_of_speech_map 

28from .type_utils import Hyphenation, SoundData, TemplateArgs, WordData 

29 

30# Prefixes, tags, and regexp for finding romanizations from the pronunciation

31# section

32pron_romanizations = { 

33 " Revised Romanization ": "romanization revised", 

34 " Revised Romanization (translit.) ": "romanization revised transliteration", 

35 " McCune-Reischauer ": "McCune-Reischauer romanization", 

36 " McCune–Reischauer ": "McCune-Reischauer romanization", 

37 " Yale Romanization ": "Yale romanization", 

38} 

39pron_romanization_re = re.compile( 

40 "(?m)^(" 

41 + "|".join( 

42 re.escape(x) 

43 for x in sorted(pron_romanizations.keys(), key=len, reverse=True) 

44 ) 

45 + ")([^\n]+)" 

46) 

47 

48IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$" 

49IPA_EXTRACT_RE = re.compile(IPA_EXTRACT) 
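# (Editor's illustration.) For a hypothetical cleaned line
# "(UK) IPA⁽ᵏᵉʸ⁾: /təˈmɑːtəʊ/ (formal)" the regex captures:
#   group(2) = "UK"                      line-level qualifier
#   group(4) = "/təˈmɑːtəʊ/ (formal)"    body including trailing parens
#   group(5) = "/təˈmɑːtəʊ/"             body without trailing parens
#   group(7) = "formal"                  trailing parenthesized qualifier
# extract_pron_template() below chooses group 4 or groups 5 and 7
# depending on whether the template had a qq= argument.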

50 

51 

52def extract_pron_template( 

53 wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str 

54) -> tuple[SoundData, list[SoundData]] | None: 

55 """In post_template_fn, this is used to handle all enPR and IPA templates 

56 so that we can leave breadcrumbs in the text that can later be handled 

57 there. We return a `base_data` so that if there are two 

58 or more templates on the same line, like this: 

59 (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/ 

60 then we can apply base_data fields to other templates, too, if needed. 

61 """ 

62 cleaned = clean_value(wxr, expanded) 

63 # print(f"extract_pron_template input: {tname=} {expanded=}-> {cleaned=}") 

64 m = IPA_EXTRACT_RE.match(cleaned) 

65 if not m: 

66 wxr.wtp.error( 

67 f"Text cannot match IPA_EXTRACT_RE regex: " 

68 f"{cleaned=}, {tname=}, {targs=}", 

69 sortid="en/pronunciation/54", 

70 ) 

71 return None 

72 # for i, group in enumerate(m.groups()): 

73 # print(i + 1, repr(group)) 

74 main_qual = m.group(2) or "" 

75 if "qq" in targs: 

76 # The template has a qq= qualifier that applies to every entry but is

77 # rendered at the end of the line, where it can be confused with the

78 # post-qualifier of a single entry: in "... /ipa3/ (foo) (bar)", foo

79 # might be absent, making bar look as if it applied only to /ipa3/.

80 # So split the trailing parenthesized text off as a whole-line

81 # qualifier.

82 pron_body = m.group(5) 

83 post_qual = m.group(7) 

84 else: 

85 pron_body = m.group(4) 

86 post_qual = "" 

87 

88 if not pron_body:  # 88 ↛ 89 (condition was never true)

89 wxr.wtp.error( 

90 f"Regex failed to find 'body' from {cleaned=}", 

91 sortid="en/pronunciation/81", 

92 ) 

93 return None 

94 

95 base_data: SoundData = {} 

96 if main_qual: 

97 parse_pronunciation_tags(wxr, main_qual, base_data) 

98 if post_qual: 

99 parse_pronunciation_tags(wxr, post_qual, base_data) 

100 # This base_data is used as the base copy for all entries from this 

101 # template, but it is also returned so that its contents may be applied 

102 # to other templates on the same line. 

103 # print(f"{base_data=}") 

104 

105 sound_datas: list[SoundData] = [] 

106 

107 parts: list[list[str]] = [[]] 

108 inside = 0 

109 current: list[str] = [] 

110 for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)): 

111 # Split the line on commas and semicolons outside of parens. This 

112 # gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)" 

113 # print(f" {i=}, {p=}") 

114 comp = p.strip() 

115 if not p: 

116 continue 

117 if comp == "(": 

118 if not inside and i > 0:  # 118 ↛ 121 (condition was always true)

119 if stripped := "".join(current).strip(): 

120 parts[-1].append("".join(current).strip()) # type:ignore[arg-type] 

121 current = [p] 

122 inside += 1 

123 continue 

124 if comp == ")": 

125 inside -= 1 

126 if not inside:  # 126 ↛ 131 (condition was always true)

127 if stripped := "".join(current).strip():  # 127 ↛ 131 (condition was always true)

128 current.append(p) 

129 parts[-1].append("".join(current).strip()) # type:ignore[arg-type] 

130 current = [] 

131 continue 

132 if not inside and comp in (",", ";"): 

133 if stripped := "".join(current).strip(): 

134 parts[-1].append(stripped) # type:ignore[arg-type] 

135 current = [] 

136 parts.append([]) 

137 continue 

138 current.append(p) 

139 if current: 

140 parts[-1].append("".join(current).strip()) 
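# (Editor's illustration.) For pron_body = "(UK) /fuː/, /fʊ/ (rare)" the
# loop above yields parts = [["(UK)", "/fuː/"], ["/fʊ/", "(rare)"]]; the
# normalization below reshapes each entry into
# [main qualifier, body, post-qualifier]:
# ["UK", "/fuː/", ""] and ["", "/fʊ/", "rare"].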

141 

142 # print(f">>>>>> {parts=}") 

143 new_parts: list[list[str]] = [] 

144 for entry in parts: 

145 if not entry:  # 145 ↛ 146 (condition was never true)

146 continue 

147 new_entry: list[str] = [] 

148 i1: int = entry[0].startswith("(") and entry[0].endswith(")") 

149 if i1: 

150 new_entry.append(entry[0][1:-1].strip()) 

151 else: 

152 new_entry.append("") 

153 i2: int = ( 

154 entry[-1].startswith("(") 

155 and entry[-1].endswith(")") 

156 and len(entry) > 1 

157 ) 

158 if i2 == 0: 

159 i2 = len(entry) 

160 else: 

161 i2 = -1 

162 new_entry.append("".join(entry[i1:i2]).strip()) 

163 if not new_entry[-1]:  # 163 ↛ 164 (condition was never true)

164 wxr.wtp.error(

165 f"Missing IPA/enPR sound data between qualifiers? {entry=}",

166 sortid="en/pronunciation/153", 

167 ) 

168 if i2 == -1: 

169 new_entry.append(entry[-1][1:-1].strip()) 

170 else: 

171 new_entry.append("") 

172 new_parts.append(new_entry) 

173 

174 # print(f">>>>> {new_parts=}") 

175 

176 for part in new_parts: 

177 sd = deepcopy(base_data) 

178 if part[0]: 

179 parse_pronunciation_tags(wxr, part[0], sd) 

180 if part[2]: 

181 parse_pronunciation_tags(wxr, part[2], sd) 

182 if tname == "enPR": 

183 sd["enpr"] = part[1] 

184 else: 

185 sd["ipa"] = part[1] 

186 sound_datas.append(sd) 

187 

188 # print(f"BASE_DATA: {base_data}") 

189 # print(f"SOUND_DATAS: {sound_datas=}") 

190 

191 return base_data, sound_datas 
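# (Editor's sketch of the returned shape, assuming "rare" decodes to a
# valid tag.) For tname="IPA" and a line cleaning to
# "IPA⁽ᵏᵉʸ⁾: /fuː/, (rare) /fʊ/" the function returns roughly:
#   base_data   == {}
#   sound_datas == [{"ipa": "/fuː/"}, {"ipa": "/fʊ/", "tags": ["rare"]}]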

192 

193 

194def parse_pronunciation( 

195 wxr: WiktextractContext, 

196 level_node: LevelNode, 

197 data: WordData, 

198 etym_data: WordData, 

199 have_etym: bool, 

200 base_data: WordData, 

201 lang_code: str, 

202) -> None: 

203 """Parses the pronunciation section from a language section on a 

204 page.""" 

205 if level_node.kind in LEVEL_KINDS:  # 205 ↛ 218 (condition was always true)

206 contents = [] 

207 for node in level_node.children: 

208 if isinstance(node, TemplateNode): 

209 if node.template_name == "th-pron": 

210 extract_th_pron_template(wxr, data, node) 

211 elif node.template_name == "zh-pron": 

212 extract_zh_pron_template(wxr, data, node) 

213 else: 

214 contents.append(node) 

215 else: 

216 contents.append(node) 

217 else: 

218 contents = [level_node] 

219 # Remove subsections, such as Usage notes. They may contain IPAchar 

220 # templates in running text, and we do not want to extract IPAs from 

221 # those. 

222 # Keep only non-subsection content: let non-WikiNodes through, and

223 # let WikiNodes through only when they are not LEVEL_KINDS (the 'or'

224 # below short-circuits on the isinstance check).

225 contents = [ 

226 x 

227 for x in contents 

228 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

229 ] 

230 if not any( 

231 isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents 

232 ): 

233 # expand all templates 

234 new_contents: list[str | WikiNode] = [] 

235 for lst in contents: 

236 if isinstance(lst, TemplateNode): 

237 temp = wxr.wtp.node_to_wikitext(lst) 

238 temp = wxr.wtp.expand(temp) 

239 temp_parsed = wxr.wtp.parse(temp) 

240 new_contents.extend(temp_parsed.children) 

241 else: 

242 new_contents.append(lst) 

243 contents = new_contents 

244 

245 if have_etym and data is base_data:  # 245 ↛ 246 (condition was never true)

246 data = etym_data 

247 pron_templates: list[tuple[SoundData, list[SoundData]]] = [] 

248 hyphenations: list[Hyphenation] = [] 

249 audios = [] 

250 have_panel_templates = False 

251 

252 def parse_pronunciation_template_fn( 

253 name: str, ht: TemplateArgs 

254 ) -> str | None: 

255 """Handle pronunciation and hyphenation templates""" 

256 # _template_fn handles templates *before* they are expanded; 

257 # this allows for special handling before all the work needed 

258 # for expansion is done. 

259 nonlocal have_panel_templates 

260 if is_panel_template(wxr, name): 

261 have_panel_templates = True 

262 return "" 

263 if name == "audio": 

264 filename = ht.get(2) or "" 

265 desc = ht.get(3) or "" 

266 desc = clean_node(wxr, None, [desc]) 

267 audio: SoundData = {"audio": filename.strip()} 

268 if desc:  # 268 ↛ 269 (condition was never true)

269 audio["text"] = desc 

270 m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc) 

271 skip = False 

272 if m:  # 272 ↛ 273 (condition was never true)

273 par = m.group(1) 

274 cls = classify_desc(par) 

275 if cls == "tags": 

276 parse_pronunciation_tags(wxr, par, audio) 

277 else: 

278 skip = True 

279 if skip:  # 279 ↛ 280 (condition was never true)

280 return "" 

281 audios.append(audio) 

282 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

283 if name == "audio-IPA":  # 283 ↛ 284 (condition was never true)

284 filename = ht.get(2) or "" 

285 ipa = ht.get(3) or "" 

286 dial = ht.get("dial") 

287 audio = {"audio": filename.strip()} 

288 if dial: 

289 dial = clean_node(wxr, None, [dial]) 

290 audio["text"] = dial 

291 if ipa: 

292 audio["audio-ipa"] = ipa 

293 audios.append(audio) 

294 # The problem with these IPAs is that they often just describe 

295 # what's in the sound file, rather than giving the pronunciation 

296 # of the word alone. It is common for audio files to contain 

297 # multiple pronunciations or articles in the same file, and then 

298 # this IPA often describes what is in the file. 

299 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

300 if name == "audio-pron": 

301 filename = ht.get(2) or "" 

302 ipa = ht.get("ipa") or "" 

303 dial = ht.get("dial") 

304 country = ht.get("country") 

305 audio = {"audio": filename.strip()} 

306 if dial:  # 306 ↛ 310 (condition was always true)

307 dial = clean_node(wxr, None, [dial])

308 audio["text"] = dial

309 parse_pronunciation_tags(wxr, dial, audio)

310 if country:  # 310 ↛ 312 (condition was always true)

311 parse_pronunciation_tags(wxr, country, audio)

312 if ipa:  # 312 ↛ 314 (condition was always true)

313 audio["audio-ipa"] = ipa 

314 audios.append(audio) 

315 # XXX do we really want to extract pronunciations from these? 

316 # Or are they spurious / just describing what is in the 

317 # audio file? 

318 # if ipa: 

319 # pron = {"ipa": ipa} 

320 # if dial: 

321 # parse_pronunciation_tags(wxr, dial, pron) 

322 # if country: 

323 # parse_pronunciation_tags(wxr, country, pron) 

324 # data_append(data, "sounds", pron) 

325 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

326 if name in ("hyph", "hyphenation"): 

327 # {{hyph|en|re|late|caption="Hyphenation UK:"}} 

328 # {{hyphenation|it|quiè|to||qui|è|to||quié|to||qui|é|to}} 

329 # and also nocaption=1 
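# e.g. the {{hyphenation|it|...}} example above yields hyph_sequences =
# [["quiè", "to"], ["qui", "è", "to"], ["quié", "to"], ["qui", "é", "to"]]
# (an empty positional argument starts a new sequence)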

330 caption = clean_node(wxr, None, ht.get("caption", "")) 

331 tagsets, _ = decode_tags(caption) 

332 # flatten the tagsets into one; it would be really weird to have 

333 # several tagsets for a hyphenation caption 

334 tags = list(set(tag for tagset in tagsets for tag in tagset)) 

335 # We'll just ignore any errors from tags, it's not very important 

336 # for hyphenation 

337 tags = [tag for tag in tags if not tag.startswith("error")] 

338 hyph_sequences: list[list[str]] = [[]] 

339 for text in [ 

340 t for (k, t) in ht.items() if (isinstance(k, int) and k >= 2) 

341 ]: 

342 if not text: 

343 hyph_sequences.append([]) 

344 else: 

345 hyph_sequences[-1].append(clean_node(wxr, None, text)) 

346 for seq in hyph_sequences: 

347 hyphenations.append(Hyphenation(parts=seq, tags=tags)) 

348 return "" 

349 return None 

350 

351 def parse_pron_post_template_fn( 

352 name: str, ht: TemplateArgs, text: str 

353 ) -> str | None: 

354 # _post_template_fn handles templates *after* the work to expand 

355 # them has been done; this is exactly the same as _template_fn, 

356 # except with the additional expanded text as an input, and 

357 # possible side-effects from the expansion and recursion (like 

358 # calling other subtemplates that are handled in _template_fn).

359 if is_panel_template(wxr, name):  # 359 ↛ 360 (condition was never true)

360 return "" 

361 if name in { 

362 "q", 

363 "qualifier", 

364 "sense", 

365 "a", 

366 "accent", 

367 "l", 

368 "link", 

369 "lb", 

370 "lbl", 

371 "label", 

372 }: 

373 # Kludge: when these templates expand to /.../ or [...], 

374 # replace the expansion by something safe. This is used 

375 # to filter spurious IPA-looking expansions that aren't really 

376 # IPAs. We probably don't care about these templates in the 

377 # contexts where they expand to something containing these. 

378 v = re.sub(r'href="[^"]*"', "", text) # Ignore URLs 

379 v = re.sub(r'src="[^"]*"', "", v) 

380 if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v):  # 380 ↛ 386 (condition was always true)

381 # Note: replacing by empty results in Lua errors that we 

382 # would rather not have. For example, voi/Middle Vietnamese 

383 # uses {{a|{{l{{vi|...}}}}, and the {{a|...}} will fail 

384 # if {{l|...}} returns empty. 

385 return "stripped-by-parse_pron_post_template_fn" 

386 if name in ("IPA", "enPR"): 

387 # Extract the data from IPA and enPR templates (same underlying 

388 # template) and replace them in-text with magical cookie that 

389 # can be later used to refer to the data's index inside 

390 # pron_templates. 

391 if pron_t := extract_pron_template(wxr, name, ht, text): 

392 pron_templates.append(pron_t) 

393 return f"__PRON_TEMPLATE_{len(pron_templates) - 1}__" 

394 return text 
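# (Editor's note on the cookie round-trip.) A line that cleans to, say,
# "* (UK) __PRON_TEMPLATE_0__" keeps the cookie through clean_node(), and
# re.split(r"__PRON_TEMPLATE_(\d+)__", line) in the main loop below
# recovers index 0 into pron_templates.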

395 

396 def flattened_tree(lines: list[WikiNode | str]) -> Iterator[WikiNode | str]: 

397 assert isinstance(lines, list) 

398 for line in lines: 

399 yield from flattened_tree1(line) 

400 

401 def flattened_tree1(node: WikiNode | str) -> Iterator[WikiNode | str]: 

402 assert isinstance(node, (WikiNode, str)) 

403 if isinstance(node, str): 

404 yield node 

405 return 

406 elif node.kind == NodeKind.LIST: 

407 for item in node.children: 

408 yield from flattened_tree1(item) 

409 elif node.kind == NodeKind.LIST_ITEM: 

410 new_children = [] 

411 sublist = None 

412 for child in node.children: 

413 if isinstance(child, WikiNode) and child.kind == NodeKind.LIST: 

414 sublist = child 

415 else: 

416 new_children.append(child) 

417 node.children = new_children 

418 node.sarg = "*" 

419 yield node 

420 if sublist: 

421 yield from flattened_tree1(sublist) 

422 else: 

423 yield node 
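# (Editor's illustration.) Flattening rewrites a nested list tree such as
#   LIST(LIST_ITEM("a", LIST(LIST_ITEM("b"))))
# into the flat sequence LIST_ITEM("a"), LIST_ITEM("b"): each item's sarg
# is forced to "*" and its sublist detached, which is why the warning
# below says not to iterate flattened_tree() twice.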

424 

425 # XXX Do not use flattened_tree more than once here, for example for 

426 # debug printing... The underlying data is changed, and the separated 

427 # sublists disappear. 

428 

429 # Kludge for templates that generate several lines, but haven't 

430 # been caught by earlier kludges... 

431 def split_cleaned_node_on_newlines( 

432 contents: list[WikiNode | str], 

433 ) -> Iterator[str]: 

434 for litem in flattened_tree(contents): 

435 ipa_text = clean_node( 

436 wxr, 

437 data, 

438 litem, 

439 template_fn=parse_pronunciation_template_fn, 

440 post_template_fn=parse_pron_post_template_fn, 

441 ) 

442 for line in ipa_text.splitlines(): 

443 yield line 

444 

445 # have_pronunciations = False 

446 active_pos: str | None = None 

447 

448 for line in split_cleaned_node_on_newlines(contents): 

449 # print(f"{line=}") 

450 prefix: str | None = None 

451 earlier_base_data: SoundData | None = None 

452 if not line:  # 452 ↛ 453 (condition was never true)

453 continue 

454 

455 split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line) 

456 for i, text in enumerate(split_templates): 

457 if not text: 

458 continue 

459 # strip list-marker asterisks and whitespace at the start of the line

460 text = re.sub(r"^\**\s*", "", text).strip() 

461 if i == 0: 

462 # At the start of a line, check for stuff like "Noun:" 

463 # for active_pos; active_pos is a temporary data field 

464 # given to each saved SoundData entry which is later 

465 # used to sort the entries into their respective PoSes. 

466 m = re.match(r"\s*(\w+\s?\w*)\s*:?\s*", text) 

467 if m: 

468 if (m_lower := m.group(1).lower()) in part_of_speech_map: 

469 active_pos = part_of_speech_map[m_lower]["pos"] 

470 text = text[m.end() :].strip() 

471 if not text: 

472 continue 

473 if i % 2 == 1: 

474 # re.split with a capture group returns the captured splitters at

475 # odd indices; even indices hold the text around the splitters

476 # (possibly empty strings).

477 base_pron_data, first_prons = pron_templates[int(text)] 

478 if base_pron_data: 

479 earlier_base_data = base_pron_data 

480 # print(f"Set {earlier_base_data=}") 

481 elif earlier_base_data is not None: 

482 # merge data from an earlier iteration of this loop 

483 for pr in first_prons: 

484 if "note" in pr and "note" in earlier_base_data:  # 484 ↛ 485 (condition was never true)

485 pr["note"] += ";" + earlier_base_data.get(

486 "note", ""

487 )

488 elif "note" in earlier_base_data:  # 488 ↛ 489 (condition was never true)

489 pr["note"] = earlier_base_data["note"]

490 if "topics" in earlier_base_data:  # 490 ↛ 491 (condition was never true)

491 data_extend(

492 pr, "topics", earlier_base_data["topics"]

493 )

494 if "tags" in pr and "tags" in earlier_base_data:  # 494 ↛ 495 (condition was never true)

495 pr["tags"].extend(earlier_base_data["tags"])

496 elif "tags" in earlier_base_data:  # 496 ↛ 483 (condition was always true)

497 pr["tags"] = sorted(set(earlier_base_data["tags"])) 

498 for pr in first_prons: 

499 if active_pos: 

500 pr["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

501 if pr not in data.get("sounds", ()):  # 501 ↛ 498 (condition was always true)

502 data_append(data, "sounds", pr) 

503 # This bit is handled 

504 continue 

505 

506 if "IPA" in text: 

507 field = "ipa" 

508 else: 

509 # This is used for Rhymes, Homophones, etc.

510 field = "other" 

511 

512 # Check if it contains Japanese "Tokyo" pronunciation with 

513 # special syntax 

514 m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text) 

515 if m:  # 515 ↛ 516 (condition was never true)

516 pron: SoundData = {field: m.group(1)} # type: ignore[misc] 

517 if active_pos: 

518 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

519 data_append(data, "sounds", pron) 

520 # have_pronunciations = True 

521 continue 

522 

523 # Check if it contains Rhymes 

524 m = re.match(r"\s*Rhymes?: (.*)", text) 

525 if m: 

526 for ending in split_at_comma_semi(m.group(1)): 

527 ending = ending.strip() 

528 if ending:  # 528 ↛ 526 (condition was always true)

529 pron = {"rhymes": ending} 

530 if active_pos: 

531 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

532 data_append(data, "sounds", pron) 

533 # have_pronunciations = True 

534 continue 

535 

536 # Check if it contains homophones 

537 m = re.search(r"(?m)\bHomophones?: (.*)", text) 

538 if m: 

539 for w in split_at_comma_semi(m.group(1)): 

540 w = w.strip() 

541 if w:  # 541 ↛ 539 (condition was always true)

542 pron = {"homophone": w} 

543 if active_pos: 

544 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

545 data_append(data, "sounds", pron) 

546 # have_pronunciations = True 

547 continue 

548 

549 # Check if it contains Phonetic hangeul 

550 m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text) 

551 if m:  # 551 ↛ 552 (condition was never true)

552 seen = set() 

553 for w in m.group(1).split("/"): 

554 w = w.strip() 

555 if w and w not in seen: 

556 seen.add(w) 

557 pron = {"hangeul": w} 

558 if active_pos: 

559 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

560 data_append(data, "sounds", pron) 

561 # have_pronunciations = True 

562 

563 # This regex-based hyphenation detection is left as a backup

564 m = re.search(r"\b(Syllabification|Hyphenation): *([^\n.]*)", text) 

565 if m: 

566 data_append(data, "hyphenation", m.group(2)) 

567 commaseparated = m.group(2).split(",") 

568 if len(commaseparated) > 1:  # 568 ↛ 579 (condition was always true)

569 for h in commaseparated: 

570 # That second character looks like a dash, but it's

571 # actually U+2027 (decimal 8231), the hyphenation point.

572 # Add more delimiters here if needed.

573 parts = re.split(r"-|‧", h.strip()) 

574 data_append( 

575 data, "hyphenations", Hyphenation(parts=parts) 

576 ) 

577 ... 

578 else: 

579 data_append( 

580 data, 

581 "hyphenations", 

582 Hyphenation(parts=m.group(2).split(sep="-")), 

583 ) 

584 # have_pronunciations = True 

585 

586 # See if it contains a word prefix restricting which forms the 

587 # pronunciation applies to (see amica/Latin) and/or parenthesized 

588 # tags. 

589 m = re.match( 

590 r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text 

591 ) 

592 if m: 

593 prefix = m.group(2) or "" 

594 tagstext = m.group(3) 

595 text = text[m.end() :] 

596 else: 

597 m = re.match(r"^[*#\s]*([-\w]+):\s+", text) 

598 if m: 

599 prefix = m.group(1) 

600 tagstext = "" 

601 text = text[m.end() :] 

602 else: 

603 # Spanish has tags before pronunciations, e.g. aceite/Spanish

604 m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text)

605 if m:  # 605 ↛ 606 (condition was never true)

606 tagstext = m.group(1) 

607 text = m.group(2) 

608 else: 

609 # No prefix. In this case, we inherit prefix 

610 # from previous entry. This particularly 

611 # applies for nested Audio files. 

612 tagstext = "" 

613 if tagstext: 

614 earlier_base_data = {} 

615 parse_pronunciation_tags(wxr, tagstext, earlier_base_data) 

616 

617 # Find romanizations from the pronunciation section (routinely 

618 # produced for Korean by {{ko-IPA}}) 

619 for m in re.finditer(pron_romanization_re, text):  # 619 ↛ 620 (loop never started)

620 prefix = m.group(1) 

621 w = m.group(2).strip() 

622 tag = pron_romanizations[prefix] 

623 form = {"form": w, "tags": tag.split()} 

624 data_append(data, "forms", form) 

625 

626 # Find IPA pronunciations 

627 for m in re.finditer( 

628 r"(?m)/[^][\n/,]+?/" r"|" r"\[[^]\n0-9,/][^],/]*?\]", text 

629 ): 

630 v = m.group(0) 

631 # The regexp above can match file links. Skip them. 

632 if v.startswith("[[File:"):  # 632 ↛ 633 (condition was never true)

633 continue

634 if v == "/wiki.local/":  # 634 ↛ 635 (condition was never true)

635 continue

636 if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text:  # 636 ↛ 637 (condition was never true)

637 m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text) 

638 assert m 

639 idx = int(m.group(1)) 

640 if idx >= len(audios): 

641 continue 

642 if not audios[idx].get("audio-ipa"): 

643 audios[idx]["audio-ipa"] = v 

644 if prefix: 

645 audios[idx]["form"] = prefix 

646 else: 

647 if earlier_base_data: 

648 pron = deepcopy(earlier_base_data) 

649 pron[field] = v 

650 else: 

651 pron = {field: v} # type: ignore[misc] 

652 if active_pos: 

653 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

654 if prefix: 

655 pron["form"] = prefix 

656 if active_pos: 

657 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

658 data_append(data, "sounds", pron) 

659 # have_pronunciations = True 

660 

661 # XXX what about {{hyphenation|...}} and {{hyph|...}}?

662 # Those used to be stored under "hyphenation".

663 

664 # Add data that was collected in template_fn 

665 for audio in audios: 

666 if "audio" in audio:  # 666 ↛ 723 (condition was always true)

667 # Compute audio file URLs 

668 fn = audio["audio"] 

669 # Strip certain characters, e.g., left-to-right mark 

670 fn = re.sub(r"[\u200f\u200e]", "", fn) 

671 fn = fn.strip() 

672 fn = urllib.parse.unquote(fn) 

673 # First character is usually uppercased 

674 if re.match(r"^[a-z][a-z]+", fn): 

675 fn = fn[0].upper() + fn[1:] 

676 if fn in wxr.config.redirects:  # 676 ↛ 677 (condition was never true)

677 fn = wxr.config.redirects[fn] 

678 # File extension is lowercased 

679 # XXX some words seem to need this, some don't seem to 

680 # have this??? what is the exact rule? 

681 # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn) 

682 # Spaces are converted to underscores 

683 fn = re.sub(r"\s+", "_", fn) 

684 # Compute hash digest part 

685 h = hashlib.md5() 

686 hname = fn.encode("utf-8") 

687 h.update(hname) 

688 digest = h.hexdigest() 

689 # Quote filename for URL 

690 qfn = urllib.parse.quote(fn) 

691 # For safety when writing files 

692 qfn = qfn.replace("/", "__slash__") 

693 if re.search(r"(?i)\.(ogg|oga)$", fn): 

694 ogg = ( 

695 "https://upload.wikimedia.org/wikipedia/" 

696 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

697 ) 

698 else: 

699 ogg = ( 

700 "https://upload.wikimedia.org/wikipedia/" 

701 "commons/transcoded/" 

702 "{}/{}/{}/{}.ogg".format( 

703 digest[:1], digest[:2], qfn, qfn 

704 ) 

705 ) 

706 if re.search(r"(?i)\.(mp3)$", fn):  # 706 ↛ 707 (condition was never true)

707 mp3 = ( 

708 "https://upload.wikimedia.org/wikipedia/" 

709 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

710 ) 

711 else: 

712 mp3 = ( 

713 "https://upload.wikimedia.org/wikipedia/" 

714 "commons/transcoded/" 

715 "{}/{}/{}/{}.mp3".format( 

716 digest[:1], digest[:2], qfn, qfn 

717 ) 

718 ) 

719 audio["ogg_url"] = ogg 

720 audio["mp3_url"] = mp3 

721 if active_pos: 

722 audio["pos"] = active_pos # type: ignore[typeddict-unknown-key] 

723 if audio not in data.get("sounds", ()): 

724 data_append(data, "sounds", audio) 

725 

726 # if audios: 

727 # have_pronunciations = True 

728 audios = [] 

729 

730 data_extend(data, "hyphenations", hyphenations) 

731 hyphenations = [] 

732 

733 ## I have commented out the otherwise unused have_pronunciations

734 ## toggles; uncomment them to use this debug print

735 # if not have_pronunciations and not have_panel_templates: 

736 # wxr.wtp.debug("no pronunciations found from pronunciation section", 

737 # sortid="pronunciations/533") 

738 

739 

740@dataclass 

741class TableHeader: 

742 text: str 

743 rowspan: int 

744 

745 

746def extract_th_pron_template( 

747 wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode 

748): 

749 # https://en.wiktionary.org/wiki/Template:th-pron 

750 expanded_node = wxr.wtp.parse( 

751 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

752 ) 

753 sounds = [] 

754 for table_tag in expanded_node.find_html("table"): 

755 row_headers = [] 

756 for tr_tag in table_tag.find_html("tr"): 

757 field = "other" 

758 new_headers = [] 

759 for header in row_headers: 

760 if header.rowspan > 1: 

761 header.rowspan -= 1 

762 new_headers.append(header) 

763 row_headers = new_headers 

764 for th_tag in tr_tag.find_html("th"): 

765 header_str = clean_node(wxr, None, th_tag) 

766 if header_str.startswith("(standard) IPA"): 

767 field = "ipa" 

768 elif header_str.startswith("Homophones"):  # 768 ↛ 769 (condition was never true)

769 field = "homophone" 

770 elif header_str == "Audio": 

771 field = "audio" 

772 elif header_str != "":  # 772 ↛ 764 (condition was always true)

773 rowspan = 1

774 rowspan_str = th_tag.attrs.get("rowspan", "1")

775 if re.fullmatch(r"\d+", rowspan_str):  # 775 ↛ 777 (condition was always true)

776 rowspan = int(rowspan_str) 

777 row_headers.append(TableHeader(header_str, rowspan)) 

778 

779 for td_tag in tr_tag.find_html("td"): 

780 if field == "audio": 

781 for link_node in td_tag.find_child(NodeKind.LINK): 

782 filename = clean_node(wxr, None, link_node.largs[0]) 

783 if filename != "":  # 783 ↛ 781 (condition was always true)

784 sound = create_audio_url_dict(filename) 

785 sounds.append(sound) 

786 elif field == "homophone":  # 786 ↛ 787 (condition was never true)

787 for span_tag in td_tag.find_html_recursively( 

788 "span", attr_name="lang", attr_value="th" 

789 ): 

790 word = clean_node(wxr, None, span_tag) 

791 if word != "": 

792 sounds.append({"homophone": word}) 

793 else: 

794 raw_tag = "" 

795 for html_node in td_tag.find_child_recursively( 

796 NodeKind.HTML 

797 ): 

798 if html_node.tag == "small": 

799 node_str = clean_node(wxr, None, html_node) 

800 if node_str.startswith("[") and node_str.endswith( 

801 "]" 

802 ): 

803 raw_tag = node_str.strip("[]") 

804 elif len(sounds) > 0:  # 804 ↛ 795 (condition was always true)

805 sounds[-1]["roman"] = node_str 

806 elif html_node.tag == "span": 

807 node_str = clean_node(wxr, None, html_node) 

808 span_lang = html_node.attrs.get("lang", "") 

809 span_class = html_node.attrs.get("class", "") 

810 if node_str != "" and ( 

811 span_lang == "th" or span_class in ["IPA", "tr"] 

812 ): 

813 sound = {field: node_str} 

814 if raw_tag != "": 

815 if raw_tag in valid_tags:  # 815 ↛ 818 (condition was always true)

816 data_append(sound, "tags", raw_tag) 

817 else: 

818 data_append(sound, "raw_tags", raw_tag) 

819 for header in row_headers: 

820 if header.text.lower() in valid_tags: 

821 data_append( 

822 sound, "tags", header.text.lower() 

823 ) 

824 else: 

825 data_append( 

826 sound, "raw_tags", header.text 

827 ) 

828 sounds.append(sound) 

829 

830 clean_node(wxr, word_entry, expanded_node) 

831 data_extend(word_entry, "sounds", sounds) 

832 
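# (Editor's sketch with hypothetical row data.) The rowspan bookkeeping
# above, isolated: a th cell spanning three rows is carried forward and
# keeps labelling the td cells of the next two rows.

def demo_rowspan_carry() -> None:
    rows: list[list[TableHeader]] = [[TableHeader("Paiboon", 3)], [], []]
    carried: list[TableHeader] = []
    for th_cells in rows:
        kept = []
        for h in carried:
            if h.rowspan > 1:
                h.rowspan -= 1
                kept.append(h)
        carried = kept + th_cells
        print([h.text for h in carried])  # ['Paiboon'] printed three times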

833 

834def extract_zh_pron_template( 

835 wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode 

836): 

837 # https://en.wiktionary.org/wiki/Template:zh-pron 

838 expanded_node = wxr.wtp.parse( 

839 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

840 ) 

841 seen_lists = set() 

842 sounds = [] 

843 for list_node in expanded_node.find_child_recursively(NodeKind.LIST): 

844 if list_node not in seen_lists: 

845 for list_item in list_node.find_child(NodeKind.LIST_ITEM): 

846 sounds.extend( 

847 extract_zh_pron_list_item(wxr, list_item, [], seen_lists) 

848 ) 

849 clean_node(wxr, word_entry, expanded_node) 

850 data_extend(word_entry, "sounds", sounds) 

851 

852 

853def extract_zh_pron_list_item( 

854 wxr: WiktextractContext, 

855 list_item: WikiNode, 

856 raw_tags: list[str], 

857 seen_lists: set[WikiNode], 

858) -> list[SoundData]: 

859 current_tags = raw_tags[:] 

860 sounds = [] 

861 is_first_small_tag = True 

862 for node in list_item.children: 

863 if isinstance(node, WikiNode) and node.kind == NodeKind.LINK: 

864 link_str = clean_node(wxr, None, node.largs) 

865 node_str = clean_node(wxr, None, node) 

866 if link_str.startswith("File:"):  # 866 ↛ 867 (condition was never true)

867 sound = create_audio_url_dict(link_str.removeprefix("File:")) 

868 sound["raw_tags"] = current_tags[:] 

869 translate_zh_pron_raw_tags(sound) 

870 sounds.append(sound) 

871 elif node_str != "":  # 871 ↛ 862 (condition was always true)

872 current_tags.append(node_str) 

873 elif isinstance(node, HTMLNode): 

874 if node.tag == "small": 

875 if is_first_small_tag:  # 875 ↛ 886 (condition was always true)

876 raw_tag_text = clean_node( 

877 wxr, 

878 None, 

879 [ 

880 n 

881 for n in node.children 

882 if not (isinstance(n, HTMLNode) and n.tag == "sup") 

883 ], 

884 ) 

885 current_tags.extend(split_zh_pron_raw_tag(raw_tag_text)) 

886 elif len(sounds) > 0: 

887 data_extend( 

888 sounds[-1], 

889 "raw_tags", 

890 split_zh_pron_raw_tag(clean_node(wxr, None, node)), 

891 ) 

892 translate_zh_pron_raw_tags(sounds[-1]) 

893 is_first_small_tag = False 

894 elif node.tag == "span": 

895 sounds.extend(extract_zh_pron_span(wxr, node, current_tags)) 

896 elif (  # 896 ↛ 901 (condition was never true)

897 node.tag == "table" 

898 and len(current_tags) > 0 

899 and current_tags[-1] == "Homophones" 

900 ): 

901 sounds.extend( 

902 extract_zh_pron_homophone_table(wxr, node, current_tags) 

903 ) 

904 elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: 

905 seen_lists.add(node) 

906 for child_list_item in node.find_child(NodeKind.LIST_ITEM): 

907 sounds.extend( 

908 extract_zh_pron_list_item( 

909 wxr, child_list_item, current_tags, seen_lists 

910 ) 

911 ) 

912 

913 return sounds 

914 

915 

916def extract_zh_pron_homophone_table( 

917 wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str] 

918) -> list[SoundData]: 

919 sounds = [] 

920 for td_tag in table.find_html_recursively("td"): 

921 for span_tag in td_tag.find_html("span"): 

922 span_class = span_tag.attrs.get("class", "") 

923 span_lang = span_tag.attrs.get("lang", "") 

924 span_str = clean_node(wxr, None, span_tag) 

925 if ( 

926 span_str not in ["", "/"] 

927 and span_lang != "" 

928 and span_class in ["Hant", "Hans", "Hani"] 

929 ): 

930 sound = {"homophone": span_str, "raw_tags": raw_tags[:]} 

931 if span_class == "Hant": 

932 data_append(sound, "tags", "Traditional-Chinese") 

933 elif span_class == "Hans": 

934 data_append(sound, "tags", "Simplified-Chinese") 

935 translate_zh_pron_raw_tags(sound) 

936 sounds.append(sound) 

937 

938 return sounds 

939 

940 

941def translate_zh_pron_raw_tags(sound: SoundData): 

942 from .zh_pron_tags import ZH_PRON_TAGS 

943 

944 raw_tags = [] 

945 for raw_tag in sound.get("raw_tags", []): 

946 if raw_tag in ZH_PRON_TAGS: 

947 tr_tag = ZH_PRON_TAGS[raw_tag] 

948 if isinstance(tr_tag, str): 

949 data_append(sound, "tags", tr_tag) 

950 elif isinstance(tr_tag, list) and tr_tag not in sound.get(  # 950 ↛ 945 (condition was always true)

951 "tags", [] 

952 ): 

953 data_extend(sound, "tags", tr_tag) 

954 elif raw_tag in valid_tags: 

955 if raw_tag not in sound.get("tags", []):  # 955 ↛ 945 (condition was always true)

956 data_append(sound, "tags", raw_tag)

957 elif raw_tag not in raw_tags:  # 957 ↛ 945 (condition was always true)

958 raw_tags.append(raw_tag) 

959 

960 if len(raw_tags) > 0: 

961 sound["raw_tags"] = raw_tags 

962 elif "raw_tags" in sound:  # 962 ↛ exit (condition was always true)

963 del sound["raw_tags"] 
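# (Editor's illustration with hypothetical mapping entries.) Assuming
# ZH_PRON_TAGS maps "Standard Chinese" to ["Mandarin", "Standard-Chinese"]
# and "Xyz" is neither mapped nor in valid_tags, then
#   sound = {"zh_pron": "...", "raw_tags": ["Standard Chinese", "Xyz"]}
# becomes
#   {"zh_pron": "...", "tags": ["Mandarin", "Standard-Chinese"],
#    "raw_tags": ["Xyz"]}
# i.e. recognized labels are promoted to tags, the rest stay raw.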

964 

965 

966def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]: 

967 raw_tags = [] 

968 if "(" not in raw_tag_text: 

969 for raw_tag in re.split(r",|:|;| and ", raw_tag_text): 

970 raw_tag = raw_tag.strip().removeprefix("incl. ").strip() 

971 if raw_tag != "": 

972 raw_tags.append(raw_tag) 

973 else: 

974 processed_offsets = [] 

975 for match in re.finditer(r"\([^()]+\)", raw_tag_text): 

976 processed_offsets.append((match.start(), match.end())) 

977 raw_tags.extend( 

978 split_zh_pron_raw_tag( 

979 raw_tag_text[match.start() + 1 : match.end() - 1] 

980 ) 

981 ) 

982 not_processed = "" 

983 last_end = 0 

984 for start, end in processed_offsets: 

985 not_processed += raw_tag_text[last_end:start] 

986 last_end = end 

987 not_processed += raw_tag_text[last_end:] 

988 if not_processed != raw_tag_text:  # 988 ↛ 991 (condition was always true)

989 raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags 

990 else: 

991 raw_tags.append(not_processed) 

992 

993 return raw_tags 
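# (Editor's illustration.) Parenthesized groups are split out recursively
# and appended after the tags from the surrounding text:
#   split_zh_pron_raw_tag("Mandarin (Standard Chinese, incl. Beijing)")
#   == ["Mandarin", "Standard Chinese", "Beijing"]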

994 

995 

996def extract_zh_pron_span( 

997 wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str] 

998) -> list[SoundData]: 

999 sounds = [] 

1000 small_tags = [] 

1001 pron_nodes = [] 

1002 roman = "" 

1003 phonetic_pron = "" 

1004 for index, node in enumerate(span_tag.children): 

1005 if isinstance(node, HTMLNode) and node.tag == "small":  # 1005 ↛ 1006 (condition was never true)

1006 small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))

1007 elif (  # 1007 ↛ 1012 (condition was never true)

1008 isinstance(node, HTMLNode) 

1009 and node.tag == "span" 

1010 and "-Latn" in node.attrs.get("lang", "") 

1011 ): 

1012 roman = clean_node(wxr, None, node).strip("() ") 

1013 elif isinstance(node, str) and node.strip() == "[Phonetic:":  # 1013 ↛ 1014 (condition was never true)

1014 phonetic_pron = clean_node( 

1015 wxr, None, span_tag.children[index + 1 :] 

1016 ).strip("] ") 

1017 break 

1018 else: 

1019 pron_nodes.append(node) 

1020 for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)): 

1021 zh_pron = zh_pron.strip("[]: ") 

1022 if len(zh_pron) > 0:  # 1022 ↛ 1020 (condition was always true)

1023 if "IPA" in span_tag.attrs.get("class", ""):  # 1023 ↛ 1024 (condition was never true)

1024 sound = {"ipa": zh_pron, "raw_tags": raw_tags[:]} 

1025 else: 

1026 sound = {"zh_pron": zh_pron, "raw_tags": raw_tags[:]} 

1027 if roman != "":  # 1027 ↛ 1028 (condition was never true)

1028 sound["roman"] = roman 

1029 sounds.append(sound) 

1030 if len(sounds) > 0:  # 1030 ↛ 1032 (condition was always true)

1031 data_extend(sounds[-1], "raw_tags", small_tags)

1032 if phonetic_pron != "":  # 1032 ↛ 1033 (condition was never true)

1033 sound = { 

1034 "zh_pron": phonetic_pron, 

1035 "raw_tags": raw_tags[:] + ["Phonetic"], 

1036 } 

1037 if roman != "": 

1038 sound["roman"] = roman 

1039 sounds.append(sound) 

1040 for sound in sounds: 

1041 translate_zh_pron_raw_tags(sound) 

1042 return sounds 

1043 

1044 

1045def split_zh_pron(zh_pron: str) -> list[str]: 

1046 # split by comma and other symbols that are outside parentheses

1047 parentheses = 0 

1048 pron_list = [] 

1049 pron = "" 

1050 for c in zh_pron: 

1051 if ( 

1052 (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/"))) 

1053 and parentheses == 0 

1054 and len(pron.strip()) > 0 

1055 ): 

1056 pron_list.append(pron.strip()) 

1057 pron = "" 

1058 elif c == "(": 

1059 parentheses += 1 

1060 pron += c 

1061 elif c == ")": 

1062 parentheses -= 1 

1063 pron += c 

1064 else: 

1065 pron += c 

1066 

1067 if pron.strip() != "":  # 1067 ↛ 1069 (condition was always true)

1068 pron_list.append(pron) 

1069 return pron_list
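# (Editor's illustration.) Splitting happens only at parenthesis depth
# zero, so split_zh_pron("ā, á (dialectal; rare)") returns
# ["ā", " á (dialectal; rare)"]: the ";" inside the parentheses is
# protected, and the caller strips each piece afterwards.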