Coverage for src/wiktextract/extractor/en/pronunciation.py: 83%

850 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-06-23 09:14 +0000

1import hashlib 

2import re 

3import urllib 

4from copy import deepcopy 

5from dataclasses import dataclass 

6from typing import Iterator, Literal, NamedTuple 

7 

8from wikitextprocessor import ( 

9 HTMLNode, 

10 LevelNode, 

11 NodeKind, 

12 TemplateNode, 

13 WikiNode, 

14) 

15 

16from ...clean import clean_value 

17from ...datautils import data_append, data_extend, split_at_comma_semi 

18from ...page import LEVEL_KINDS, clean_node, is_panel_template 

19from ...tags import valid_tags 

20from ...wxr_context import WiktextractContext 

21from ..share import create_audio_url_dict 

22from .form_descriptions import ( 

23 classify_desc, 

24 decode_tags, 

25 parse_pronunciation_tags, 

26) 

27from .parts_of_speech import part_of_speech_map 

28from .type_utils import Hyphenation, SoundData, TemplateArgs, WordData 

29 

# Tuple of part-of-speech values attached to a pronunciation entry.
PronunciationPoses = tuple[str, ...]

# Prefixes, tags, and regexp for finding romanizations from the pronunciation
# section
pron_romanizations = {
    " Revised Romanization ": "romanization revised",
    " Revised Romanization (translit.) ": (
        "romanization revised transliteration"
    ),
    " McCune-Reischauer ": "McCune-Reischauer romanization",
    " McCune–Reischauer ": "McCune-Reischauer romanization",
    " Yale Romanization ": "Yale romanization",
}
# The prefixes are tried longest-first: " Revised Romanization " is itself a
# prefix of " Revised Romanization (translit.) " and would otherwise win.
# Group 1 is the matched prefix, group 2 the rest of that line.
pron_romanization_re = re.compile(
    "(?m)^({})([^\n]+)".format(
        "|".join(
            map(re.escape, sorted(pron_romanizations, key=len, reverse=True))
        )
    )
)

# Groups: 2 = leading "(qualifier)" text, 3 = "IPA⁽ᵏᵉʸ⁾" or "enPR",
# 4 = everything after ": ", 5 = that body without a trailing parenthetical,
# 7 = the contents of the trailing parenthetical.
IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$"
IPA_EXTRACT_RE = re.compile(IPA_EXTRACT)

54 

55 

class PronunciationPosMatch(NamedTuple):
    """Result of searching qualifier text for part-of-speech labels."""

    # POS values recognised in the text, in first-seen order; empty tuple
    # when nothing in the text was a POS label.
    pos_values: PronunciationPoses
    # Text left over once the POS labels are removed, joined with ", ".
    # split_pronunciation_pos_text returns the stripped input text here when
    # no POS label was found.
    residual: str

59 

60 

class PronunciationPosPrefix(NamedTuple):
    """A part-of-speech label found at the start of a pronunciation line."""

    # POS values named by the prefix.
    pos_values: PronunciationPoses
    # Remainder of the line after the prefix has been removed (may be empty).
    text: str
    # extract_pos_prefix sets this to True for bare and "Noun:" style
    # prefixes and to False for parenthesized "(noun)" prefixes.
    is_persistent: bool

65 

66 

class FlattenedListNode(NamedTuple):
    """A node produced while flattening a (possibly nested) wikitext list."""

    # The node itself: either a parsed WikiNode or a plain text fragment.
    node: WikiNode | str
    # List depth associated with the node when it was flattened.
    list_depth: int

70 

71 

# Names of qualifier/label-style templates whose arguments are inspected for
# part-of-speech labels.
PRON_POS_TEMPLATE_NAMES = {
    "q",
    "qualifier",
    "qual",
    "i",
    "sense",
    "a",
    "accent",
    "lb",
    "lbl",
    "label",
}

# Maps every label key of part_of_speech_map to that entry's "pos" value.
PRON_POS_BY_LABEL = {
    label: pos_data["pos"] for label, pos_data in part_of_speech_map.items()
}

# Matches text that ends in a known POS label.  Anything before the label
# (separated by whitespace) is captured as "residual"; the label itself is
# captured as "label".  Labels are sorted longest-first so that a multi-word
# label is tried before any shorter label that is its suffix.
PRON_POS_LABEL_RE = re.compile(
    r"^(?:(?P<residual>.+?)\s+)?(?P<label>"
    + "|".join(
        re.escape(label)
        for label in sorted(PRON_POS_BY_LABEL, key=len, reverse=True)
    )
    + r")$"
)

97 

98 

def normalize_pronunciation_pos_label(label: str) -> str:
    """Normalize qualifier text for lookup against part-of-speech labels.

    The text is lower-cased, runs of whitespace are collapsed to one space,
    a trailing parenthesized explanation is removed, surrounding
    parentheses/colons are stripped and a trailing " sense"/" senses" is
    dropped.

    Args:
        label: Raw qualifier text, e.g. "Noun (barren areas)" or "(verb):".

    Returns:
        The normalized label; may be the empty string.
    """
    label = re.sub(r"\s+", " ", label.strip().lower())
    # Drop explanatory suffixes such as "noun (barren areas)" before
    # matching the label against part-of-speech names.  A label that is
    # parenthesized as a whole, e.g. "(noun)", has no text in front of the
    # parentheses: removing the "suffix" would erase the label itself, so in
    # that case it is kept and unwrapped by the strip below instead.
    without_suffix = re.sub(r"\s*\([^)]*\)\s*$", "", label).strip()
    if without_suffix:
        label = without_suffix
    label = label.strip(" \t\n\r():")
    return re.sub(r"\s+senses?$", "", label).strip()

108 

109 

def split_pronunciation_pos_text(text: str) -> PronunciationPosMatch:
    """Separate part-of-speech labels from the rest of a qualifier text.

    Returns a PronunciationPosMatch holding the distinct POS values in
    first-seen order and the remaining chunks joined with ", ".  When no
    chunk names a POS, the stripped input text is returned as the residual.
    """
    found_pos: list[str] = []
    leftovers: list[str] = []
    for chunk in split_pronunciation_pos_parts(text):
        pos, leftover = pronunciation_pos_from_part(chunk)
        if pos and pos not in found_pos:
            found_pos.append(pos)
        # POS-bearing qualifier text may also contain normal pronunciation
        # tags before the POS label, e.g. "attributive adjective", so the
        # leftover is collected whether or not a POS was recognised.
        if leftover:
            leftovers.append(leftover)
    if found_pos:
        return PronunciationPosMatch(tuple(found_pos), ", ".join(leftovers))
    # If nothing in the text was a POS label, preserve the original text
    # for normal pronunciation tag/note parsing.
    return PronunciationPosMatch((), text.strip())

129 

130 

def pronunciation_pos_from_part(part: str) -> tuple[str | None, str]:
    """Interpret one qualifier chunk as a part-of-speech label.

    Returns ``(pos, residual)``.  ``pos`` is None when the chunk does not
    name a POS, in which case ``residual`` is the chunk unchanged.
    """
    normalized = normalize_pronunciation_pos_label(part)
    if normalized in PRON_POS_BY_LABEL:
        # The whole chunk is a POS label; nothing is left over.
        return PRON_POS_BY_LABEL[normalized], ""

    # Match residual tag text followed by a POS label:
    #   "attributive adjective" -> ("adj", "attributive")
    #   "attributive proper noun" -> ("name", "attributive")
    # The label alternation is sorted longest-first so multi-word POS labels
    # such as "proper noun" win over their suffixes.
    found = PRON_POS_LABEL_RE.match(normalized)
    if found is None:
        return None, part
    leading = (found.group("residual") or "").rstrip(" ,;:")
    if leading and classify_desc(leading) != "tags":
        # The text in front of the label is not tag-like, so the chunk as a
        # whole is not treated as a POS qualifier.
        return None, part
    return PRON_POS_BY_LABEL[found.group("label")], leading

147 

148 

def split_pronunciation_pos_parts(text: str) -> list[str]:
    """Split qualifier text into the chunks that are examined for POS labels.

    Commas and semicolons always separate chunks.  A chunk is additionally
    split at "and"/"or" only when at least one of the resulting pieces is a
    POS label, so prose notes stay intact.
    """
    chunks: list[str] = []
    for raw_chunk in re.split(r"[,;]", text):
        chunk = raw_chunk.strip()
        if not chunk:
            continue
        pieces = re.split(r"\s+(?:and|or)\s+", chunk)
        names_a_pos = len(pieces) > 1 and any(
            pronunciation_pos_from_part(piece)[0] for piece in pieces
        )
        if names_a_pos:
            chunks.extend(pieces)
        else:
            chunks.append(chunk)
    return chunks

167 

168 

def set_sound_pos(
    sound: SoundData, pos_values: PronunciationPoses | None
) -> PronunciationPoses | None:
    """Store ``pos_values`` on ``sound`` and report the POS now in effect.

    With a non-empty ``pos_values`` the "pos" key of ``sound`` is overwritten
    and the same value is returned.  Otherwise ``sound`` is left untouched
    and its existing "pos" value, or None if it has none, is returned.
    """
    if not pos_values:
        return sound.get("pos")  # type: ignore[typeddict-item]
    sound["pos"] = pos_values  # type: ignore[typeddict-unknown-key]
    return pos_values

178 

179 

def common_sound_pos(
    pos_candidates: set[PronunciationPoses],
) -> PronunciationPoses | None:
    """Return the only candidate in ``pos_candidates``.

    Returns None when the set is empty or holds more than one candidate.
    """
    if len(pos_candidates) == 1:
        (only_candidate,) = pos_candidates
        return only_candidate
    return None

186 

187 

def merge_pronunciation_tag_data(
    sound: SoundData, tag_data: SoundData
) -> None:
    """Fold the tags, topics and note of ``tag_data`` into ``sound``.

    Tags and topics that ``sound`` already lists are skipped.  A note is
    stored as-is when ``sound`` has none; otherwise it is appended after
    "; " unless it already equals one of the ";"-separated existing notes.
    """
    for tag in tag_data.get("tags", []):
        # Looked up afresh each time so values added on earlier iterations
        # are taken into account.
        if tag in sound.get("tags", []):
            continue
        data_append(sound, "tags", tag)

    for topic in tag_data.get("topics", []):
        if topic in sound.get("topics", []):
            continue
        data_append(sound, "topics", topic)

    new_note = tag_data.get("note")
    if not new_note:
        return
    current_note = sound.get("note")
    if not current_note:
        sound["note"] = new_note
        return
    recorded_notes = [piece.strip() for piece in current_note.split(";")]
    if new_note not in recorded_notes:
        sound["note"] = f"{current_note}; {new_note}"

203 

204 

def parse_pronunciation_tags_with_pos(
    wxr: WiktextractContext, text: str, sound: SoundData
) -> PronunciationPoses:
    """Parse qualifier ``text`` into ``sound``, handling POS labels first.

    POS labels found in ``text`` are stored on ``sound`` via set_sound_pos;
    whatever text remains is parsed as ordinary pronunciation tags.

    Returns:
        The POS values found in ``text`` (an empty tuple when there are none).
    """
    pos_values, residual = split_pronunciation_pos_text(text)
    set_sound_pos(sound, pos_values)
    if residual:
        # Parse into a scratch dict and merge afterwards, so values already
        # present on `sound` are not added a second time.
        parsed: SoundData = {}
        parse_pronunciation_tags(wxr, residual, parsed)
        merge_pronunciation_tag_data(sound, parsed)
    return pos_values

215 

216 

def extract_pos_prefix(text: str) -> PronunciationPosPrefix | None:
    """Detect a part-of-speech label at the start of a pronunciation line.

    Three shapes are recognised, in this order:
      * the whole text is a POS label                   -> persistent
      * "<label>: rest"                                 -> persistent
      * "(<label>) rest"                                -> not persistent
    A candidate only counts when it consists purely of POS labels, i.e.
    splitting it leaves no residual text.

    Returns:
        A PronunciationPosPrefix, or None when no such prefix is present.
    """

    def pure_pos(candidate: str) -> PronunciationPoses | None:
        # POS values of `candidate` if it is made up of POS labels only.
        found = split_pronunciation_pos_text(candidate)
        if found.pos_values and not found.residual:
            return found.pos_values
        return None

    trimmed = text.strip()
    fully_parenthesized = trimmed.startswith("(") and trimmed.endswith(")")
    if not fully_parenthesized:
        if pos_values := pure_pos(text):
            return PronunciationPosPrefix(pos_values, "", True)

        colon = re.match(r"\s*([^:()]+?)\s*:\s*(.*)$", text)
        if colon and (pos_values := pure_pos(colon.group(1))):
            return PronunciationPosPrefix(
                pos_values, colon.group(2).strip(), True
            )

    paren = re.match(r"\s*\(([^()]*)\)\s*(.*)$", text)
    if paren and (pos_values := pure_pos(paren.group(1))):
        return PronunciationPosPrefix(
            pos_values, paren.group(2).strip(), False
        )

    return None

241 

242 

def extract_pronunciation_pos_template(
    wxr: WiktextractContext,
    name: str,
    ht: TemplateArgs,
    lang_code: str,
) -> PronunciationPosMatch:
    """Collect part-of-speech labels from a qualifier-like template.

    Args:
        wxr: Context handed to clean_node.
        name: Template name.
        ht: Template arguments; only integer (positional) keys are read.
        lang_code: Language code of the current section.

    Returns:
        The distinct POS values found across the inspected arguments, plus
        the non-POS residual text of those arguments joined with ", ".
    """
    # For these templates argument 1 is skipped (presumably it holds the
    # language code -- see the fallback below).
    skips_first_arg = name in {"a", "accent", "lb", "lbl", "label"}
    first_index = 2 if skips_first_arg else 1
    pos_args = [
        value
        for key, value in ht.items()
        if isinstance(key, int) and key >= first_index
    ]
    if skips_first_arg and not pos_args and ht.get(1) != lang_code:
        # Only one argument was given and it is not the language code, so
        # treat it as the label itself.
        pos_args = [ht.get(1, "")]

    found_pos: list[str] = []
    leftovers: list[str] = []
    for arg in pos_args:
        arg_match = split_pronunciation_pos_text(clean_node(wxr, None, [arg]))
        for pos in arg_match.pos_values:
            if pos not in found_pos:
                found_pos.append(pos)
        if arg_match.residual:
            leftovers.append(arg_match.residual)
    return PronunciationPosMatch(tuple(found_pos), ", ".join(leftovers))

275 

276 

def extract_pron_template(
    wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str
) -> tuple[SoundData, list[SoundData]] | None:
    """In post_template_fn, this is used to handle all enPR and IPA templates
    so that we can leave breadcrumbs in the text that can later be handled
    there. We return a `base_data` so that if there are two
    or more templates on the same line, like this:
      (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/
    then we can apply base_data fields to other templates, too, if needed.

    Args:
        wxr: Context used for cleaning text and reporting errors.
        tname: Template name; "enPR" stores each pronunciation under "enpr",
            any other name stores it under "ipa".
        targs: Template arguments; only the presence of "qq" is checked.
        expanded: The expanded template text.

    Returns:
        ``(base_data, sound_datas)``: the data parsed from the qualifiers
        that apply to the whole template, and one entry per pronunciation,
        each starting out as a deep copy of ``base_data``.  None is returned
        (after logging an error) when the cleaned text does not match
        IPA_EXTRACT_RE or has no pronunciation body.
    """
    cleaned = clean_value(wxr, expanded)
    # print(f"extract_pron_template input: {tname=} {expanded=}-> {cleaned=}")
    m = IPA_EXTRACT_RE.match(cleaned)
    if not m:
        wxr.wtp.error(
            f"Text cannot match IPA_EXTRACT_RE regex: "
            f"{cleaned=}, {tname=}, {targs=}",
            sortid="en/pronunciation/54",
        )
        return None
    # for i, group in enumerate(m.groups()):
    #     print(i + 1, repr(group))
    # Group 2: text of a leading "(...) " qualifier, if any.
    main_qual = m.group(2) or ""
    if "qq" in targs:
        # If the template has been given a qualifier that applies to
        # every entry, but which also happens to appear at the end
        # which can be confused with the post-qualifier of a single
        # entry in the style of "... /ipa3/ (foo) (bar)", where foo
        # might not be present so the bar looks like it only might
        # apply to `/ipa3/`
        # Group 5 is the body without the final parenthetical, group 7 the
        # contents of that parenthetical.
        pron_body = m.group(5)
        post_qual = m.group(7)
    else:
        # Group 4 is the whole body, including any trailing parenthetical,
        # which is then treated as part of the last entry below.
        pron_body = m.group(4)
        post_qual = ""

    if not pron_body:
        wxr.wtp.error(
            f"Regex failed to find 'body' from {cleaned=}",
            sortid="en/pronunciation/81",
        )
        return None

    base_data: SoundData = {}
    if main_qual:
        parse_pronunciation_tags_with_pos(wxr, main_qual, base_data)
    if post_qual:
        parse_pronunciation_tags_with_pos(wxr, post_qual, base_data)
    # This base_data is used as the base copy for all entries from this
    # template, but it is also returned so that its contents may be applied
    # to other templates on the same line.
    # print(f"{base_data=}")

    sound_datas: list[SoundData] = []

    # `parts` holds one list per comma/semicolon separated entry; each list
    # contains that entry's pieces in order, with parenthesized pieces kept
    # as separate "(...)" strings.  `inside` is the current paren depth and
    # `current` the fragments of the piece being assembled.
    parts: list[list[str]] = [[]]
    inside = 0
    current: list[str] = []
    for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)):
        # Split the line on commas and semicolons outside of parens. This
        # gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)"
        # print(f"  {i=}, {p=}")
        comp = p.strip()
        if not p:
            continue
        if comp == "(":
            if not inside and i > 0:
                # Flush the text collected before this top-level "(".
                if stripped := "".join(current).strip():
                    parts[-1].append("".join(current).strip())  # type:ignore[arg-type]
            # NOTE(review): `current` is restarted on every "(", also when
            # already inside parentheses, so text collected before a nested
            # "(" appears to be discarded -- confirm this is intended.
            current = [p]
            inside += 1
            continue
        if comp == ")":
            inside -= 1
            if not inside:
                # Back at top level: store the complete "(...)" piece.
                if stripped := "".join(current).strip():
                    current.append(p)
                    parts[-1].append("".join(current).strip())  # type:ignore[arg-type]
                    current = []
            continue
        if not inside and comp in (",", ";"):
            # A top-level separator closes the current entry and opens a
            # new one, even when there is no pending text to flush.
            if stripped := "".join(current).strip():
                parts[-1].append(stripped)  # type:ignore[arg-type]
            current = []
            parts.append([])
            continue
        current.append(p)
    if current:
        parts[-1].append("".join(current).strip())

    # print(f">>>>>> {parts=}")
    # Normalize every entry to exactly three strings:
    # [pre-qualifier, pronunciation, post-qualifier], using "" for a
    # qualifier that is absent.
    new_parts: list[list[str]] = []
    for entry in parts:
        if not entry:
            continue
        new_entry: list[str] = []
        # i1 is a bool used as a slice start: 1 skips a leading "(...)"
        # piece, 0 keeps the first piece as part of the pronunciation.
        i1: int = entry[0].startswith("(") and entry[0].endswith(")")
        if i1:
            new_entry.append(entry[0][1:-1].strip())
        else:
            new_entry.append("")
        # A trailing "(...)" piece is a post-qualifier only when it is not
        # also the entry's sole piece.
        i2: int = (
            entry[-1].startswith("(")
            and entry[-1].endswith(")")
            and len(entry) > 1
        )
        # Turn i2 into the slice end: -1 drops the trailing qualifier,
        # len(entry) keeps everything.
        if i2 == 0:
            i2 = len(entry)
        else:
            i2 = -1
        new_entry.append("".join(entry[i1:i2]).strip())
        if not new_entry[-1]:
            wxr.wtp.error(
                f"Missing IPA/enPRO sound data between qualifiers?{entry=}",
                sortid="en/pronunciation/153",
            )
        if i2 == -1:
            new_entry.append(entry[-1][1:-1].strip())
        else:
            new_entry.append("")
        new_parts.append(new_entry)

    # print(f">>>>> {new_parts=}")

    for part in new_parts:
        # Every entry starts from the template-wide data and then adds its
        # own qualifiers.
        sd = deepcopy(base_data)
        if part[0]:
            parse_pronunciation_tags_with_pos(wxr, part[0], sd)
        if part[2]:
            parse_pronunciation_tags_with_pos(wxr, part[2], sd)
        if tname == "enPR":
            sd["enpr"] = part[1]
        else:
            sd["ipa"] = part[1]
        sound_datas.append(sd)

    # print(f"BASE_DATA: {base_data}")
    # print(f"SOUND_DATAS: {sound_datas=}")

    return base_data, sound_datas

417 

418 

419def parse_pronunciation( 

420 wxr: WiktextractContext, 

421 level_node: LevelNode, 

422 data: WordData, 

423 etym_data: WordData, 

424 have_etym: bool, 

425 base_data: WordData, 

426 lang_code: str, 

427) -> None: 

428 """Parses the pronunciation section from a language section on a 

429 page.""" 

430 if level_node.kind in LEVEL_KINDS: 430 ↛ 443line 430 didn't jump to line 443 because the condition on line 430 was always true

431 contents: list[str | WikiNode | TemplateNode] = [] 

432 for node in level_node.children: 

433 if isinstance(node, TemplateNode): 

434 if node.template_name == "th-pron": 

435 extract_th_pron_template(wxr, data, node) 

436 elif node.template_name == "zh-pron": 

437 extract_zh_pron_template(wxr, data, node) 

438 else: 

439 contents.append(node) 

440 else: 

441 contents.append(node) 

442 else: 

443 contents = [level_node] 

444 # Remove subsections, such as Usage notes. They may contain IPAchar 

445 # templates in running text, and we do not want to extract IPAs from 

446 # those. 

447 # Filter out only LEVEL_KINDS; 'or' is doing heavy lifting here 

448 # Slip through not-WikiNodes, then slip through WikiNodes that 

449 # are not LEVEL_KINDS. 

450 contents = [ 

451 x 

452 for x in contents 

453 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

454 ] 

455 if not any( 

456 isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents 

457 ): 

458 # expand all templates 

459 new_contents: list[str | WikiNode | TemplateNode] = [] 

460 for lst in contents: 

461 if isinstance(lst, TemplateNode): 

462 temp = wxr.wtp.node_to_wikitext(lst) 

463 temp = wxr.wtp.expand(temp) 

464 temp_parsed = wxr.wtp.parse(temp) 

465 new_contents.extend(temp_parsed.children) 

466 else: 

467 new_contents.append(lst) 

468 contents = new_contents 

469 

470 if have_etym and data is base_data: 470 ↛ 471line 470 didn't jump to line 471 because the condition on line 470 was never true

471 data = etym_data 

472 pron_templates: list[tuple[SoundData, list[SoundData]]] = [] 

473 pron_pos_markers: list[PronunciationPoses] = [] 

474 hyphenations: list[Hyphenation] = [] 

475 audios: list[SoundData] = [] 

476 have_panel_templates = False 

477 

478 def parse_pronunciation_template_fn( 

479 name: str, ht: TemplateArgs 

480 ) -> str | None: 

481 """Handle pronunciation and hyphenation templates""" 

482 # _template_fn handles templates *before* they are expanded; 

483 # this allows for special handling before all the work needed 

484 # for expansion is done. 

485 nonlocal have_panel_templates 

486 if is_panel_template(wxr, name): 

487 have_panel_templates = True 

488 return "" 

489 if name == "audio": 

490 filename = ht.get(2) or "" 

491 audio: SoundData = {"audio": filename.strip()} 

492 dialect = ht.get("a", "") 

493 if "aa" in ht: 493 ↛ 494line 493 didn't jump to line 494 because the condition on line 493 was never true

494 dialect += ", " + ht.get("aa", "") 

495 if dialect: 

496 dialect = dialect.replace("<", "").replace(">", "") 

497 dialect = clean_node(wxr, None, [dialect]) 

498 for part in split_at_comma_semi(dialect): 

499 if "(" not in part: 

500 parse_pronunciation_tags(wxr, part, audio) 

501 else: 

502 for ppart in re.split(r"[][()]", part): 

503 parse_pronunciation_tags(wxr, ppart, audio) 

504 desc = ht.get(3) or "" 

505 desc = clean_node(wxr, None, [desc]) 

506 if desc: 506 ↛ 507line 506 didn't jump to line 507 because the condition on line 506 was never true

507 audio["text"] = desc 

508 m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc) 

509 skip = False 

510 if m: 510 ↛ 511line 510 didn't jump to line 511 because the condition on line 510 was never true

511 par = m.group(1) 

512 cls = classify_desc(par) 

513 if cls == "tags": 

514 parse_pronunciation_tags(wxr, par, audio) 

515 else: 

516 skip = True 

517 if skip: 517 ↛ 518line 517 didn't jump to line 518 because the condition on line 517 was never true

518 return "" 

519 audios.append(audio) 

520 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

521 if name == "audio-IPA": 521 ↛ 522line 521 didn't jump to line 522 because the condition on line 521 was never true

522 filename = ht.get(2) or "" 

523 ipa = ht.get(3) or "" 

524 dial = ht.get("dial") 

525 audio = {"audio": filename.strip()} 

526 if dial: 

527 dial = clean_node(wxr, None, [dial]) 

528 audio["text"] = dial 

529 if ipa: 

530 audio["audio-ipa"] = ipa 

531 audios.append(audio) 

532 # The problem with these IPAs is that they often just describe 

533 # what's in the sound file, rather than giving the pronunciation 

534 # of the word alone. It is common for audio files to contain 

535 # multiple pronunciations or articles in the same file, and then 

536 # this IPA often describes what is in the file. 

537 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

538 if name == "audio-pron": 

539 filename = ht.get(2) or "" 

540 ipa = ht.get("ipa") or "" 

541 dial = ht.get("dial") 

542 country = ht.get("country") 

543 audio = {"audio": filename.strip()} 

544 if dial: 544 ↛ 548line 544 didn't jump to line 548 because the condition on line 544 was always true

545 dial = clean_node(wxr, None, [dial]) 

546 audio["text"] = dial 

547 parse_pronunciation_tags(wxr, dial, audio) 

548 if country: 548 ↛ 550line 548 didn't jump to line 550 because the condition on line 548 was always true

549 parse_pronunciation_tags(wxr, country, audio) 

550 if ipa: 550 ↛ 552line 550 didn't jump to line 552 because the condition on line 550 was always true

551 audio["audio-ipa"] = ipa 

552 audios.append(audio) 

553 # XXX do we really want to extract pronunciations from these? 

554 # Or are they spurious / just describing what is in the 

555 # audio file? 

556 # if ipa: 

557 # pron = {"ipa": ipa} 

558 # if dial: 

559 # parse_pronunciation_tags(wxr, dial, pron) 

560 # if country: 

561 # parse_pronunciation_tags(wxr, country, pron) 

562 # data_append(data, "sounds", pron) 

563 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__" 

564 if name in ("hyph", "hyphenation"): 

565 # {{hyph|en|re|late|caption="Hyphenation UK:"}} 

566 # {{hyphenation|it|quiè|to||qui|è|to||quié|to||qui|é|to}} 

567 # and also nocaption=1 

568 caption = clean_node(wxr, None, ht.get("caption", "")) 

569 tagsets, _ = decode_tags(caption) 

570 # flatten the tagsets into one; it would be really weird to have 

571 # several tagsets for a hyphenation caption 

572 tags = sorted(set(tag for tagset in tagsets for tag in tagset)) 

573 # We'll just ignore any errors from tags, it's not very important 

574 # for hyphenation 

575 tags = [tag for tag in tags if not tag.startswith("error")] 

576 hyph_sequences: list[list[str]] = [[]] 

577 for text in [ 

578 t for (k, t) in ht.items() if (isinstance(k, int) and k >= 2) 

579 ]: 

580 if not text: 

581 hyph_sequences.append([]) 

582 else: 

583 hyph_sequences[-1].append(clean_node(wxr, None, text)) 

584 for seq in hyph_sequences: 

585 hyphenations.append(Hyphenation(parts=seq, tags=tags)) 

586 return "" 

587 return None 

588 

589 may_be_duplicates = False 

590 

591 def parse_pron_post_template_fn( 

592 name: str, ht: TemplateArgs, text: str 

593 ) -> str | None: 

594 # _post_template_fn handles templates *after* the work to expand 

595 # them has been done; this is exactly the same as _template_fn, 

596 # except with the additional expanded text as an input, and 

597 # possible side-effects from the expansion and recursion (like 

598 # calling other subtemplates that are handled in _template_fn. 

599 nonlocal may_be_duplicates 

600 if is_panel_template(wxr, name): 600 ↛ 601line 600 didn't jump to line 601 because the condition on line 600 was never true

601 return "" 

602 if name in PRON_POS_TEMPLATE_NAMES: 

603 pos_match = extract_pronunciation_pos_template( 

604 wxr, name, ht, lang_code 

605 ) 

606 if pos_match.pos_values: 

607 pron_pos_markers.append(pos_match.pos_values) 

608 marker = ( 

609 f"__PRON_POS_MARKER_{len(pron_pos_markers) - 1}__" 

610 ) 

611 if pos_match.residual: 611 ↛ 612line 611 didn't jump to line 612 because the condition on line 611 was never true

612 return f"{marker} ({pos_match.residual})" 

613 return marker 

614 if name in { 

615 *PRON_POS_TEMPLATE_NAMES, 

616 "l", 

617 "link", 

618 }: 

619 # Kludge: when these templates expand to /.../ or [...], 

620 # replace the expansion by something safe. This is used 

621 # to filter spurious IPA-looking expansions that aren't really 

622 # IPAs. We probably don't care about these templates in the 

623 # contexts where they expand to something containing these. 

624 v = re.sub(r'href="[^"]*"', "", text) # Ignore URLs 

625 v = re.sub(r'src="[^"]*"', "", v) 

626 v = clean_value(wxr, v) 

627 if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v): 

628 # Note: replacing by empty results in Lua errors that we 

629 # would rather not have. For example, voi/Middle Vietnamese 

630 # uses {{a|{{l{{vi|...}}}}, and the {{a|...}} will fail 

631 # if {{l|...}} returns empty. 

632 return "stripped-by-parse_pron_post_template_fn" 

633 if name in ("IPA", "enPR"): 

634 # Extract the data from IPA and enPR templates (same underlying 

635 # template) and replace them in-text with magical cookie that 

636 # can be later used to refer to the data's index inside 

637 # pron_templates. 

638 if pron_t := extract_pron_template(wxr, name, ht, text): 

639 pron_templates.append(pron_t) 

640 return f"__PRON_TEMPLATE_{len(pron_templates) - 1}__" 

641 # Catch templates that generate duplicate sound data entries 

642 # here; if the text produces a big, toggleable section, the 

643 # "header" for that section might be duplicated. Add more conditions 

644 # if necessary. 

645 if text.startswith("<") and "vsToggleElement" in text: 645 ↛ 646line 645 didn't jump to line 646 because the condition on line 645 was never true

646 may_be_duplicates = True 

647 return text 

648 

649 def flattened_tree( 

650 lines: list[WikiNode | str], 

651 ) -> Iterator[FlattenedListNode]: 

652 assert isinstance(lines, list) 

653 for line in lines: 

654 yield from flattened_tree1(line, 0) 

655 

656 def flattened_tree1( 

657 node: WikiNode | str, list_depth: int 

658 ) -> Iterator[FlattenedListNode]: 

659 assert isinstance(node, (WikiNode, str)) 

660 if isinstance(node, str): 

661 yield FlattenedListNode(node, list_depth) 

662 return 

663 elif node.kind == NodeKind.LIST: 

664 for item in node.children: 

665 yield from flattened_tree1(item, list_depth) 

666 elif node.kind == NodeKind.LIST_ITEM: 

667 item_depth = ( 

668 len(node.sarg) if isinstance(node.sarg, str) else list_depth 

669 ) 

670 new_children = [] 

671 sublist = None 

672 for child in node.children: 

673 if isinstance(child, WikiNode) and child.kind == NodeKind.LIST: 

674 sublist = child 

675 else: 

676 new_children.append(child) 

677 node.children = new_children 

678 node.sarg = "*" 

679 yield FlattenedListNode(node, item_depth) 

680 if sublist: 

681 yield from flattened_tree1(sublist, item_depth) 

682 else: 

683 yield FlattenedListNode(node, list_depth) 

684 

685 # XXX Do not use flattened_tree more than once here, for example for 

686 # debug printing... The underlying data is changed, and the separated 

687 # sublists disappear. 

688 

689 # Kludge for templates that generate several lines, but haven't 

690 # been caught by earlier kludges... 

691 def split_cleaned_node_on_newlines( 

692 contents: list[WikiNode | str], 

693 ) -> Iterator[tuple[str, int]]: 

694 for flattened in flattened_tree(contents): 

695 ipa_text = clean_node( 

696 wxr, 

697 data, 

698 flattened.node, 

699 template_fn=parse_pronunciation_template_fn, 

700 post_template_fn=parse_pron_post_template_fn, 

701 ) 

702 for line in ipa_text.splitlines(): 

703 yield line, flattened.list_depth 

704 

705 # have_pronunciations = False 

706 active_pos: PronunciationPoses | None = None 

707 # POS values from parent pronunciation lines by original list depth. 

708 # Audio-only child lines can inherit from a parent pronunciation line, 

709 # but same-depth audio lines must not inherit from a preceding IPA. 

710 pronunciation_pos_stack: list[tuple[int, PronunciationPoses]] = [] 

711 

712 def parent_pronunciation_pos( 

713 list_depth: int, 

714 ) -> PronunciationPoses | None: 

715 if not pronunciation_pos_stack: 

716 return None 

717 parent_depth, parent_pos = pronunciation_pos_stack[-1] 

718 return parent_pos if parent_depth < list_depth else None 

719 

720 for line, list_depth in split_cleaned_node_on_newlines(contents): 

721 prefix: str | None = None 

722 earlier_base_data: SoundData | None = None 

723 line_pos: PronunciationPoses | None = None 

724 current_group_sounds: list[SoundData] = [] 

725 # POS values seen on sounds extracted from this physical line. A 

726 # single candidate can seed adjacent audio-only child lines; multiple 

727 # POS-marked sounds on one line are too ambiguous for inheritance. 

728 line_sound_pos_candidates: set[PronunciationPoses] = set() 

729 line_has_sound = False 

730 if not line: 730 ↛ 731line 730 didn't jump to line 731 because the condition on line 730 was never true

731 continue 

732 while ( 

733 pronunciation_pos_stack 

734 and pronunciation_pos_stack[-1][0] >= list_depth 

735 ): 

736 pronunciation_pos_stack.pop() 

737 

738 split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line) 

739 for i, text in enumerate(split_templates): 

740 if not text: 

741 continue 

742 # clean up starts at the start of the line 

743 text = re.sub(r"^\**\s*", "", text).strip() 

744 if i == 0: 

745 # At the start of a line, check for stuff like "Noun:" 

746 # or "(verb)" for POS labels that apply to this line or 

747 # structurally nested pronunciation lines. 

748 # These labels feed the inheritance state that later sets the 

749 # temporary sound["pos"] field used to route pronunciation 

750 # data into matching POS sections. 

751 if pos_prefix := extract_pos_prefix(text): 

752 text = pos_prefix.text 

753 line_pos = pos_prefix.pos_values 

754 if pos_prefix.is_persistent: 

755 active_pos = pos_prefix.pos_values 

756 if not text: 

757 continue 

758 

759 m = re.search(r"__PRON_POS_MARKER_(\d+)__", text) 

760 while m: 

761 if current_group_sounds and re.search( 

762 r"[,;]", text[: m.start()] 

763 ): 

764 current_group_sounds = [] 

765 pos_values = pron_pos_markers[int(m.group(1))] 

766 if current_group_sounds: 

767 for sound in current_group_sounds: 

768 set_sound_pos(sound, pos_values) 

769 line_sound_pos_candidates.add(pos_values) 

770 line_pos = pos_values 

771 text = text[: m.start()] + text[m.end() :] 

772 m = re.search(r"__PRON_POS_MARKER_(\d+)__", text) 

773 text = text.strip() 

774 if not text: 

775 continue 

776 # POS inheritance for normal pronunciation data: 

777 # 1. line_pos: explicit POS marker on this line, e.g. 

778 # "* {{q|noun}} {{IPA|...}}". 

779 # 2. parent_pronunciation_pos: structurally inherited from a 

780 # parent list item, e.g. "* {{q|noun}}" then "** {{IPA|...}}". 

781 # 3. active_pos: support for "* Noun:" followed by 

782 # "* {{IPA|...}}"; broad, so it stays after structural data. 

783 inherited_pos = ( 

784 line_pos or parent_pronunciation_pos(list_depth) or active_pos 

785 ) 

786 

787 if i % 2 == 1: 

788 # re.split (with capture groups) splits the lines so that 

789 # every even entry is a captured splitter; odd lines are either 

790 # empty strings or stuff around the splitters. 

791 base_pron_data, first_prons = pron_templates[int(text)] 

792 if base_pron_data: 

793 earlier_base_data = base_pron_data 

794 # print(f"Set {earlier_base_data=}") 

795 elif earlier_base_data is not None: 

796 # merge data from an earlier iteration of this loop 

797 for pr in first_prons: 

798 if "note" in pr and "note" in earlier_base_data: 798 ↛ 799line 798 didn't jump to line 799 because the condition on line 798 was never true

799 pr["note"] += ";" + earlier_base_data.get( 

800 "note", "" 

801 ) 

802 elif "note" in earlier_base_data: 802 ↛ 803line 802 didn't jump to line 803 because the condition on line 802 was never true

803 pr["note"] = earlier_base_data["note"] 

804 if "topics" in earlier_base_data: 804 ↛ 805line 804 didn't jump to line 805 because the condition on line 804 was never true

805 data_extend( 

806 pr, "topics", earlier_base_data["topics"] 

807 ) 

808 if "tags" in pr and "tags" in earlier_base_data: 808 ↛ 809line 808 didn't jump to line 809 because the condition on line 808 was never true

809 pr["tags"].extend(earlier_base_data["tags"]) 

810 elif "tags" in earlier_base_data: 810 ↛ 797line 810 didn't jump to line 797 because the condition on line 810 was always true

811 pr["tags"] = sorted(set(earlier_base_data["tags"])) 

812 for pr in first_prons: 

813 if sound_pos := set_sound_pos( 

814 pr, 

815 None if "pos" in pr else inherited_pos, 

816 ): 

817 line_sound_pos_candidates.add(sound_pos) 

818 if pr not in data.get("sounds", ()): 

819 data_append(data, "sounds", pr) 

820 current_group_sounds.append(pr) 

821 line_has_sound = True 

822 # This bit is handled 

823 continue 

824 

825 if "IPA" in text: 

826 field: Literal[ 

827 "audio", 

828 "audio-ipa", 

829 "enpr", 

830 "form", 

831 "hangeul", 

832 "homophone", 

833 "ipa", 

834 "mp3_url", 

835 "note", 

836 "ogg_url", 

837 "other", 

838 "rhymes", 

839 "tags", 

840 "text", 

841 "topics", 

842 "zh-pron", 

843 ] = "ipa" 

844 else: 

845 # This is used for Rhymes, Homophones, etc 

846 field = "other" 

847 

848 # Check if it contains Japanese "Tokyo" pronunciation with 

849 # special syntax 

850 m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text) 

851 if m: 851 ↛ 852line 851 didn't jump to line 852 because the condition on line 851 was never true

852 pron: SoundData = {field: m.group(1)} # type: ignore[misc] 

853 if sound_pos := set_sound_pos(pron, inherited_pos): 

854 line_sound_pos_candidates.add(sound_pos) 

855 data_append(data, "sounds", pron) 

856 current_group_sounds.append(pron) 

857 line_has_sound = True 

858 # have_pronunciations = True 

859 continue 

860 

861 # Check if it contains Rhymes 

862 m = re.match(r"\s*Rhymes?: (.*)", text) 

863 if m: 

864 for ending in split_at_comma_semi(m.group(1)): 

865 ending = ending.strip() 

866 if ending: 866 ↛ 864line 866 didn't jump to line 864 because the condition on line 866 was always true

867 pron = {"rhymes": ending} 

868 if sound_pos := set_sound_pos(pron, inherited_pos): 

869 line_sound_pos_candidates.add(sound_pos) 

870 data_append(data, "sounds", pron) 

871 current_group_sounds.append(pron) 

872 line_has_sound = True 

873 # have_pronunciations = True 

874 continue 

875 

876 # Check if it contains homophones 

877 m = re.search(r"(?m)\bHomophones?: (.*)", text) 

878 if m: 

879 for w in split_at_comma_semi(m.group(1)): 

880 w = w.strip() 

881 if w: 881 ↛ 879line 881 didn't jump to line 879 because the condition on line 881 was always true

882 pron = {"homophone": w} 

883 if sound_pos := set_sound_pos(pron, inherited_pos): 

884 line_sound_pos_candidates.add(sound_pos) 

885 data_append(data, "sounds", pron) 

886 current_group_sounds.append(pron) 

887 line_has_sound = True 

888 # have_pronunciations = True 

889 continue 

890 

891 # Check if it contains Phonetic hangeul 

892 m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text) 

893 if m: 893 ↛ 894line 893 didn't jump to line 894 because the condition on line 893 was never true

894 seen = set() 

895 for w in m.group(1).split("/"): 

896 w = w.strip() 

897 if w and w not in seen: 

898 seen.add(w) 

899 pron = {"hangeul": w} 

900 if sound_pos := set_sound_pos(pron, inherited_pos): 

901 line_sound_pos_candidates.add(sound_pos) 

902 data_append(data, "sounds", pron) 

903 current_group_sounds.append(pron) 

904 line_has_sound = True 

905 # have_pronunciations = True 

906 

907 # This regex-based hyphenation detection left as backup 

908 m = re.search(r"\b(Syllabification|Hyphenation): *([^\n.]*)", text) 

909 if m: 

910 data_append(data, "hyphenation", m.group(2)) 

911 commaseparated = m.group(2).split(",") 

912 if len(commaseparated) > 1: 912 ↛ 923line 912 didn't jump to line 923 because the condition on line 912 was always true

913 for h in commaseparated: 

914 # That second characters looks like a dash but it's 

915 # actually unicode decimal code 8231, hyphenation dash 

916 # Add more delimiters here if needed. 

917 parts = re.split(r"-|‧", h.strip()) 

918 data_append( 

919 data, "hyphenations", Hyphenation(parts=parts) 

920 ) 

921 ... 

922 else: 

923 data_append( 

924 data, 

925 "hyphenations", 

926 Hyphenation(parts=m.group(2).split(sep="-")), 

927 ) 

928 # have_pronunciations = True 

929 

930 # See if it contains a word prefix restricting which forms the 

931 # pronunciation applies to (see amica/Latin) and/or parenthesized 

932 # tags. 

933 m = re.match( 

934 r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text 

935 ) 

936 if m: 

937 prefix = m.group(2) or "" 

938 tagstext = m.group(3) 

939 text = text[m.end() :] 

940 else: 

941 m = re.match(r"^[*#\s]*([-\w]+):\s+", text) 

942 if m: 

943 prefix = m.group(1) 

944 tagstext = "" 

945 text = text[m.end() :] 

946 else: 

947 # Spanish has tags before pronunciations, eg. aceite/Spanish 

948 m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text) 

949 if m: 949 ↛ 950line 949 didn't jump to line 950 because the condition on line 949 was never true

950 tagstext = m.group(1) 

951 text = m.group(2) 

952 else: 

953 # No prefix. In this case, we inherit prefix 

954 # from previous entry. This particularly 

955 # applies for nested Audio files. 

956 tagstext = "" 

957 if tagstext: 

958 earlier_base_data = {} 

959 parse_pronunciation_tags_with_pos( 

960 wxr, tagstext, earlier_base_data 

961 ) 

962 

963 # Find romanizations from the pronunciation section (routinely 

964 # produced for Korean by {{ko-IPA}}) 

965 for m in re.finditer(pron_romanization_re, text): 965 ↛ 966line 965 didn't jump to line 966 because the loop on line 965 never started

966 prefix = m.group(1) 

967 w = m.group(2).strip() 

968 tag = pron_romanizations[prefix] 

969 form = {"form": w, "tags": tag.split()} 

970 data_append(data, "forms", form) 

971 

972 # Find IPA pronunciations 

973 for m in re.finditer( 

974 r"(?m)/[^][\n/,]+?/" r"|" r"\[[^]\n0-9,/][^],/]*?\]", text 

975 ): 

976 v = m.group(0) 

977 # The regexp above can match file links. Skip them. 

978 if v.startswith("[[File:"): 978 ↛ 979line 978 didn't jump to line 979 because the condition on line 978 was never true

979 continue 

980 if v == "/wiki.local/": 980 ↛ 981line 980 didn't jump to line 981 because the condition on line 980 was never true

981 continue 

982 if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text: 982 ↛ 983line 982 didn't jump to line 983 because the condition on line 982 was never true

983 m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text) 

984 assert m 

985 idx = int(m.group(1)) 

986 if idx >= len(audios): 

987 continue 

988 if not audios[idx].get("audio-ipa"): 

989 audios[idx]["audio-ipa"] = v 

990 if prefix: 

991 audios[idx]["form"] = prefix 

992 else: 

993 if earlier_base_data: 

994 pron = deepcopy(earlier_base_data) 

995 pron[field] = v 

996 else: 

997 pron = {field: v} # type: ignore[misc] 

998 if prefix: 

999 pron["form"] = prefix 

1000 if sound_pos := set_sound_pos( 

1001 pron, 

1002 None if "pos" in pron else inherited_pos, 

1003 ): 

1004 line_sound_pos_candidates.add(sound_pos) 

1005 if may_be_duplicates is True: 1005 ↛ 1006line 1005 didn't jump to line 1006 because the condition on line 1005 was never true

1006 ok = True 

1007 for comp_sound in data.get("sounds", []): 

1008 # Python has dict comparison since 3.8 

1009 if pron == comp_sound: 

1010 ok = False 

1011 break 

1012 if ok: 

1013 data_append(data, "sounds", pron) 

1014 else: 

1015 data_append(data, "sounds", pron) 

1016 current_group_sounds.append(pron) 

1017 line_has_sound = True 

1018 # have_pronunciations = True 

1019 if current_group_sounds and re.search(r"[,;]", text): 

1020 current_group_sounds = [] 

1021 # XXX what about {{hyphenation|...}}, {{hyph|...}} 

1022 # and those used to be stored under "hyphenation" 

1023 

1024 # Add data that was collected in template_fn 

1025 # POS inheritance for audio has one extra source: 

1026 # common_sound_pos(line_sound_pos_candidates), from pronunciations 

1027 # extracted earlier on the same physical line, e.g. 

1028 # "* {{IPA|en|/foo/|a=verb}} {{audio|en|foo.wav}}". 

1029 # Explicit line_pos still wins, then same-line sound agreement, then 

1030 # parent-list structure, then active_pos. 

1031 audio_inherited_pos = ( 

1032 line_pos 

1033 or common_sound_pos(line_sound_pos_candidates) 

1034 or parent_pronunciation_pos(list_depth) 

1035 or active_pos 

1036 ) 

1037 for audio in audios: 

1038 if "audio" in audio: 1038 ↛ 1095line 1038 didn't jump to line 1095 because the condition on line 1038 was always true

1039 # Compute audio file URLs 

1040 fn = audio["audio"] 

1041 # Strip certain characters, e.g., left-to-right mark 

1042 fn = re.sub(r"[\u200f\u200e]", "", fn) 

1043 fn = fn.strip() 

1044 fn = urllib.parse.unquote(fn) 

1045 # First character is usually uppercased 

1046 if re.match(r"^[a-z][a-z]+", fn): 

1047 fn = fn[0].upper() + fn[1:] 

1048 if fn in wxr.config.redirects: 1048 ↛ 1049line 1048 didn't jump to line 1049 because the condition on line 1048 was never true

1049 fn = wxr.config.redirects[fn] 

1050 # File extension is lowercased 

1051 # XXX some words seem to need this, some don't seem to 

1052 # have this??? what is the exact rule? 

1053 # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn) 

1054 # Spaces are converted to underscores 

1055 fn = re.sub(r"\s+", "_", fn) 

1056 # Compute hash digest part 

1057 h = hashlib.md5() 

1058 hname = fn.encode("utf-8") 

1059 h.update(hname) 

1060 digest = h.hexdigest() 

1061 # Quote filename for URL 

1062 qfn = urllib.parse.quote(fn) 

1063 # For safety when writing files 

1064 qfn = qfn.replace("/", "__slash__") 

1065 if re.search(r"(?i)\.(ogg|oga)$", fn): 

1066 ogg = ( 

1067 "https://upload.wikimedia.org/wikipedia/" 

1068 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

1069 ) 

1070 else: 

1071 ogg = ( 

1072 "https://upload.wikimedia.org/wikipedia/" 

1073 "commons/transcoded/" 

1074 "{}/{}/{}/{}.ogg".format( 

1075 digest[:1], digest[:2], qfn, qfn 

1076 ) 

1077 ) 

1078 if re.search(r"(?i)\.(mp3)$", fn): 1078 ↛ 1079line 1078 didn't jump to line 1079 because the condition on line 1078 was never true

1079 mp3 = ( 

1080 "https://upload.wikimedia.org/wikipedia/" 

1081 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn) 

1082 ) 

1083 else: 

1084 mp3 = ( 

1085 "https://upload.wikimedia.org/wikipedia/" 

1086 "commons/transcoded/" 

1087 "{}/{}/{}/{}.mp3".format( 

1088 digest[:1], digest[:2], qfn, qfn 

1089 ) 

1090 ) 

1091 audio["ogg_url"] = ogg 

1092 audio["mp3_url"] = mp3 

1093 if "pos" not in audio: 1093 ↛ 1095line 1093 didn't jump to line 1095 because the condition on line 1093 was always true

1094 set_sound_pos(audio, audio_inherited_pos) 

1095 if audio not in data.get("sounds", ()): 

1096 data_append(data, "sounds", audio) 

1097 line_has_sound = True 

1098 

1099 # if audios: 

1100 # have_pronunciations = True 

1101 audios = [] 

1102 

1103 data_extend(data, "hyphenations", hyphenations) 

1104 hyphenations = [] 

1105 

1106 if line_pos and not line_has_sound: 

1107 active_pos = line_pos 

1108 pronunciation_pos_stack.append((list_depth, line_pos)) 

1109 elif line_pronunciation_pos := common_sound_pos( 

1110 line_sound_pos_candidates 

1111 ): 

1112 pronunciation_pos_stack.append((list_depth, line_pronunciation_pos)) 

1113 

1114 ## I have commented out the otherwise unused have_pronunciation 

1115 ## toggles; uncomment them to use this debug print 

1116 # if not have_pronunciations and not have_panel_templates: 

1117 # wxr.wtp.debug("no pronunciations found from pronunciation section", 

1118 # sortid="pronunciations/533") 

1119 

1120 

def extract_th_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    """Extract sound data from a ``{{th-pron}}`` template.

    See https://en.wiktionary.org/wiki/Template:th-pron

    The template is expanded and re-parsed, then every ``<table>`` returned
    by ``find_html("table")`` is walked row by row:

    * ``<th>`` cells select how the row's ``<td>`` cells are read ("ipa",
      "homophone", "audio", or the default "other"); any other non-empty
      header text becomes raw tags that apply to the row and, through
      ``rowspan``, to the following rows.
    * ``<td>`` cells yield the sound dictionaries.

    The collected sounds are appended to ``word_entry["sounds"]``.
    """

    @dataclass
    class TableHeader:
        # Raw tag strings split out of the header cell text.
        raw_tags: list[str]
        # Number of rows (including the current one) the header still spans.
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Carry over only the headers that still span this row.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers

            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(standard) IPA"):
                    field = "ipa"
                elif header_str.startswith("Homophones"):
                    field = "homophone"
                elif header_str == "Audio":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = create_audio_url_dict(filename)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append({"homophone": word})
                else:
                    # Tags found in "[...]" <small> elements of this cell;
                    # they apply to the spans that follow within the cell.
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                # Unbracketed <small> text is stored as the
                                # romanization of the previous sound.
                                sounds[-1]["roman"] = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = {}
                                for raw_tag in raw_tags:
                                    if raw_tag in valid_tags:
                                        data_append(sound, "tags", raw_tag)
                                    else:
                                        data_append(sound, "raw_tags", raw_tag)
                                for header in row_headers:
                                    for raw_tag in header.raw_tags:
                                        if raw_tag.lower() in valid_tags:
                                            data_append(
                                                sound, "tags", raw_tag.lower()
                                            )
                                        else:
                                            data_append(
                                                sound, "raw_tags", raw_tag
                                            )
                                # Decide the storage key per sound. The
                                # row-level ``field`` must not be overwritten
                                # here, otherwise a single romanization would
                                # force every later span of the row to be
                                # stored under "roman" as well.
                                sound_field = field
                                if "romanization" in sound.get("tags", []):
                                    sound_field = "roman"
                                sound[sound_field] = node_str
                                sounds.append(sound)

    # NOTE(review): presumably run for its side effects on word_entry
    # (e.g. category collection) -- confirm against clean_node.
    clean_node(wxr, word_entry, expanded_node)
    data_extend(word_entry, "sounds", sounds)

1225 

1226 

def extract_zh_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    """Extract sound data from a ``{{zh-pron}}`` template.

    See https://en.wiktionary.org/wiki/Template:zh-pron

    The template is expanded and re-parsed, and each list in the result is
    handed item by item to ``extract_zh_pron_list_item``. Lists recorded in
    the shared "seen" set are skipped here. The collected sounds are
    appended to ``word_entry["sounds"]``.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    visited_lists = set()
    collected = []
    for lst in root.find_child_recursively(NodeKind.LIST):
        if lst in visited_lists:
            # Already handled by a recursive call from a parent list item.
            continue
        for item in lst.find_child(NodeKind.LIST_ITEM):
            collected += extract_zh_pron_list_item(
                wxr, item, [], visited_lists
            )
    # NOTE(review): presumably run for its side effects on word_entry
    # (e.g. category collection) -- confirm against clean_node.
    clean_node(wxr, word_entry, root)
    data_extend(word_entry, "sounds", collected)

1244 

1245 

def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[SoundData]:
    """Collect sounds from one list item of an expanded ``{{zh-pron}}``.

    ``raw_tags`` are the tags inherited from the enclosing list items; they
    are copied, never modified. Links and the first ``<small>`` element add
    to the tags passed down, ``<span>`` and homophone ``<table>`` elements
    produce sounds, and nested lists are processed recursively and recorded
    in ``seen_lists``.
    """
    tags_so_far = list(raw_tags)
    found = []
    first_small_pending = True
    for child in list_item.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            target = clean_node(wxr, None, child.largs)
            label = clean_node(wxr, None, child)
            if target.startswith("File:"):
                # A file link is an audio recording.
                audio = create_audio_url_dict(target.removeprefix("File:"))
                audio["raw_tags"] = tags_so_far[:]
                translate_zh_pron_raw_tags(audio)
                found.append(audio)
            elif label != "":
                # Any other link text becomes a tag for what follows.
                tags_so_far.append(label)
            continue

        if isinstance(child, HTMLNode):
            if child.tag == "small":
                if first_small_pending:
                    # The first <small> adds to the inherited tags; <sup>
                    # children are left out of its text.
                    without_sup = [
                        n
                        for n in child.children
                        if not (isinstance(n, HTMLNode) and n.tag == "sup")
                    ]
                    small_text = clean_node(wxr, None, without_sup)
                    tags_so_far.extend(split_zh_pron_raw_tag(small_text))
                elif found:
                    # Later <small> elements qualify the most recent sound.
                    latest = found[-1]
                    data_extend(
                        latest,
                        "raw_tags",
                        split_zh_pron_raw_tag(clean_node(wxr, None, child)),
                    )
                    translate_zh_pron_raw_tags(latest)
                first_small_pending = False
            elif child.tag == "span":
                found.extend(extract_zh_pron_span(wxr, child, tags_so_far))
            elif (
                child.tag == "table"
                and len(tags_so_far) > 0
                and tags_so_far[-1] == "Homophones"
            ):
                found.extend(
                    extract_zh_pron_homophone_table(wxr, child, tags_so_far)
                )
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Record the nested list so callers can recognise it as handled.
            seen_lists.add(child)
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                found.extend(
                    extract_zh_pron_list_item(
                        wxr, nested_item, tags_so_far, seen_lists
                    )
                )

    return found

1307 

1308 

def extract_zh_pron_homophone_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    """Build homophone sounds from a ``{{zh-pron}}`` homophone table.

    Every ``<span>`` in a ``<td>`` that has a non-empty ``lang`` attribute,
    a class of "Hant", "Hans" or "Hani", and text other than "" or "/"
    becomes one sound carrying a copy of ``raw_tags``.
    """
    # Script class -> tag to add (None: accepted, but no extra tag).
    script_tags = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
        "Hani": None,
    }
    results = []
    for cell in table.find_html_recursively("td"):
        for span in cell.find_html("span"):
            css_class = span.attrs.get("class", "")
            lang = span.attrs.get("lang", "")
            text = clean_node(wxr, None, span)
            if text in ["", "/"] or lang == "" or css_class not in script_tags:
                continue
            entry = {"homophone": text, "raw_tags": raw_tags[:]}
            script_tag = script_tags[css_class]
            if script_tag is not None:
                data_append(entry, "tags", script_tag)
            translate_zh_pron_raw_tags(entry)
            results.append(entry)

    return results

1332 

1333 

def translate_zh_pron_raw_tags(sound: SoundData):
    """Move recognised entries of ``sound["raw_tags"]`` into ``sound["tags"]``.

    Each raw tag is handled as follows:

    * found in ``ZH_PRON_TAGS``: the mapped tag (a string) or tags (a list)
      are added to ``sound["tags"]``;
    * otherwise found in ``valid_tags``: the raw tag itself is added to
      ``sound["tags"]``;
    * otherwise: kept in ``sound["raw_tags"]`` (each value once).

    Tags already present in ``sound["tags"]`` are not added again. If no raw
    tag remains, the "raw_tags" key is removed. ``sound`` is modified in
    place.
    """
    # Imported here as in the original; presumably to defer loading the
    # mapping until it is needed.
    from .zh_pron_tags import ZH_PRON_TAGS

    raw_tags = []
    for raw_tag in sound.get("raw_tags", []):
        if raw_tag in ZH_PRON_TAGS:
            tr_tag = ZH_PRON_TAGS[raw_tag]
            if isinstance(tr_tag, str):
                if tr_tag not in sound.get("tags", []):
                    data_append(sound, "tags", tr_tag)
            elif isinstance(tr_tag, list):
                # Compare element by element: testing whether the whole list
                # is a member of the list of tag strings is always false and
                # therefore never prevented duplicates.
                new_tags = []
                for tag in tr_tag:
                    if tag not in sound.get("tags", []) and tag not in new_tags:
                        new_tags.append(tag)
                data_extend(sound, "tags", new_tags)
        elif raw_tag in valid_tags:
            if raw_tag not in sound.get("tags", []):
                data_append(sound, "tags", raw_tag)
        elif raw_tag not in raw_tags:
            raw_tags.append(raw_tag)

    if len(raw_tags) > 0:
        sound["raw_tags"] = raw_tags
    elif "raw_tags" in sound:
        del sound["raw_tags"]

1357 

1358 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a raw tag string from ``{{zh-pron}}`` into individual tags.

    Text without parentheses is split at ",", ":", ";" and " and "; each
    piece is stripped, a leading "incl. " is removed, and empty pieces are
    dropped.

    For text with parentheses, the content of every innermost ``(...)``
    group is split recursively. The text outside those groups is split the
    same way and its tags are placed before the tags from inside the groups.

    Text that contains "(" but no complete non-empty ``(...)`` group (e.g.
    an unbalanced parenthesis) is returned as a single stripped tag.
    """
    if "(" not in raw_tag_text:
        raw_tags = []
        for raw_tag in re.split(r",|:|;| and ", raw_tag_text):
            raw_tag = raw_tag.strip().removeprefix("incl. ").strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
        return raw_tags

    # Tags found inside the innermost parenthesised groups.
    inner_tags = []
    for inner_text in re.findall(r"\(([^()]+)\)", raw_tag_text):
        inner_tags.extend(split_zh_pron_raw_tag(inner_text))

    # The text with those groups removed.
    outside = re.sub(r"\([^()]+\)", "", raw_tag_text)
    if outside != raw_tag_text:
        # ``outside`` is strictly shorter, so the recursion terminates.
        return split_zh_pron_raw_tag(outside) + inner_tags

    # No group could be removed: keep the text as one tag, stripped like
    # every other tag so later lookups are not defeated by stray spaces.
    leftover = raw_tag_text.strip()
    return [leftover] if leftover != "" else []

1387 

1388 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    """Build sounds from one ``<span>`` of an expanded ``{{zh-pron}}``.

    The span's children are sorted into: tags from a ``<small>`` element, a
    romanization from a ``<span>`` whose ``lang`` contains "-Latn", a
    "[Phonetic: ...]" reading, and everything else, which is the
    pronunciation text. That text is split with ``split_zh_pron``; each
    piece becomes a sound under "ipa" if the span's class contains "IPA",
    otherwise under "zh_pron". Every sound starts with a copy of
    ``raw_tags`` and is passed through ``translate_zh_pron_raw_tags``.
    """
    extra_tags = []
    body_nodes = []
    romanization = ""
    phonetic = ""
    for pos, child in enumerate(span_tag.children):
        child_is_html = isinstance(child, HTMLNode)
        if child_is_html and child.tag == "small":
            extra_tags = split_zh_pron_raw_tag(clean_node(wxr, None, child))
        elif (
            child_is_html
            and child.tag == "span"
            and "-Latn" in child.attrs.get("lang", "")
        ):
            romanization = clean_node(wxr, None, child).strip("() ")
        elif isinstance(child, str) and child.strip() == "[Phonetic:":
            # Everything after the marker is the phonetic reading.
            rest = span_tag.children[pos + 1 :]
            phonetic = clean_node(wxr, None, rest).strip("] ")
            break
        else:
            body_nodes.append(child)

    value_key = "ipa" if "IPA" in span_tag.attrs.get("class", "") else "zh_pron"
    results = []
    for piece in split_zh_pron(clean_node(wxr, None, body_nodes)):
        piece = piece.strip("[]: ")
        if len(piece) == 0:
            continue
        entry = {value_key: piece, "raw_tags": raw_tags[:]}
        if romanization != "":
            entry["roman"] = romanization
        results.append(entry)

    if results:
        # The <small> tags are attached to the last pronunciation only.
        data_extend(results[-1], "raw_tags", extra_tags)

    if phonetic != "":
        phonetic_entry = {
            "zh_pron": phonetic,
            "raw_tags": raw_tags[:] + ["Phonetic"],
        }
        if romanization != "":
            phonetic_entry["roman"] = romanization
        results.append(phonetic_entry)

    for entry in results:
        translate_zh_pron_raw_tags(entry)
    return results

1436 

1437 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string at separators outside parentheses.

    The separators are ",", ";" and "→", plus "/" unless the whole string
    starts with "/" (presumably slash-delimited notation such as IPA, where
    "/" is part of the value). Separators inside parentheses are kept as
    ordinary text.

    Returns the non-empty pieces with surrounding whitespace removed.
    Separator characters never appear in a piece unless they were inside
    parentheses, so leading or repeated separators produce no stray
    characters.
    """
    separators = {",", ";", "→"}
    if not zh_pron.startswith("/"):
        separators.add("/")

    parentheses = 0
    pron_list = []
    current = []
    for c in zh_pron:
        if c in separators and parentheses == 0:
            # Always consume the separator, even when nothing precedes it;
            # otherwise it would leak into the following piece.
            piece = "".join(current).strip()
            if piece:
                pron_list.append(piece)
            current = []
            continue
        if c == "(":
            parentheses += 1
        elif c == ")":
            parentheses -= 1
        current.append(c)

    piece = "".join(current).strip()
    if piece:
        pron_list.append(piece)
    return pron_list