Coverage for src/wiktextract/extractor/en/pronunciation.py: 80%
633 statements
import hashlib
import re
import urllib
from copy import deepcopy
from dataclasses import dataclass
from typing import Iterator

from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...clean import clean_value
from ...datautils import data_append, data_extend, split_at_comma_semi
from ...page import LEVEL_KINDS, clean_node, is_panel_template
from ...tags import valid_tags
from ...wxr_context import WiktextractContext
from ..share import create_audio_url_dict
from .form_descriptions import (
    classify_desc,
    decode_tags,
    parse_pronunciation_tags,
)
from .parts_of_speech import part_of_speech_map
from .type_utils import Hyphenation, SoundData, TemplateArgs, WordData

# Prefixes, tags, and regexp for finding romanizations from the pronunciation
# section
pron_romanizations = {
    " Revised Romanization ": "romanization revised",
    " Revised Romanization (translit.) ": "romanization revised transliteration",
    " McCune-Reischauer ": "McCune-Reischauer romanization",
    " McCune–Reischauer ": "McCune-Reischauer romanization",
    " Yale Romanization ": "Yale romanization",
}
pron_romanization_re = re.compile(
    "(?m)^("
    + "|".join(
        re.escape(x)
        for x in sorted(pron_romanizations.keys(), key=len, reverse=True)
    )
    + ")([^\n]+)"
)
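
# Illustrative sketch (hypothetical line of {{ko-IPA}}-style output): on
# " Revised Romanization gugeo", group(1) is the prefix
# " Revised Romanization " and group(2) is the romanized form "gugeo".
# Sorting the prefixes longest-first keeps " Revised Romanization (translit.) "
# from being shadowed by its shorter variant.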

IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$"
IPA_EXTRACT_RE = re.compile(IPA_EXTRACT)
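# A minimal worked example (hypothetical input): matching
# "(UK) IPA⁽ᵏᵉʸ⁾: /fuː/ (rare)" gives group(2) == "UK" (the qualifier for
# the whole line), group(4) == "/fuː/ (rare)", group(5) == "/fuː/" and
# group(7) == "rare" (a possible post-qualifier of the last entry).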


def extract_pron_template(
    wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str
) -> tuple[SoundData, list[SoundData]] | None:
    """In post_template_fn, this is used to handle all enPR and IPA templates
    so that we can leave breadcrumbs in the text that can be handled later.
    We return a `base_data` so that if there are two or more templates on
    the same line, like this:
    (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/
    then we can apply base_data fields to the other templates, too, if
    needed.
    """
    cleaned = clean_value(wxr, expanded)
    # print(f"extract_pron_template input: {tname=} {expanded=} -> {cleaned=}")
    m = IPA_EXTRACT_RE.match(cleaned)
    if not m:
        wxr.wtp.error(
            f"Text cannot match IPA_EXTRACT_RE regex: "
            f"{cleaned=}, {tname=}, {targs=}",
            sortid="en/pronunciation/54",
        )
        return None
    # for i, group in enumerate(m.groups()):
    #     print(i + 1, repr(group))
    main_qual = m.group(2) or ""
    if "qq" in targs:
        # The template has been given a qualifier that applies to every
        # entry, but which also happens to appear at the end, where it can
        # be confused with the post-qualifier of a single entry in the
        # style of "... /ipa3/ (foo) (bar)": foo might not be present, so
        # bar looks like it might apply only to `/ipa3/`.
        pron_body = m.group(5)
        post_qual = m.group(7)
    else:
        pron_body = m.group(4)
        post_qual = ""

    if not pron_body:
        wxr.wtp.error(
            f"Regex failed to find 'body' from {cleaned=}",
            sortid="en/pronunciation/81",
        )
        return None

    base_data: SoundData = {}
    if main_qual:
        parse_pronunciation_tags(wxr, main_qual, base_data)
    if post_qual:
        parse_pronunciation_tags(wxr, post_qual, base_data)
    # This base_data is used as the base copy for all entries from this
    # template, but it is also returned so that its contents may be applied
    # to other templates on the same line.
    # print(f"{base_data=}")

    sound_datas: list[SoundData] = []

    parts: list[list[str]] = [[]]
    inside = 0
    current: list[str] = []
    for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)):
        # Split the line on commas and semicolons outside of parens. This
        # gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)"
        # print(f"  {i=}, {p=}")
        comp = p.strip()
        if not p:
            continue
        if comp == "(":
            if not inside and i > 0:
                if stripped := "".join(current).strip():
                    parts[-1].append(stripped)  # type:ignore[arg-type]
            current = [p]
            inside += 1
            continue
        if comp == ")":
            inside -= 1
            if not inside:
                if stripped := "".join(current).strip():
                    current.append(p)
                    parts[-1].append(
                        "".join(current).strip()  # type:ignore[arg-type]
                    )
                    current = []
            continue
        if not inside and comp in (",", ";"):
            if stripped := "".join(current).strip():
                parts[-1].append(stripped)  # type:ignore[arg-type]
            current = []
            parts.append([])
            continue
        current.append(p)
    if current:
        parts[-1].append("".join(current).strip())

    # print(f">>>>>> {parts=}")
    new_parts: list[list[str]] = []
    for entry in parts:
        if not entry:
            continue
        new_entry: list[str] = []
        i1: int = entry[0].startswith("(") and entry[0].endswith(")")
        if i1:
            new_entry.append(entry[0][1:-1].strip())
        else:
            new_entry.append("")
        i2: int = (
            entry[-1].startswith("(")
            and entry[-1].endswith(")")
            and len(entry) > 1
        )
        if i2 == 0:
            i2 = len(entry)
        else:
            i2 = -1
        new_entry.append("".join(entry[i1:i2]).strip())
        if not new_entry[-1]:
            wxr.wtp.error(
                f"Missing IPA/enPR sound data between qualifiers? {entry=}",
                sortid="en/pronunciation/153",
            )
        if i2 == -1:
            new_entry.append(entry[-1][1:-1].strip())
        else:
            new_entry.append("")
        new_parts.append(new_entry)

    # print(f">>>>> {new_parts=}")
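    # A worked example (hypothetical body): pron_body "(UK) /fuː/, /fɒ/ (rare)"
    # is first split into parts == [["(UK)", "/fuː/"], ["/fɒ/", "(rare)"]]
    # and then normalized here into new_parts == [["UK", "/fuː/", ""],
    # ["", "/fɒ/", "rare"]], i.e. (main-qualifier, body, post-qualifier)
    # triplets; i1 and i2 are bools used as slice indices to cut the
    # parenthesized qualifiers out of the body.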

    for part in new_parts:
        sd = deepcopy(base_data)
        if part[0]:
            parse_pronunciation_tags(wxr, part[0], sd)
        if part[2]:
            parse_pronunciation_tags(wxr, part[2], sd)
        if tname == "enPR":
            sd["enpr"] = part[1]
        else:
            sd["ipa"] = part[1]
        sound_datas.append(sd)

    # print(f"BASE_DATA: {base_data}")
    # print(f"SOUND_DATAS: {sound_datas=}")

    return base_data, sound_datas


def parse_pronunciation(
    wxr: WiktextractContext,
    level_node: LevelNode,
    data: WordData,
    etym_data: WordData,
    have_etym: bool,
    base_data: WordData,
    lang_code: str,
) -> None:
    """Parses the pronunciation section from a language section on a
    page."""
    if level_node.kind in LEVEL_KINDS:
        contents = []
        for node in level_node.children:
            if isinstance(node, TemplateNode):
                if node.template_name == "th-pron":
                    extract_th_pron_template(wxr, data, node)
                elif node.template_name == "zh-pron":
                    extract_zh_pron_template(wxr, data, node)
                else:
                    contents.append(node)
            else:
                contents.append(node)
    else:
        contents = [level_node]
    # Remove subsections, such as Usage notes. They may contain IPAchar
    # templates in running text, and we do not want to extract IPAs from
    # those.
    # Filter out LEVEL_KINDS nodes; the 'or' does the heavy lifting here:
    # let through anything that is not a WikiNode, then let through
    # WikiNodes that are not LEVEL_KINDS.
    contents = [
        x
        for x in contents
        if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
    ]
    if not any(
        isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents
    ):
        # expand all templates
        new_contents: list[str | WikiNode] = []
        for lst in contents:
            if isinstance(lst, TemplateNode):
                temp = wxr.wtp.node_to_wikitext(lst)
                temp = wxr.wtp.expand(temp)
                temp_parsed = wxr.wtp.parse(temp)
                new_contents.extend(temp_parsed.children)
            else:
                new_contents.append(lst)
        contents = new_contents

    if have_etym and data is base_data:
        data = etym_data
    pron_templates: list[tuple[SoundData, list[SoundData]]] = []
    hyphenations: list[Hyphenation] = []
    audios = []
    have_panel_templates = False

    def parse_pronunciation_template_fn(
        name: str, ht: TemplateArgs
    ) -> str | None:
        """Handle pronunciation and hyphenation templates"""
        # _template_fn handles templates *before* they are expanded;
        # this allows for special handling before all the work needed
        # for expansion is done.
        nonlocal have_panel_templates
        if is_panel_template(wxr, name):
            have_panel_templates = True
            return ""
        if name == "audio":
            filename = ht.get(2) or ""
            desc = ht.get(3) or ""
            desc = clean_node(wxr, None, [desc])
            audio: SoundData = {"audio": filename.strip()}
            if desc:
                audio["text"] = desc
            m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc)
            skip = False
            if m:
                par = m.group(1)
                cls = classify_desc(par)
                if cls == "tags":
                    parse_pronunciation_tags(wxr, par, audio)
                else:
                    skip = True
            if skip:
                return ""
            audios.append(audio)
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name == "audio-IPA":
            filename = ht.get(2) or ""
            ipa = ht.get(3) or ""
            dial = ht.get("dial")
            audio = {"audio": filename.strip()}
            if dial:
                dial = clean_node(wxr, None, [dial])
                audio["text"] = dial
            if ipa:
                audio["audio-ipa"] = ipa
            audios.append(audio)
            # The problem with these IPAs is that they often just describe
            # what's in the sound file, rather than giving the pronunciation
            # of the word alone. It is common for audio files to contain
            # multiple pronunciations or articles in the same file, and then
            # this IPA often describes what is in the file.
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name == "audio-pron":
            filename = ht.get(2) or ""
            ipa = ht.get("ipa") or ""
            dial = ht.get("dial")
            country = ht.get("country")
            audio = {"audio": filename.strip()}
            if dial:
                dial = clean_node(wxr, None, [dial])
                audio["text"] = dial
                parse_pronunciation_tags(wxr, dial, audio)
            if country:
                parse_pronunciation_tags(wxr, country, audio)
            if ipa:
                audio["audio-ipa"] = ipa
            audios.append(audio)
            # XXX do we really want to extract pronunciations from these?
            # Or are they spurious / just describing what is in the
            # audio file?
            # if ipa:
            #     pron = {"ipa": ipa}
            #     if dial:
            #         parse_pronunciation_tags(wxr, dial, pron)
            #     if country:
            #         parse_pronunciation_tags(wxr, country, pron)
            #     data_append(data, "sounds", pron)
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name in ("hyph", "hyphenation"):
            # {{hyph|en|re|late|caption="Hyphenation UK:"}}
            # {{hyphenation|it|quiè|to||qui|è|to||quié|to||qui|é|to}}
            # and also nocaption=1
            caption = clean_node(wxr, None, ht.get("caption", ""))
            tagsets, _ = decode_tags(caption)
            # Flatten the tagsets into one; it would be really weird to have
            # several tagsets for a hyphenation caption
            tags = list(set(tag for tagset in tagsets for tag in tagset))
            # We'll just ignore any errors from the tags; they're not very
            # important for hyphenation
            tags = [tag for tag in tags if not tag.startswith("error")]
            hyph_sequences: list[list[str]] = [[]]
            for text in [
                t for (k, t) in ht.items() if (isinstance(k, int) and k >= 2)
            ]:
                if not text:
                    hyph_sequences.append([])
                else:
                    hyph_sequences[-1].append(clean_node(wxr, None, text))
            for seq in hyph_sequences:
                hyphenations.append(Hyphenation(parts=seq, tags=tags))
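            # Hedged worked example for the second template above: the
            # positional args from index 2 are "quiè", "to", "", "qui", "è",
            # "to", "", "quié", "to", "", "qui", "é", "to"; an empty arg
            # starts a new sequence, so hyph_sequences becomes
            # [["quiè", "to"], ["qui", "è", "to"], ["quié", "to"],
            # ["qui", "é", "to"]].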
            return ""
        return None

    def parse_pron_post_template_fn(
        name: str, ht: TemplateArgs, text: str
    ) -> str | None:
        # _post_template_fn handles templates *after* the work to expand
        # them has been done; this is exactly the same as _template_fn,
        # except with the additional expanded text as an input, and
        # possible side-effects from the expansion and recursion (like
        # calling other subtemplates that are handled in _template_fn).
        if is_panel_template(wxr, name):
            return ""
        if name in {
            "q",
            "qualifier",
            "sense",
            "a",
            "accent",
            "l",
            "link",
            "lb",
            "lbl",
            "label",
        }:
            # Kludge: when these templates expand to /.../ or [...],
            # replace the expansion by something safe. This is used
            # to filter spurious IPA-looking expansions that aren't really
            # IPAs. We probably don't care about these templates in the
            # contexts where they expand to something containing these.
            v = re.sub(r'href="[^"]*"', "", text)  # Ignore URLs
            v = re.sub(r'src="[^"]*"', "", v)
            if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v):
                # Note: replacing by empty results in Lua errors that we
                # would rather not have. For example, voi/Middle Vietnamese
                # uses {{a|{{l|vi|...}}}}, and the {{a|...}} will fail
                # if {{l|...}} returns empty.
                return "stripped-by-parse_pron_post_template_fn"
        if name in ("IPA", "enPR"):
            # Extract the data from IPA and enPR templates (same underlying
            # template) and replace them in-text with a magic cookie that
            # can later be used to refer to the data's index inside
            # pron_templates.
            if pron_t := extract_pron_template(wxr, name, ht, text):
                pron_templates.append(pron_t)
                return f"__PRON_TEMPLATE_{len(pron_templates) - 1}__"
        return text

    def flattened_tree(lines: list[WikiNode | str]) -> Iterator[WikiNode | str]:
        assert isinstance(lines, list)
        for line in lines:
            yield from flattened_tree1(line)

    def flattened_tree1(node: WikiNode | str) -> Iterator[WikiNode | str]:
        assert isinstance(node, (WikiNode, str))
        if isinstance(node, str):
            yield node
            return
        elif node.kind == NodeKind.LIST:
            for item in node.children:
                yield from flattened_tree1(item)
        elif node.kind == NodeKind.LIST_ITEM:
            new_children = []
            sublist = None
            for child in node.children:
                if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                    sublist = child
                else:
                    new_children.append(child)
            node.children = new_children
            node.sarg = "*"
            yield node
            if sublist:
                yield from flattened_tree1(sublist)
        else:
            yield node

    # XXX Do not use flattened_tree more than once here, for example for
    # debug printing... The underlying data is changed, and the separated
    # sublists disappear.
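    # Hedged sketch of the flattening (assumed wikitext shape): a nested list
    #   * /ipa1/
    #   ** (UK) /ipa2/
    # is yielded as the outer LIST_ITEM (its sublist detached, sarg reset to
    # "*") followed by the sublist's items, so every item comes out as one
    # flat line for the per-line loop below.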

    # Kludge for templates that generate several lines, but haven't
    # been caught by earlier kludges...
    def split_cleaned_node_on_newlines(
        contents: list[WikiNode | str],
    ) -> Iterator[str]:
        for litem in flattened_tree(contents):
            ipa_text = clean_node(
                wxr,
                data,
                litem,
                template_fn=parse_pronunciation_template_fn,
                post_template_fn=parse_pron_post_template_fn,
            )
            for line in ipa_text.splitlines():
                yield line

    # have_pronunciations = False
    active_pos: str | None = None

    for line in split_cleaned_node_on_newlines(contents):
        # print(f"{line=}")
        prefix: str | None = None
        earlier_base_data: SoundData | None = None
        if not line:
            continue

        split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line)
        for i, text in enumerate(split_templates):
            if not text:
                continue
            # clean up stars at the start of the line
            text = re.sub(r"^\**\s*", "", text).strip()
            if i == 0:
                # At the start of a line, check for stuff like "Noun:"
                # for active_pos; active_pos is a temporary data field
                # given to each saved SoundData entry which is later
                # used to sort the entries into their respective PoSes.
                m = re.match(r"\s*(\w+\s?\w*)\s*:?\s*", text)
                if m:
                    if (m_lower := m.group(1).lower()) in part_of_speech_map:
                        active_pos = part_of_speech_map[m_lower]["pos"]
                        text = text[m.end() :].strip()
            if not text:
                continue
            if i % 2 == 1:
                # re.split with a capture group interleaves the pieces:
                # odd-indexed entries are the captured splitters, while
                # even-indexed entries are empty strings or the text around
                # the splitters.
                base_pron_data, first_prons = pron_templates[int(text)]
                if base_pron_data:
                    earlier_base_data = base_pron_data
                    # print(f"Set {earlier_base_data=}")
                elif earlier_base_data is not None:
                    # merge data from an earlier iteration of this loop
                    for pr in first_prons:
                        if "note" in pr and "note" in earlier_base_data:
                            pr["note"] += ";" + earlier_base_data.get(
                                "note", ""
                            )
                        elif "note" in earlier_base_data:
                            pr["note"] = earlier_base_data["note"]
                        if "topics" in earlier_base_data:
                            data_extend(
                                pr, "topics", earlier_base_data["topics"]
                            )
                        if "tags" in pr and "tags" in earlier_base_data:
                            pr["tags"].extend(earlier_base_data["tags"])
                        elif "tags" in earlier_base_data:
                            pr["tags"] = sorted(set(earlier_base_data["tags"]))
                for pr in first_prons:
                    if active_pos:
                        pr["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if pr not in data.get("sounds", ()):
                        data_append(data, "sounds", pr)
                # This bit is handled
                continue

            if "IPA" in text:
                field = "ipa"
            else:
                # This is used for Rhymes, Homophones, etc.
                field = "other"

            # Check if it contains Japanese "Tokyo" pronunciation with
            # special syntax
            m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text)
            if m:
                pron: SoundData = {field: m.group(1)}  # type: ignore[misc]
                if active_pos:
                    pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                data_append(data, "sounds", pron)
                # have_pronunciations = True
                continue

            # Check if it contains Rhymes
            m = re.match(r"\s*Rhymes?: (.*)", text)
            if m:
                for ending in split_at_comma_semi(m.group(1)):
                    ending = ending.strip()
                    if ending:
                        pron = {"rhymes": ending}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True
                continue

            # Check if it contains homophones
            m = re.search(r"(?m)\bHomophones?: (.*)", text)
            if m:
                for w in split_at_comma_semi(m.group(1)):
                    w = w.strip()
                    if w:
                        pron = {"homophone": w}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True
                continue

            # Check if it contains Phonetic hangeul
            m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text)
            if m:
                seen = set()
                for w in m.group(1).split("/"):
                    w = w.strip()
                    if w and w not in seen:
                        seen.add(w)
                        pron = {"hangeul": w}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True

            # This regex-based hyphenation detection is left as a backup
            m = re.search(r"\b(Syllabification|Hyphenation): *([^\n.]*)", text)
            if m:
                data_append(data, "hyphenation", m.group(2))
                commaseparated = m.group(2).split(",")
                if len(commaseparated) > 1:
                    for h in commaseparated:
                        # The second delimiter below looks like a dash, but
                        # it's actually U+2027 (decimal 8231), the
                        # hyphenation point. Add more delimiters here if
                        # needed.
                        parts = re.split(r"-|‧", h.strip())
                        data_append(
                            data, "hyphenations", Hyphenation(parts=parts)
                        )
                else:
                    data_append(
                        data,
                        "hyphenations",
                        Hyphenation(parts=m.group(2).split(sep="-")),
                    )
            # have_pronunciations = True

            # See if it contains a word prefix restricting which forms the
            # pronunciation applies to (see amica/Latin) and/or parenthesized
            # tags.
            m = re.match(
                r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text
            )
            if m:
                prefix = m.group(2) or ""
                tagstext = m.group(3)
                text = text[m.end() :]
            else:
                m = re.match(r"^[*#\s]*([-\w]+):\s+", text)
                if m:
                    prefix = m.group(1)
                    tagstext = ""
                    text = text[m.end() :]
                else:
                    # Spanish has tags before pronunciations, e.g.
                    # aceite/Spanish
                    m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text)
                    if m:
                        tagstext = m.group(1)
                        text = m.group(2)
                    else:
                        # No prefix. In this case, we inherit the prefix
                        # from the previous entry. This particularly
                        # applies to nested Audio files.
                        tagstext = ""
            if tagstext:
                earlier_base_data = {}
                parse_pronunciation_tags(wxr, tagstext, earlier_base_data)

            # Find romanizations from the pronunciation section (routinely
            # produced for Korean by {{ko-IPA}})
            for m in re.finditer(pron_romanization_re, text):
                prefix = m.group(1)
                w = m.group(2).strip()
                tag = pron_romanizations[prefix]
                form = {"form": w, "tags": tag.split()}
                data_append(data, "forms", form)

            # Find IPA pronunciations
            for m in re.finditer(
                r"(?m)/[^][\n/,]+?/|\[[^]\n0-9,/][^],/]*?\]", text
            ):
                v = m.group(0)
                # The regexp above can match file links. Skip them.
                if v.startswith("[[File:"):
                    continue
                if v == "/wiki.local/":
                    continue
                if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text:
                    m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text)
                    assert m
                    idx = int(m.group(1))
                    if idx >= len(audios):
                        continue
                    if not audios[idx].get("audio-ipa"):
                        audios[idx]["audio-ipa"] = v
                    if prefix:
                        audios[idx]["form"] = prefix
                else:
                    if earlier_base_data:
                        pron = deepcopy(earlier_base_data)
                        pron[field] = v
                    else:
                        pron = {field: v}  # type: ignore[misc]
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if prefix:
                        pron["form"] = prefix
                    if active_pos:
                        pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    data_append(data, "sounds", pron)
                    # have_pronunciations = True

            # XXX what about {{hyphenation|...}} and {{hyph|...}}?
            # Those used to be stored under "hyphenation".

        # Add data that was collected in template_fn
        for audio in audios:
            if "audio" in audio:
                # Compute audio file URLs
                fn = audio["audio"]
                # Strip certain characters, e.g., left-to-right mark
                fn = re.sub(r"[\u200f\u200e]", "", fn)
                fn = fn.strip()
                fn = urllib.parse.unquote(fn)
                # First character is usually uppercased
                if re.match(r"^[a-z][a-z]+", fn):
                    fn = fn[0].upper() + fn[1:]
                if fn in wxr.config.redirects:
                    fn = wxr.config.redirects[fn]
                # File extension is lowercased
                # XXX some words seem to need this, some don't seem to
                # have this??? what is the exact rule?
                # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn)
                # Spaces are converted to underscores
                fn = re.sub(r"\s+", "_", fn)
                # Compute hash digest part
                h = hashlib.md5()
                hname = fn.encode("utf-8")
                h.update(hname)
                digest = h.hexdigest()
                # Quote filename for URL
                qfn = urllib.parse.quote(fn)
                # For safety when writing files
                qfn = qfn.replace("/", "__slash__")
                if re.search(r"(?i)\.(ogg|oga)$", fn):
                    ogg = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
                    )
                else:
                    ogg = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/transcoded/"
                        "{}/{}/{}/{}.ogg".format(
                            digest[:1], digest[:2], qfn, qfn
                        )
                    )
                if re.search(r"(?i)\.(mp3)$", fn):
                    mp3 = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
                    )
                else:
                    mp3 = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/transcoded/"
                        "{}/{}/{}/{}.mp3".format(
                            digest[:1], digest[:2], qfn, qfn
                        )
                    )
                audio["ogg_url"] = ogg
                audio["mp3_url"] = mp3
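                # Sketch of the Commons URL scheme (hypothetical filename):
                # for fn == "En-us-example.ogg" with an MD5 hex digest
                # starting "ab...", the direct URL is
                # .../commons/a/ab/En-us-example.ogg and the transcoded
                # fallback is
                # .../commons/transcoded/a/ab/En-us-example.ogg/En-us-example.ogg.mp3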
                if active_pos:
                    audio["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
            if audio not in data.get("sounds", ()):
                data_append(data, "sounds", audio)

        # if audios:
        #     have_pronunciations = True
        audios = []

        data_extend(data, "hyphenations", hyphenations)
        hyphenations = []

    ## I have commented out the otherwise unused have_pronunciations
    ## toggles; uncomment them to use this debug print
    # if not have_pronunciations and not have_panel_templates:
    #     wxr.wtp.debug("no pronunciations found from pronunciation section",
    #                   sortid="pronunciations/533")


@dataclass
class TableHeader:
    text: str
    rowspan: int


def extract_th_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    # https://en.wiktionary.org/wiki/Template:th-pron
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(standard) IPA"):
                    field = "ipa"
                elif header_str.startswith("Homophones"):
                    field = "homophone"
                elif header_str == "Audio":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    row_headers.append(TableHeader(header_str, rowspan))

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = create_audio_url_dict(filename)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append({"homophone": word})
                else:
                    raw_tag = ""
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                raw_tag = node_str.strip("[]")
                            elif len(sounds) > 0:
                                sounds[-1]["roman"] = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = {field: node_str}
                                if raw_tag != "":
                                    if raw_tag in valid_tags:
                                        data_append(sound, "tags", raw_tag)
                                    else:
                                        data_append(sound, "raw_tags", raw_tag)
                                for header in row_headers:
                                    if header.text.lower() in valid_tags:
                                        data_append(
                                            sound, "tags", header.text.lower()
                                        )
                                    else:
                                        data_append(
                                            sound, "raw_tags", header.text
                                        )
                                sounds.append(sound)

    clean_node(wxr, word_entry, expanded_node)
    data_extend(word_entry, "sounds", sounds)


def extract_zh_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    # https://en.wiktionary.org/wiki/Template:zh-pron
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    seen_lists = set()
    sounds = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node not in seen_lists:
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                sounds.extend(
                    extract_zh_pron_list_item(wxr, list_item, [], seen_lists)
                )
    clean_node(wxr, word_entry, expanded_node)
    data_extend(word_entry, "sounds", sounds)


def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[SoundData]:
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            link_str = clean_node(wxr, None, node.largs)
            node_str = clean_node(wxr, None, node)
            if link_str.startswith("File:"):
                sound = create_audio_url_dict(link_str.removeprefix("File:"))
                sound["raw_tags"] = current_tags[:]
                translate_zh_pron_raw_tags(sound)
                sounds.append(sound)
            elif node_str != "":
                current_tags.append(node_str)
        elif isinstance(node, HTMLNode):
            if node.tag == "small":
                if is_first_small_tag:
                    raw_tag_text = clean_node(
                        wxr,
                        None,
                        [
                            n
                            for n in node.children
                            if not (isinstance(n, HTMLNode) and n.tag == "sup")
                        ],
                    )
                    current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                elif len(sounds) > 0:
                    data_extend(
                        sounds[-1],
                        "raw_tags",
                        split_zh_pron_raw_tag(clean_node(wxr, None, node)),
                    )
                    translate_zh_pron_raw_tags(sounds[-1])
                is_first_small_tag = False
            elif node.tag == "span":
                sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
            elif (
                node.tag == "table"
                and len(current_tags) > 0
                and current_tags[-1] == "Homophones"
            ):
                sounds.extend(
                    extract_zh_pron_homophone_table(wxr, node, current_tags)
                )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            seen_lists.add(node)
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                sounds.extend(
                    extract_zh_pron_list_item(
                        wxr, child_list_item, current_tags, seen_lists
                    )
                )

    return sounds
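
# Hedged sketch of a typical {{zh-pron}} list shape (assumed rendering):
#   * Mandarin
#   ** (Standard)
#   *** Pinyin: ...
# Each nesting level contributes its label ("Mandarin", "Standard", ...) to
# current_tags, and the innermost spans become SoundData entries carrying the
# accumulated tags.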


def extract_zh_pron_homophone_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            span_str = clean_node(wxr, None, span_tag)
            if (
                span_str not in ["", "/"]
                and span_lang != ""
                and span_class in ["Hant", "Hans", "Hani"]
            ):
                sound = {"homophone": span_str, "raw_tags": raw_tags[:]}
                if span_class == "Hant":
                    data_append(sound, "tags", "Traditional-Chinese")
                elif span_class == "Hans":
                    data_append(sound, "tags", "Simplified-Chinese")
                translate_zh_pron_raw_tags(sound)
                sounds.append(sound)

    return sounds


def translate_zh_pron_raw_tags(sound: SoundData):
    from .zh_pron_tags import ZH_PRON_TAGS

    raw_tags = []
    for raw_tag in sound.get("raw_tags", []):
        if raw_tag in ZH_PRON_TAGS:
            tr_tag = ZH_PRON_TAGS[raw_tag]
            if isinstance(tr_tag, str):
                data_append(sound, "tags", tr_tag)
            elif isinstance(tr_tag, list) and tr_tag not in sound.get(
                "tags", []
            ):
                data_extend(sound, "tags", tr_tag)
        elif raw_tag in valid_tags:
            if raw_tag not in sound.get("tags", []):
                data_append(sound, "tags", raw_tag)
        elif raw_tag not in raw_tags:
            raw_tags.append(raw_tag)

    if len(raw_tags) > 0:
        sound["raw_tags"] = raw_tags
    elif "raw_tags" in sound:
        del sound["raw_tags"]
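
# Hedged example (the actual mappings live in ZH_PRON_TAGS, assumed here):
# for {"zh_pron": "...", "raw_tags": ["Mandarin", "somedialect"]}, "Mandarin"
# would move into "tags" if ZH_PRON_TAGS or valid_tags recognizes it, while
# an unrecognized string like "somedialect" stays behind in "raw_tags".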


def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    raw_tags = []
    if "(" not in raw_tag_text:
        for raw_tag in re.split(r",|:|;| and ", raw_tag_text):
            raw_tag = raw_tag.strip().removeprefix("incl. ").strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
    else:
        processed_offsets = []
        for match in re.finditer(r"\([^()]+\)", raw_tag_text):
            processed_offsets.append((match.start(), match.end()))
            raw_tags.extend(
                split_zh_pron_raw_tag(
                    raw_tag_text[match.start() + 1 : match.end() - 1]
                )
            )
        not_processed = ""
        last_end = 0
        for start, end in processed_offsets:
            not_processed += raw_tag_text[last_end:start]
            last_end = end
        not_processed += raw_tag_text[last_end:]
        if not_processed != raw_tag_text:
            raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags
        else:
            raw_tags.append(not_processed)

    return raw_tags
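
# Hedged worked example (hypothetical tag text):
# split_zh_pron_raw_tag("Mandarin (Beijing, incl. Taiwan)") splits the
# parenthesized part into ["Beijing", "Taiwan"] (dropping the "incl. "
# prefix), then recurses on the remainder "Mandarin ", returning
# ["Mandarin", "Beijing", "Taiwan"].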


def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            if "IPA" in span_tag.attrs.get("class", ""):
                sound = {"ipa": zh_pron, "raw_tags": raw_tags[:]}
            else:
                sound = {"zh_pron": zh_pron, "raw_tags": raw_tags[:]}
            if roman != "":
                sound["roman"] = roman
            sounds.append(sound)
    if len(sounds) > 0:
        data_extend(sounds[-1], "raw_tags", small_tags)
    if phonetic_pron != "":
        sound = {
            "zh_pron": phonetic_pron,
            "raw_tags": raw_tags[:] + ["Phonetic"],
        }
        if roman != "":
            sound["roman"] = roman
        sounds.append(sound)
    for sound in sounds:
        translate_zh_pron_raw_tags(sound)
    return sounds


def split_zh_pron(zh_pron: str) -> list[str]:
    # Split by commas and other symbols that are outside parentheses
    parentheses = 0
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/")))
            and parentheses == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        elif c == "(":
            parentheses += 1
            pron += c
        elif c == ")":
            parentheses -= 1
            pron += c
        else:
            pron += c

    if pron.strip() != "":
        pron_list.append(pron.strip())
    return pron_list
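
# Hedged worked example (hypothetical input):
# split_zh_pron("ā, á (x, y); à") == ["ā", "á (x, y)", "à"]: the comma inside
# the parentheses does not split, and "/" splits only when the whole string
# does not itself start with "/" (so a bare IPA string like "/a/" stays
# whole).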