# src/wiktextract/extractor/en/pronunciation.py
import hashlib
import re
import urllib.parse
from copy import deepcopy
from typing import Iterator, Optional, Union

from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...clean import clean_value
from ...datautils import data_append, data_extend, split_at_comma_semi
from ...page import LEVEL_KINDS, clean_node, is_panel_template
from ...tags import valid_tags
from ...wxr_context import WiktextractContext
from .form_descriptions import (
    classify_desc,
    decode_tags,
    parse_pronunciation_tags,
)
from .parts_of_speech import part_of_speech_map
from .type_utils import Hyphenation, SoundData, TemplateArgs, WordData
from .zh_pron_tags import ZH_PRON_TAGS

# Prefixes, tags, and regexp for finding romanizations from the pronunciation
# section
pron_romanizations = {
    " Revised Romanization ": "romanization revised",
    " Revised Romanization (translit.) ": "romanization revised transliteration",
    " McCune-Reischauer ": "McCune-Reischauer romanization",
    " McCune–Reischauer ": "McCune-Reischauer romanization",
    " Yale Romanization ": "Yale romanization",
}
pron_romanization_re = re.compile(
    "(?m)^("
    + "|".join(
        re.escape(x)
        for x in sorted(pron_romanizations.keys(), key=len, reverse=True)
    )
    + ")([^\n]+)"
)
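
# A minimal illustration (assumed input; real lines come from expanded
# {{ko-IPA}} output). Longer prefixes sort first, so the "(translit.)"
# variant wins over the plain one:
#     m = pron_romanization_re.search(" Revised Romanization gugeo\n")
#     # m.group(1) == " Revised Romanization "
#     # m.group(2) == "gugeo"
#     # pron_romanizations[m.group(1)] == "romanization revised"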

IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$"
IPA_EXTRACT_RE = re.compile(IPA_EXTRACT)
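
# Worked example (illustrative input, not from the extractor itself):
# matching "(UK) IPA⁽ᵏᵉʸ⁾: /fuː/ (rare)" yields
#     group(2) == "UK"            # qualifier for the whole line
#     group(4) == "/fuː/ (rare)"  # body, including any trailing qualifier
#     group(5) == "/fuː/"         # body with the trailing qualifier split off
#     group(7) == "rare"          # trailing parenthesized qualifier
# extract_pron_template() below uses group(4), or groups 5 and 7 when the
# template carried a qq= argument.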


def extract_pron_template(
    wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str
) -> Optional[tuple[SoundData, list[SoundData]]]:
    """In post_template_fn, this is used to handle all enPR and IPA
    templates, leaving breadcrumbs in the text that are handled later.
    We return a `base_data` so that if there are two or more templates
    on the same line, like this:
        (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/
    we can apply the base_data fields to the other templates, too, if
    needed.
    """
    cleaned = clean_value(wxr, expanded)
    # print(f"extract_pron_template input: {tname=} {expanded=} -> {cleaned=}")
    m = IPA_EXTRACT_RE.match(cleaned)
    if not m:
        wxr.wtp.error(
            f"Text cannot match IPA_EXTRACT_RE regex: "
            f"{cleaned=}, {tname=}, {targs=}",
            sortid="en/pronunciation/54",
        )
        return None
    # for i, group in enumerate(m.groups()):
    #     print(i + 1, repr(group))
    main_qual = m.group(2) or ""
    if "qq" in targs:
        # The template has been given a qualifier that applies to every
        # entry but also happens to appear at the end, where it could be
        # confused with the post-qualifier of a single entry in the style
        # of "... /ipa3/ (foo) (bar)"; foo might not be present, making
        # bar look like it applies only to /ipa3/.
        pron_body = m.group(5)
        post_qual = m.group(7)
    else:
        pron_body = m.group(4)
        post_qual = ""

    if not pron_body:
        wxr.wtp.error(
            f"Regex failed to find 'body' from {cleaned=}",
            sortid="en/pronunciation/81",
        )
        return None

    base_data: SoundData = {}
    if main_qual:
        parse_pronunciation_tags(wxr, main_qual, base_data)
    if post_qual:
        parse_pronunciation_tags(wxr, post_qual, base_data)
    # This base_data is used as the base copy for all entries from this
    # template, but it is also returned so that its contents may be applied
    # to other templates on the same line.
    # print(f"{base_data=}")

    sound_datas: list[SoundData] = []

    parts: list[list[str]] = [[]]
    inside = 0
    current: list[str] = []
    for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)):
        # Split the line on commas and semicolons outside of parens. This
        # gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)"
        # print(f" {i=}, {p=}")
        comp = p.strip()
        if not p:
            continue
        if comp == "(":
            if not inside and i > 0:
                if stripped := "".join(current).strip():
                    parts[-1].append(stripped)  # type:ignore[arg-type]
            current = [p]
            inside += 1
            continue
        if comp == ")":
            inside -= 1
            if not inside:
                if stripped := "".join(current).strip():
                    current.append(p)
                    parts[-1].append("".join(current).strip())  # type:ignore[arg-type]
                    current = []
            continue
        if not inside and comp in (",", ";"):
            if stripped := "".join(current).strip():
                parts[-1].append(stripped)  # type:ignore[arg-type]
            current = []
            parts.append([])
            continue
        current.append(p)
    if current:
        parts[-1].append("".join(current).strip())
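
    # Illustrative trace (assumed input, not from the source): for
    # pron_body == "(UK) /fuː/ (rare), /fʊ/" the loop above produces
    #     parts == [["(UK)", "/fuː/", "(rare)"], ["/fʊ/"]]
    # i.e. one sub-list per comma/semicolon-separated entry, with any
    # parenthesized qualifiers kept as separate strings.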

    # print(f">>>>>> {parts=}")
    new_parts: list[list[str]] = []
    for entry in parts:
        if not entry:
            continue
        new_entry: list[str] = []
        i1: int = entry[0].startswith("(") and entry[0].endswith(")")
        if i1:
            new_entry.append(entry[0][1:-1].strip())
        else:
            new_entry.append("")
        i2: int = (
            entry[-1].startswith("(")
            and entry[-1].endswith(")")
            and len(entry) > 1
        )
        if i2 == 0:
            i2 = len(entry)
        else:
            i2 = -1
        new_entry.append("".join(entry[i1:i2]).strip())
        if not new_entry[-1]:
            wxr.wtp.error(
                f"Missing IPA/enPR sound data between qualifiers? {entry=}",
                sortid="en/pronunciation/153",
            )
        if i2 == -1:
            new_entry.append(entry[-1][1:-1].strip())
        else:
            new_entry.append("")
        new_parts.append(new_entry)

    # print(f">>>>> {new_parts=}")
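    # Continuing the illustrative trace: new_parts would now be
    #     [["UK", "/fuː/", "rare"], ["", "/fʊ/", ""]]
    # i.e. [main-qualifier, body, post-qualifier] triples with the
    # parentheses stripped.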

    for part in new_parts:
        sd = deepcopy(base_data)
        if part[0]:
            parse_pronunciation_tags(wxr, part[0], sd)
        if part[2]:
            parse_pronunciation_tags(wxr, part[2], sd)
        if tname == "enPR":
            sd["enpr"] = part[1]
        else:
            sd["ipa"] = part[1]
        sound_datas.append(sd)

    # print(f"BASE_DATA: {base_data}")
    # print(f"SOUND_DATAS: {sound_datas=}")

    return base_data, sound_datas


def parse_pronunciation(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordData,
    etym_data: WordData,
    have_etym: bool,
    base_data: WordData,
    lang_code: str,
) -> None:
    """Parses the pronunciation section from a language section on a
    page."""
    assert isinstance(node, WikiNode)
    if node.kind in LEVEL_KINDS:
        contents = node.children
    else:
        contents = [node]
    # Remove subsections, such as Usage notes. They may contain IPAchar
    # templates in running text, and we do not want to extract IPAs from
    # those.
    # Filter out LEVEL_KINDS nodes only; the 'or' does the heavy lifting
    # here: non-WikiNodes slip through, and then WikiNodes that are not
    # LEVEL_KINDS slip through.
    contents = [
        x
        for x in contents
        if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
    ]
    if not any(
        isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents
    ):
        # expand all templates
        new_contents: list[Union[str, WikiNode]] = []
        for lst in contents:
            if (
                isinstance(lst, TemplateNode)
                and isinstance(lst.largs[0][0], str)
                and lst.largs[0][0].strip() != "zh-pron"
            ):
                temp = wxr.wtp.node_to_wikitext(lst)
                temp = wxr.wtp.expand(temp)
                temp_parsed = wxr.wtp.parse(temp)
                new_contents.extend(temp_parsed.children)
            else:
                new_contents.append(lst)
        contents = new_contents

    if have_etym and data is base_data:
        data = etym_data
    pron_templates: list[tuple[SoundData, list[SoundData]]] = []
    hyphenations: list[Hyphenation] = []
    audios = []
    have_panel_templates = False

    def parse_pronunciation_template_fn(
        name: str, ht: TemplateArgs
    ) -> Optional[str]:
        """Handle pronunciation and hyphenation templates"""
        # _template_fn handles templates *before* they are expanded;
        # this allows for special handling before all the work needed
        # for expansion is done.
        nonlocal have_panel_templates
        if is_panel_template(wxr, name):
            have_panel_templates = True
            return ""
        if name == "audio":
            filename = ht.get(2) or ""
            desc = ht.get(3) or ""
            desc = clean_node(wxr, None, [desc])
            audio: SoundData = {"audio": filename.strip()}
            if desc:
                audio["text"] = desc
            m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc)
            skip = False
            if m:
                par = m.group(1)
                cls = classify_desc(par)
                if cls == "tags":
                    parse_pronunciation_tags(wxr, par, audio)
                else:
                    skip = True
            if skip:
                return ""
            audios.append(audio)
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
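            # Illustrative: the cookie returned above (e.g.
            # "__AUDIO_IGNORE_THIS__0__") replaces the template in the
            # cleaned text; the IPA-scanning loop later in this function
            # finds it and attaches any adjacent IPA to audios[0] instead
            # of emitting a separate sound entry.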
        if name == "audio-IPA":
            filename = ht.get(2) or ""
            ipa = ht.get(3) or ""
            dial = ht.get("dial")
            audio = {"audio": filename.strip()}
            if dial:
                dial = clean_node(wxr, None, [dial])
                audio["text"] = dial
            if ipa:
                audio["audio-ipa"] = ipa
            audios.append(audio)
            # The problem with these IPAs is that they often just describe
            # what's in the sound file, rather than giving the pronunciation
            # of the word alone. It is common for audio files to contain
            # multiple pronunciations or articles in the same file, and then
            # this IPA often describes what is in the file.
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name == "audio-pron":
            filename = ht.get(2) or ""
            ipa = ht.get("ipa") or ""
            dial = ht.get("dial")
            country = ht.get("country")
            audio = {"audio": filename.strip()}
            if dial:
                dial = clean_node(wxr, None, [dial])
                audio["text"] = dial
                parse_pronunciation_tags(wxr, dial, audio)
            if country:
                parse_pronunciation_tags(wxr, country, audio)
            if ipa:
                audio["audio-ipa"] = ipa
            audios.append(audio)
            # XXX do we really want to extract pronunciations from these?
            # Or are they spurious / just describing what is in the
            # audio file?
            # if ipa:
            #     pron = {"ipa": ipa}
            #     if dial:
            #         parse_pronunciation_tags(wxr, dial, pron)
            #     if country:
            #         parse_pronunciation_tags(wxr, country, pron)
            #     data_append(data, "sounds", pron)
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name in ("hyph", "hyphenation"):
            # {{hyph|en|re|late|caption="Hyphenation UK:"}}
            # {{hyphenation|it|quiè|to||qui|è|to||quié|to||qui|é|to}}
            # and also nocaption=1
            caption = clean_node(wxr, None, ht.get("caption", ""))
            tagsets, _ = decode_tags(caption)
            # Flatten the tagsets into one; it would be really weird to
            # have several tagsets for a hyphenation caption.
            tags = list(set(tag for tagset in tagsets for tag in tagset))
            # We'll just ignore any errors from the tags; they're not very
            # important for hyphenation.
            tags = [tag for tag in tags if not tag.startswith("error")]
            hyph_sequences: list[list[str]] = [[]]
            for text in [
                t for (k, t) in ht.items() if (isinstance(k, int) and k >= 2)
            ]:
                if not text:
                    hyph_sequences.append([])
                else:
                    hyph_sequences[-1].append(clean_node(wxr, None, text))
            for seq in hyph_sequences:
                hyphenations.append(Hyphenation(parts=seq, tags=tags))
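            # Illustrative: {{hyphenation|it|quiè|to||qui|è|to}} has the
            # positional args ("it", "quiè", "to", "", "qui", "è", "to");
            # args from position 2 on are kept, and each empty argument
            # starts a new sequence, so hyph_sequences becomes
            #     [["quiè", "to"], ["qui", "è", "to"]]
            # and each sequence is stored as one Hyphenation entry.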
            return ""
        return None

    def parse_pron_post_template_fn(
        name: str, ht: TemplateArgs, text: str
    ) -> Optional[str]:
        # _post_template_fn handles templates *after* the work to expand
        # them has been done; this is exactly the same as _template_fn,
        # except with the additional expanded text as an input, and
        # possible side effects from the expansion and recursion (like
        # calling other subtemplates that are handled in _template_fn).
        if is_panel_template(wxr, name):
            return ""
        if name in {
            "q",
            "qualifier",
            "sense",
            "a",
            "accent",
            "l",
            "link",
            "lb",
            "lbl",
            "label",
        }:
            # Kludge: when these templates expand to /.../ or [...],
            # replace the expansion with something safe. This is used
            # to filter out spurious IPA-looking expansions that aren't
            # really IPAs. We probably don't care about these templates
            # in the contexts where they expand to something containing
            # these.
            v = re.sub(r'href="[^"]*"', "", text)  # Ignore URLs
            v = re.sub(r'src="[^"]*"', "", v)
            if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v):
                # Note: replacing with the empty string results in Lua
                # errors that we would rather not have. For example,
                # voi/Middle Vietnamese uses {{a|{{l|vi|...}}}}, and the
                # {{a|...}} will fail if {{l|...}} returns empty.
                return "stripped-by-parse_pron_post_template_fn"
        if name in ("IPA", "enPR"):
            # Extract the data from IPA and enPR templates (same underlying
            # template) and replace them in-text with a magic cookie that
            # can later be used to refer to the data's index inside
            # pron_templates.
            if pron_t := extract_pron_template(wxr, name, ht, text):
                pron_templates.append(pron_t)
                return f"__PRON_TEMPLATE_{len(pron_templates) - 1}__"
        return text

    def parse_expanded_zh_pron(
        node: WikiNode,
        parent_hdrs: list[str],
        specific_hdrs: list[str],
        unknown_header_tags: set[str],
    ) -> None:
        def generate_pron(
            v: str, new_parent_hdrs: list[str], new_specific_hdrs: list[str]
        ) -> Optional[SoundData]:
            pron: SoundData = {}
            pron["tags"] = []
            pron["zh-pron"] = v.strip()
            for hdr in new_parent_hdrs + new_specific_hdrs:
                hdr = hdr.strip()
                valid_hdr = re.sub(r"\s+", "-", hdr)
                if hdr in ZH_PRON_TAGS:
                    for tag in ZH_PRON_TAGS[hdr]:
                        if tag not in pron["tags"]:
                            pron["tags"].append(tag)
                elif valid_hdr in valid_tags:
                    if valid_hdr not in pron["tags"]:
                        pron["tags"].append(valid_hdr)
                else:
                    unknown_header_tags.add(hdr)
            # Convert to the normal IPA field if the entry has the IPA flag
            if "IPA" in pron["tags"]:
                pron["ipa"] = v
                del pron["zh-pron"]
                pron["tags"].remove("IPA")
            # Convert to IPA but retain the Sinological-IPA tag
            elif "Sinological-IPA" in pron["tags"]:
                pron["ipa"] = v
                del pron["zh-pron"]

            if not (pron.get("zh-pron") or pron.get("ipa")):
                return None
            return pron
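
        # Illustrative (hypothetical headers; the actual mapping lives in
        # ZH_PRON_TAGS and valid_tags): a call like
        #     generate_pron("/kɛ⁵⁵/", ["Hokkien", "Sinological-IPA"], ["Xiamen"])
        # would map each header to tags, notice Sinological-IPA, move the
        # value to the "ipa" field, and return roughly
        #     {"ipa": "/kɛ⁵⁵/", "tags": [...headers mapped to tags...]}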
        if isinstance(node, list):
            for item in node:
                parse_expanded_zh_pron(
                    item, parent_hdrs, specific_hdrs, unknown_header_tags
                )
            return
        if not isinstance(node, WikiNode):
            return
        if node.kind != NodeKind.LIST:
            for item in node.children:
                parse_expanded_zh_pron(
                    item, parent_hdrs, specific_hdrs, unknown_header_tags
                )
            return
        for item in node.children:
            assert isinstance(item, WikiNode)
            assert item.kind == NodeKind.LIST_ITEM
            base_item = list(
                x
                for x in item.children
                if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
            )
            text = clean_node(wxr, None, base_item)
            # print(f"{parent_hdrs} zhpron: {text}")  # XXX remove me
            text = re.sub(r"(?s)\(Note:.*?\)", "", text)
            # Kludge to clean up text like
            # '(Standard Chinese, erhua-ed) (旋兒/旋儿)' where
            # the hanzi are examples
            hanzi_m = re.match(r"\s*(\([^()]*\))\s*\(([^()]*)\)\s*$", text)
            if hanzi_m:
                if re.search("[\u4e00-\u9fff]", hanzi_m.group(2)):
                    text = hanzi_m.group(1)
            new_parent_hdrs = list(parent_hdrs)
            new_specific_hdrs = list(specific_hdrs)
            # look no further, here be dragons...
458 if ": " in text or ":" in text:
459 parts = re.split(r": |:", text)
460 m = re.match(
461 r"\s*\((([^():]+)\s*(:|:)?\s*([^():]*))\)\s*$", text
462 )
463 # Matches lines with stuff like "(Hokkien: Xiamen, Quanzhou)"
464 # thrown into new_parent_hdrs
465 if m:
466 new_parent_hdrs.append(m.group(2).strip())
467 for hdr in m.group(4).split(","):
468 new_specific_hdrs.append(hdr.strip())
                else:
                    # if "Zhangzhou" in text:
                    #     print("\nFOUND IN:", text, "\n")
                    #     print("PARTS: ", repr(parts))
                    # print(f"  PARTS: {parts}")
                    extra_tags = parts[0]
                    # Kludge to handle how (Hokkien: Locations) and
                    # IPA (Specific Location) interact; this is why
                    # specific_hdrs was introduced to the soup: to
                    # specify which headers are actual hierarchical
                    # higher-level tags (Min Nan, Hokkien, etc.) that
                    # should always be present, and then to use
                    # specific_hdrs for the list of misc sublocations
                    # and subdialects that can be overridden by more
                    # specific stuff later.
                    m = re.match(r"\s*IPA\s*\((.*)\)\s*$", extra_tags)
                    if m:
                        new_parent_hdrs.append("IPA")
                        new_specific_hdrs = [
                            s.strip() for s in m.group(1).split(",")
                        ]
                        extra_tags = extra_tags[m.end() :]

                    m = re.match(r"\s*\([^()]*,[^()]*\)\s*$", extra_tags)
                    if m:
                        extra_tags = extra_tags.strip()[1:-1]  # remove parens
                        new_parent_hdrs.extend(
                            s.strip() for s in extra_tags.split(",")
                        )
                    elif extra_tags:
                        new_parent_hdrs.append(extra_tags)

                v = ":".join(parts[1:])

                # check for phrases
                if ("," in (wxr.wtp.title or "")) and len(
                    v.split(" ")
                ) + v.count(",") == len(wxr.wtp.title or ""):
                    # This just captures exact matches where we have the
                    # pronunciation of the whole phrase and nothing else.
                    # We split on spaces; because we do not split next to
                    # a comma, we add the comma count so that the total
                    # syncs up with the unicode string length of the
                    # original hanzi, where a comma is a separate character
                    # (unlike in the split list, where it is part of a
                    # space-separated string, like "teo⁴,").
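                    # Illustrative (hypothetical page title "你好,世界",
                    # length 5 including the comma): for
                    # v == "ni² hau³, si⁴ gai³" we get
                    # len(v.split(" ")) == 4 and v.count(",") == 1, and
                    # 4 + 1 == 5 matches the title length, so the whole
                    # string is kept as one phrase pronunciation.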
                    vals = [v]
                    pron = generate_pron(
                        v, new_parent_hdrs, new_specific_hdrs
                    )

                    if pron:
                        pron["tags"] = list(sorted(pron["tags"]))
                        if pron not in data.get("sounds", ()):
                            data_append(data, "sounds", pron)
525 elif "→" in v:
526 vals = re.split("→", v)
527 for v in vals:
528 pron = generate_pron(
529 v, new_parent_hdrs, new_specific_hdrs
530 )
531 if pron:
532 m = re.match(
533 r"([^()]+)\s*\(toneless"
534 r" final syllable variant\)\s*",
535 v,
536 )
537 if m:
538 pron["zh-pron"] = m.group(1).strip()
539 pron["tags"].append(
540 "toneless-final-syllable-variant"
541 )
543 pron["tags"] = list(sorted(pron["tags"]))
544 if pron not in data.get("sounds", ()):
545 data_append(data, "sounds", pron)
                else:
                    # split alternative pronunciations separated by ","
                    # or " / "
                    vals = re.split(r"\s*,\s*|\s+/\s+", v)
                    new_vals = []
                    for v2 in vals:
                        if v2.startswith("/") and v2.endswith("/"):
                            # items like /kɛiŋ²¹³⁻⁵³ ^((Ø-))ŋuaŋ³³/
                            new_vals.append(v2)
                        else:
                            # split on parentheses otherwise
                            new_vals.extend(re.split(r"[()]", v2))
                    vals = new_vals
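                    # Illustrative: v == "ke¹, kɛ¹ / ko¹" splits into
                    # ["ke¹", "kɛ¹", "ko¹"]; an item already wrapped in
                    # /.../ is kept whole so slash-delimited IPA is not
                    # broken apart.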
                    for v in vals:
                        pron = generate_pron(
                            v, new_parent_hdrs, new_specific_hdrs
                        )
                        if pron:
                            pron["tags"] = list(sorted(pron["tags"]))
                            if pron not in data.get("sounds", ()):
                                data_append(data, "sounds", pron)
            else:
                new_parent_hdrs.append(text)

            for x in item.children:
                if isinstance(x, WikiNode) and x.kind == NodeKind.LIST:
                    parse_expanded_zh_pron(
                        x, new_parent_hdrs, specific_hdrs, unknown_header_tags
                    )

    def parse_chinese_pron(
        contents: Union[list[Union[WikiNode, str]], WikiNode, str],
        unknown_header_tags: set[str],
    ) -> None:
        if isinstance(contents, list):
            for item in contents:
                parse_chinese_pron(item, unknown_header_tags)
            return
        if not isinstance(contents, WikiNode):
            return
        if contents.kind != NodeKind.TEMPLATE:
            for item in contents.children:
                parse_chinese_pron(item, unknown_header_tags)
            return
        if (
            len(contents.largs[0]) == 1
            and isinstance(contents.largs[0][0], str)
            and contents.largs[0][0].strip() == "zh-pron"
        ):
            src = wxr.wtp.node_to_wikitext(contents)
            expanded = wxr.wtp.expand(src, templates_to_expand={"zh-pron"})
            parsed = wxr.wtp.parse(expanded)
            parse_expanded_zh_pron(parsed, [], [], unknown_header_tags)
        else:
            for item in contents.children:
                parse_chinese_pron(item, unknown_header_tags)
            return

    if lang_code == "zh":
        unknown_header_tags: set[str] = set()
        parse_chinese_pron(contents, unknown_header_tags)
        for hdr in unknown_header_tags:
            wxr.wtp.debug(
                f"Zh-pron header not found in zh_pron_tags or tags: "
                f"{repr(hdr)}",
                sortid="pronunciations/296/20230324",
            )

    def flattened_tree(
        lines: list[Union[WikiNode, str]],
    ) -> Iterator[Union[WikiNode, str]]:
        assert isinstance(lines, list)
        for line in lines:
            yield from flattened_tree1(line)

    def flattened_tree1(
        node: Union[WikiNode, str],
    ) -> Iterator[Union[WikiNode, str]]:
        assert isinstance(node, (WikiNode, str))
        if isinstance(node, str):
            yield node
            return
        elif node.kind == NodeKind.LIST:
            for item in node.children:
                yield from flattened_tree1(item)
        elif node.kind == NodeKind.LIST_ITEM:
            new_children = []
            sublist = None
            for child in node.children:
                if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                    sublist = child
                else:
                    new_children.append(child)
            node.children = new_children
            node.sarg = "*"
            yield node
            if sublist:
                yield from flattened_tree1(sublist)
        else:
            yield node
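
    # Illustrative (assumed wikitext): a nested pronunciation list like
    #     * (UK) {{IPA|en|/fuː/}}
    #     ** {{audio|en|en-uk-foo.ogg|Audio (UK)}}
    # flattens into two sibling LIST_ITEM nodes, so that each one can be
    # cleaned into its own line of text further below.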

    # XXX Do not use flattened_tree more than once here, for example for
    # debug printing... The underlying data is changed, and the separated
    # sublists disappear.

    # Kludge for templates that generate several lines, but haven't
    # been caught by earlier kludges...
    def split_cleaned_node_on_newlines(
        contents: list[Union[WikiNode, str]],
    ) -> Iterator[str]:
        for litem in flattened_tree(contents):
            ipa_text = clean_node(
                wxr,
                data,
                litem,
                template_fn=parse_pronunciation_template_fn,
                post_template_fn=parse_pron_post_template_fn,
            )
            for line in ipa_text.splitlines():
                yield line

    # have_pronunciations = False
    active_pos: Optional[str] = None

    for line in split_cleaned_node_on_newlines(contents):
        # print(f"{line=}")
        prefix: Optional[str] = None
        earlier_base_data: Optional[SoundData] = None
        if not line:
            continue

        split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line)
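        # Illustrative: if line == "Noun: __PRON_TEMPLATE_0__ and
        # __PRON_TEMPLATE_1__", then split_templates is
        #     ["Noun: ", "0", " and ", "1", ""]
        # with the surrounding text at even indices and the captured
        # pron_templates indices at odd indices.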
        for i, text in enumerate(split_templates):
            if not text:
                continue
            # Clean up stars (list bullets) at the start of the line
            text = re.sub(r"^\**\s*", "", text).strip()
            if i == 0:
                # At the start of a line, check for stuff like "Noun:"
                # for active_pos; active_pos is a temporary data field
                # given to each saved SoundData entry which is later
                # used to sort the entries into their respective PoSes.
                m = re.match(r"\s*(\w+\s?\w*)\s*:?\s*", text)
                if m:
                    if (m_lower := m.group(1).lower()) in part_of_speech_map:
                        active_pos = part_of_speech_map[m_lower]["pos"]
                        text = text[m.end() :].strip()
                if not text:
                    continue
            if i % 2 == 1:
                # re.split (with capture groups) splits the line so that
                # the entries at odd indices are the captured splitters;
                # the even-indexed entries are the (possibly empty) text
                # around the splitters.
                base_pron_data, first_prons = pron_templates[int(text)]
                if base_pron_data:
                    earlier_base_data = base_pron_data
                    # print(f"Set {earlier_base_data=}")
                elif earlier_base_data is not None:
                    # merge data from an earlier iteration of this loop
                    for pr in first_prons:
                        if "note" in pr and "note" in earlier_base_data:
                            pr["note"] += ";" + earlier_base_data.get(
                                "note", ""
                            )
                        elif "note" in earlier_base_data:
                            pr["note"] = earlier_base_data["note"]
                        if "topics" in earlier_base_data:
                            data_extend(
                                pr, "topics", earlier_base_data["topics"]
                            )
                        if "tags" in pr and "tags" in earlier_base_data:
                            pr["tags"].extend(earlier_base_data["tags"])
                        elif "tags" in earlier_base_data:
                            pr["tags"] = sorted(set(earlier_base_data["tags"]))
                for pr in first_prons:
                    if active_pos:
                        pr["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if pr not in data.get("sounds", ()):
                        data_append(data, "sounds", pr)
                # This bit is handled
                continue
728 if "IPA" in text:
729 field = "ipa"
730 else:
731 # This is used for Rhymes, Homophones, etc
732 field = "other"
734 # Check if it contains Japanese "Tokyo" pronunciation with
735 # special syntax
736 m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text)
737 if m: 737 ↛ 738line 737 didn't jump to line 738 because the condition on line 737 was never true
738 pron: SoundData = {field: m.group(1)} # type: ignore[misc]
739 if active_pos:
740 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key]
741 data_append(data, "sounds", pron)
742 # have_pronunciations = True
743 continue

            # Check if it contains Rhymes
            m = re.match(r"\s*Rhymes?: (.*)", text)
            if m:
                for ending in split_at_comma_semi(m.group(1)):
                    ending = ending.strip()
                    if ending:
                        pron = {"rhymes": ending}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True
                continue

            # Check if it contains homophones
            m = re.search(r"(?m)\bHomophones?: (.*)", text)
            if m:
                for w in split_at_comma_semi(m.group(1)):
                    w = w.strip()
                    if w:
                        pron = {"homophone": w}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True
                continue

            # Check if it contains Phonetic hangeul
            m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text)
            if m:
                seen = set()
                for w in m.group(1).split("/"):
                    w = w.strip()
                    if w and w not in seen:
                        seen.add(w)
                        pron = {"hangeul": w}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True

            # This regex-based hyphenation detection is left as a backup
            m = re.search(r"\b(Syllabification|Hyphenation): *([^\n.]*)", text)
            if m:
                data_append(data, "hyphenation", m.group(2))
                commaseparated = m.group(2).split(",")
                if len(commaseparated) > 1:
                    for h in commaseparated:
                        # The second delimiter below looks like a dash, but
                        # it is actually U+2027 (decimal 8231), Hyphenation
                        # Point. Add more delimiters here if needed.
                        parts = re.split(r"-|‧", h.strip())
                        data_append(
                            data, "hyphenations", Hyphenation(parts=parts)
                        )
                else:
                    data_append(
                        data,
                        "hyphenations",
                        Hyphenation(parts=m.group(2).split(sep="-")),
                    )
                # have_pronunciations = True

            # See if it contains a word prefix restricting which forms the
            # pronunciation applies to (see amica/Latin) and/or parenthesized
            # tags.
            m = re.match(
                r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text
            )
            if m:
                prefix = m.group(2) or ""
                tagstext = m.group(3)
                text = text[m.end() :]
            else:
                m = re.match(r"^[*#\s]*([-\w]+):\s+", text)
                if m:
                    prefix = m.group(1)
                    tagstext = ""
                    text = text[m.end() :]
                else:
                    # Spanish has tags before pronunciations, e.g.
                    # aceite/Spanish
                    m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text)
                    if m:
                        tagstext = m.group(1)
                        text = m.group(2)
                    else:
                        # No prefix. In this case, we inherit the prefix
                        # from the previous entry. This particularly
                        # applies to nested audio files.
                        tagstext = ""
            if tagstext:
                earlier_base_data = {}
                parse_pronunciation_tags(wxr, tagstext, earlier_base_data)

            # Find romanizations from the pronunciation section (routinely
            # produced for Korean by {{ko-IPA}})
            for m in re.finditer(pron_romanization_re, text):
                prefix = m.group(1)
                w = m.group(2).strip()
                tag = pron_romanizations[prefix]
                form = {"form": w, "tags": tag.split()}
                data_append(data, "forms", form)
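
            # Illustrative: in text like "IPA: /fuː/ [fʊː] Rhymes: -uː" the
            # regex below picks out "/fuː/" and "[fʊː]"; spans containing a
            # comma are rejected, and file links are filtered out in the
            # loop body.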

            # Find IPA pronunciations
            for m in re.finditer(
                r"(?m)/[^][\n/,]+?/|\[[^]\n0-9,/][^],/]*?\]", text
            ):
                v = m.group(0)
                # The regexp above can match file links. Skip them.
                if v.startswith("[[File:"):
                    continue
                if v == "/wiki.local/":
                    continue
                if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text:
                    m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text)
                    assert m
                    idx = int(m.group(1))
                    if idx >= len(audios):
                        continue
                    if not audios[idx].get("audio-ipa"):
                        audios[idx]["audio-ipa"] = v
                    if prefix:
                        audios[idx]["form"] = prefix
                else:
                    if earlier_base_data:
                        pron = deepcopy(earlier_base_data)
                        pron[field] = v
                    else:
                        pron = {field: v}  # type: ignore[misc]
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if prefix:
                        pron["form"] = prefix
                    if active_pos:
                        pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    data_append(data, "sounds", pron)
                    # have_pronunciations = True

            # XXX what about {{hyphenation|...}} and {{hyph|...}}?
            # Those used to be stored under "hyphenation".

        # Add data that was collected in template_fn
        for audio in audios:
            if "audio" in audio:
                # Compute audio file URLs
                fn = audio["audio"]
                # Strip certain characters, e.g., left-to-right mark
                fn = re.sub(r"[\u200f\u200e]", "", fn)
                fn = fn.strip()
                fn = urllib.parse.unquote(fn)
                # First character is usually uppercased
                if re.match(r"^[a-z][a-z]+", fn):
                    fn = fn[0].upper() + fn[1:]
                if fn in wxr.config.redirects:
                    fn = wxr.config.redirects[fn]
                # File extension is lowercased
                # XXX some words seem to need this, some don't seem to
                # have this??? what is the exact rule?
                # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn)
                # Spaces are converted to underscores
                fn = re.sub(r"\s+", "_", fn)
                # Compute hash digest part
                h = hashlib.md5()
                hname = fn.encode("utf-8")
                h.update(hname)
                digest = h.hexdigest()
                # Quote filename for URL
                qfn = urllib.parse.quote(fn)
                # For safety when writing files
                qfn = qfn.replace("/", "__slash__")
                if re.search(r"(?i)\.(ogg|oga)$", fn):
                    ogg = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
                    )
                else:
                    ogg = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/transcoded/"
                        "{}/{}/{}/{}.ogg".format(
                            digest[:1], digest[:2], qfn, qfn
                        )
                    )
                if re.search(r"(?i)\.(mp3)$", fn):
                    mp3 = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
                    )
                else:
                    mp3 = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/transcoded/"
                        "{}/{}/{}/{}.mp3".format(
                            digest[:1], digest[:2], qfn, qfn
                        )
                    )
                audio["ogg_url"] = ogg
                audio["mp3_url"] = mp3
                if active_pos:
                    audio["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
            if audio not in data.get("sounds", ()):
                data_append(data, "sounds", audio)
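
        # Illustrative (hypothetical filename): for fn == "En-us-foo.ogg",
        # the MD5 hex digest of the filename picks the shard directories,
        # giving an original-file URL of the form
        #     https://upload.wikimedia.org/wikipedia/commons/X/XY/En-us-foo.ogg
        # and a transcoded MP3 URL of the form
        #     https://upload.wikimedia.org/wikipedia/commons/transcoded/X/XY/En-us-foo.ogg/En-us-foo.ogg.mp3
        # where X and XY are the first one and two hex digits of the digest.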

        # if audios:
        #     have_pronunciations = True
        audios = []

        data_extend(data, "hyphenations", hyphenations)
        hyphenations = []

    ## I have commented out the otherwise-unused have_pronunciations
    ## toggles; uncomment them to use this debug print.
    # if not have_pronunciations and not have_panel_templates:
    #     wxr.wtp.debug("no pronunciations found from pronunciation section",
    #                   sortid="pronunciations/533")