# src/wiktextract/extractor/en/pronunciation.py
import hashlib
import re
import urllib.parse
from copy import deepcopy
from typing import Iterator, Optional, Union

from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...clean import clean_value
from ...datautils import data_append, data_extend, split_at_comma_semi
from ...page import LEVEL_KINDS, clean_node, is_panel_template
from ...tags import valid_tags
from ...wxr_context import WiktextractContext
from .form_descriptions import classify_desc, parse_pronunciation_tags
from .parts_of_speech import part_of_speech_map
from .type_utils import SoundData, TemplateArgs, WordData
from .zh_pron_tags import ZH_PRON_TAGS

# Prefixes, tags, and regexp for finding romanizations from the pronunciation
# section
pron_romanizations = {
    " Revised Romanization ": "romanization revised",
    " Revised Romanization (translit.) ": "romanization revised transliteration",
    " McCune-Reischauer ": "McCune-Reischauer romanization",
    " McCune–Reischauer ": "McCune-Reischauer romanization",
    " Yale Romanization ": "Yale romanization",
}
pron_romanization_re = re.compile(
    "(?m)^("
    + "|".join(
        re.escape(x)
        for x in sorted(pron_romanizations.keys(), key=len, reverse=True)
    )
    + ")([^\n]+)"
)
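
# A hedged illustration (not in the original file): the dict keys above keep
# their surrounding spaces because they match padded labels in the cleaned
# output of Korean pronunciation templates.  Assuming a cleaned row such as
# " Revised Romanization  hangugeo", pron_romanization_re would capture
#   m.group(1) == " Revised Romanization "  -> tag string "romanization revised"
#   m.group(2) == " hangugeo"               -> the romanized form (stripped later)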

IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$"
IPA_EXTRACT_RE = re.compile(IPA_EXTRACT)
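
# Hedged example of what IPA_EXTRACT_RE captures (hypothetical input, not
# from the source): matching "(UK) IPA⁽ᵏᵉʸ⁾: /fuː/ (dated)" yields roughly
#   group(2) == "UK"              (qualifier for the whole line)
#   group(3) == "IPA⁽ᵏᵉʸ⁾"        (which template produced the line)
#   group(4) == "/fuː/ (dated)"   (full body, used when there is no qq= arg)
#   group(5) == "/fuː/"           (body without the trailing qualifier)
#   group(7) == "dated"           (trailing qualifier, used with qq=)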


def extract_pron_template(
    wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str
) -> Optional[tuple[SoundData, list[SoundData]]]:
    """In post_template_fn, this is used to handle all enPR and IPA templates
    so that we can leave breadcrumbs in the text that can later be handled
    there. We return a `base_data` so that if there are two
    or more templates on the same line, like this:
    (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/
    then we can apply base_data fields to other templates, too, if needed.
    """
    cleaned = clean_value(wxr, expanded)
    # print(f"extract_pron_template input: {tname=} {expanded=}-> {cleaned=}")
    m = IPA_EXTRACT_RE.match(cleaned)
    if not m:
        wxr.wtp.error(
            f"Text cannot match IPA_EXTRACT_RE regex: "
            f"{cleaned=}, {tname=}, {targs=}",
            sortid="en/pronunciation/54",
        )
        return None
    # for i, group in enumerate(m.groups()):
    #     print(i + 1, repr(group))
    main_qual = m.group(2) or ""
    if "qq" in targs:
        # The template has been given a qualifier that applies to every
        # entry, but it also appears at the end of the line, where it can
        # be confused with the post-qualifier of a single entry in the
        # style of "... /ipa3/ (foo) (bar)"; foo might not be present, so
        # bar would look like it applies only to `/ipa3/`.
        pron_body = m.group(5)
        post_qual = m.group(7)
    else:
        pron_body = m.group(4)
        post_qual = ""

    if not pron_body:
        wxr.wtp.error(
            f"Regex failed to find 'body' from {cleaned=}",
            sortid="en/pronunciation/81",
        )
        return None

    base_data: SoundData = {}
    if main_qual:
        parse_pronunciation_tags(wxr, main_qual, base_data)
    if post_qual:
        parse_pronunciation_tags(wxr, post_qual, base_data)
    # This base_data is used as the base copy for all entries from this
    # template, but it is also returned so that its contents may be applied
    # to other templates on the same line.
    # print(f"{base_data=}")

    sound_datas: list[SoundData] = []

    parts: list[list[str]] = [[]]
    inside = 0
    current: list[str] = []
    for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)):
        # Split the line on commas and semicolons outside of parens.
        # This gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)"
        # print(f"  {i=}, {p=}")
        comp = p.strip()
        if not p:
            continue
        if comp == "(":
            if not inside and i > 0:
                if stripped := "".join(current).strip():
                    parts[-1].append(stripped)  # type:ignore[arg-type]
            current = [p]
            inside += 1
            continue
        if comp == ")":
            inside -= 1
            if not inside:
                if stripped := "".join(current).strip():
                    current.append(p)
                    parts[-1].append("".join(current).strip())  # type:ignore[arg-type]
                    current = []
            continue
        if not inside and comp in (",", ";"):
            if stripped := "".join(current).strip():
                parts[-1].append(stripped)  # type:ignore[arg-type]
                current = []
            parts.append([])
            continue
        current.append(p)
    if current:
        parts[-1].append("".join(current).strip())
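    # Rough walk-through of the split above (hypothetical input, not from
    # the source): pron_body == "/fuː/ (UK), /bɑː/" splits into
    # delimiter-preserving pieces that are regrouped as
    #   parts == [["/fuː/", "(UK)"], ["/bɑː/"]]
    # i.e. one sub-list per comma/semicolon-separated entry, with any
    # parenthesized qualifier kept as its own item.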

    # print(f">>>>>> {parts=}")
    new_parts: list[list[str]] = []
    for entry in parts:
        if not entry:
            continue
        new_entry: list[str] = []
        i1: int = entry[0].startswith("(") and entry[0].endswith(")")
        if i1:
            new_entry.append(entry[0][1:-1].strip())
        else:
            new_entry.append("")
        i2: int = (
            entry[-1].startswith("(")
            and entry[-1].endswith(")")
            and len(entry) > 1
        )
        if i2 == 0:
            i2 = len(entry)
        else:
            i2 = -1
        new_entry.append("".join(entry[i1:i2]).strip())
        if not new_entry[-1]:
            wxr.wtp.error(
                f"Missing IPA/enPR sound data between qualifiers? {entry=}",
                sortid="en/pronunciation/153",
            )
        if i2 == -1:
            new_entry.append(entry[-1][1:-1].strip())
        else:
            new_entry.append("")
        new_parts.append(new_entry)

    # print(f">>>>> {new_parts=}")
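    # Continuing the hypothetical example above: each entry is normalized
    # into a [pre-qualifier, body, post-qualifier] triple, so
    #   new_parts == [["", "/fuː/", "UK"], ["", "/bɑː/", ""]]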

    for part in new_parts:
        sd = deepcopy(base_data)
        if part[0]:
            parse_pronunciation_tags(wxr, part[0], sd)
        if part[2]:
            parse_pronunciation_tags(wxr, part[2], sd)
        if tname == "enPR":
            sd["enpr"] = part[1]
        else:
            sd["ipa"] = part[1]
        sound_datas.append(sd)

    # print(f"BASE_DATA: {base_data}")
    # print(f"SOUND_DATAS: {sound_datas=}")

    return base_data, sound_datas
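
# End-to-end sketch (hypothetical values, assuming parse_pronunciation_tags
# maps "UK" and "rare" to those plain tags): for a template whose expansion
# cleans to "(UK) IPA⁽ᵏᵉʸ⁾: /fuː/, /bɑː/ (rare)", the return value would be
# roughly
#   base_data == {"tags": ["UK"]}
#   sound_datas == [{"tags": ["UK"], "ipa": "/fuː/"},
#                   {"tags": ["UK", "rare"], "ipa": "/bɑː/"}]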


def parse_pronunciation(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordData,
    etym_data: WordData,
    have_etym: bool,
    base_data: WordData,
    lang_code: str,
) -> None:
    """Parses the pronunciation section from a language section on a
    page."""
    assert isinstance(node, WikiNode)
    if node.kind in LEVEL_KINDS:
        contents = node.children
    else:
        contents = [node]
    # Remove subsections, such as Usage notes. They may contain IPAchar
    # templates in running text, and we do not want to extract IPAs from
    # those.
    # Filter out only LEVEL_KINDS; 'or' is doing the heavy lifting here:
    # let non-WikiNodes through, then let WikiNodes that are not
    # LEVEL_KINDS through.
    contents = [
        x
        for x in contents
        if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
    ]
    if not any(
        isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents
    ):
        # expand all templates (except zh-pron, which is handled separately)
        new_contents: list[Union[str, WikiNode]] = []
        for lst in contents:
            if (
                isinstance(lst, TemplateNode)
                and isinstance(lst.largs[0][0], str)
                and lst.largs[0][0].strip() != "zh-pron"
            ):
                temp = wxr.wtp.node_to_wikitext(lst)
                temp = wxr.wtp.expand(temp)
                temp_parsed = wxr.wtp.parse(temp)
                new_contents.extend(temp_parsed.children)
            else:
                new_contents.append(lst)
        contents = new_contents

    if have_etym and data is base_data:
        data = etym_data
    pron_templates: list[tuple[SoundData, list[SoundData]]] = []
    audios = []
    have_panel_templates = False

    def parse_pronunciation_template_fn(
        name: str, ht: TemplateArgs
    ) -> Optional[str]:
        # _template_fn handles templates *before* they are expanded;
        # this allows for special handling before all the work needed
        # for expansion is done.
        nonlocal have_panel_templates
        if is_panel_template(wxr, name):
            have_panel_templates = True
            return ""
        if name == "audio":
            filename = ht.get(2) or ""
            desc = ht.get(3) or ""
            desc = clean_node(wxr, None, [desc])
            audio: SoundData = {"audio": filename.strip()}
            if desc:
                audio["text"] = desc
            m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc)
            skip = False
            if m:
                par = m.group(1)
                cls = classify_desc(par)
                if cls == "tags":
                    parse_pronunciation_tags(wxr, par, audio)
                else:
                    skip = True
            if skip:
                return ""
            audios.append(audio)
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name == "audio-IPA":
            filename = ht.get(2) or ""
            ipa = ht.get(3) or ""
            dial = ht.get("dial")
            audio = {"audio": filename.strip()}
            if dial:
                dial = clean_node(wxr, None, [dial])
                audio["text"] = dial
            if ipa:
                audio["audio-ipa"] = ipa
            audios.append(audio)
            # The problem with these IPAs is that they often just describe
            # what's in the sound file, rather than giving the pronunciation
            # of the word alone. It is common for audio files to contain
            # multiple pronunciations or articles in the same file, and then
            # this IPA often describes what is in the file.
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name == "audio-pron":
            filename = ht.get(2) or ""
            ipa = ht.get("ipa") or ""
            dial = ht.get("dial")
            country = ht.get("country")
            audio = {"audio": filename.strip()}
            if dial:
                dial = clean_node(wxr, None, [dial])
                audio["text"] = dial
                parse_pronunciation_tags(wxr, dial, audio)
            if country:
                parse_pronunciation_tags(wxr, country, audio)
            if ipa:
                audio["audio-ipa"] = ipa
            audios.append(audio)
            # XXX do we really want to extract pronunciations from these?
            # Or are they spurious / just describing what is in the
            # audio file?
            # if ipa:
            #     pron = {"ipa": ipa}
            #     if dial:
            #         parse_pronunciation_tags(wxr, dial, pron)
            #     if country:
            #         parse_pronunciation_tags(wxr, country, pron)
            #     data_append(data, "sounds", pron)
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        return None
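
    # Sketch of the cookie mechanism above (hypothetical template call):
    # a line containing {{audio|en|En-us-foo.ogg|Audio (US)}} is cleaned to
    # text containing "__AUDIO_IGNORE_THIS__0__", with the collected
    # SoundData stored in audios[0]; the cookie is matched again in the
    # main loop below so that an IPA found on the same line can be attached
    # as audios[0]["audio-ipa"].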

    def parse_pron_post_template_fn(
        name: str, ht: TemplateArgs, text: str
    ) -> Optional[str]:
        # _post_template_fn handles templates *after* the work to expand
        # them has been done; this is exactly the same as _template_fn,
        # except with the additional expanded text as an input, and
        # possible side-effects from the expansion and recursion (like
        # calling other subtemplates that are handled in _template_fn).
        if is_panel_template(wxr, name):
            return ""
        if name in {
            "q",
            "qualifier",
            "sense",
            "a",
            "accent",
            "l",
            "link",
            "lb",
            "lbl",
            "label",
        }:
            # Kludge: when these templates expand to /.../ or [...],
            # replace the expansion by something safe. This is used
            # to filter spurious IPA-looking expansions that aren't really
            # IPAs. We probably don't care about these templates in the
            # contexts where they expand to something containing these.
            v = re.sub(r'href="[^"]*"', "", text)  # Ignore URLs
            v = re.sub(r'src="[^"]*"', "", v)
            if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v):
                # Note: replacing by empty results in Lua errors that we
                # would rather not have. For example, voi/Middle Vietnamese
                # uses {{a|{{l|vi|...}}}}, and the {{a|...}} will fail
                # if {{l|...}} returns empty.
                return "stripped-by-parse_pron_post_template_fn"
        if name in ("IPA", "enPR"):
            # Extract the data from IPA and enPR templates (same underlying
            # template) and replace them in-text with a magic cookie that
            # can later be used to refer to the data's index inside
            # pron_templates.
            if pron_t := extract_pron_template(wxr, name, ht, text):
                pron_templates.append(pron_t)
                return f"__PRON_TEMPLATE_{len(pron_templates)-1}__"
        return text
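
    # Hedged illustration of the magic cookie (hypothetical line): after
    # expansion, "enPR: fo͞o, IPA⁽ᵏᵉʸ⁾: /fuː/" becomes
    # "__PRON_TEMPLATE_0__, __PRON_TEMPLATE_1__", with pron_templates[0]
    # and pron_templates[1] holding the extracted (base_data, sound_datas)
    # tuples; the cookies are split back out in the main loop below.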

    def parse_expanded_zh_pron(
        node: WikiNode,
        parent_hdrs: list[str],
        specific_hdrs: list[str],
        unknown_header_tags: set[str],
    ) -> None:
        def generate_pron(
            v: str, new_parent_hdrs: list[str], new_specific_hdrs: list[str]
        ) -> Optional[SoundData]:
            pron: SoundData = {}
            pron["tags"] = []
            pron["zh-pron"] = v.strip()
            for hdr in new_parent_hdrs + new_specific_hdrs:
                hdr = hdr.strip()
                valid_hdr = re.sub(r"\s+", "-", hdr)
                if hdr in ZH_PRON_TAGS:
                    for tag in ZH_PRON_TAGS[hdr]:
                        if tag not in pron["tags"]:
                            pron["tags"].append(tag)
                elif valid_hdr in valid_tags:
                    if valid_hdr not in pron["tags"]:
                        pron["tags"].append(valid_hdr)
                else:
                    unknown_header_tags.add(hdr)
            # convert into normal IPA format if it has the IPA flag
            if "IPA" in pron["tags"]:
                pron["ipa"] = v
                del pron["zh-pron"]
                pron["tags"].remove("IPA")
            # convert into IPA but retain the Sinological-IPA tag
            elif "Sinological-IPA" in pron["tags"]:
                pron["ipa"] = v
                del pron["zh-pron"]

            if not (pron.get("zh-pron") or pron.get("ipa")):
                return None
            return pron
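
        # Illustrative sketch (hypothetical headers, and assuming
        # ZH_PRON_TAGS maps them to the same-named tags): for v == "pʰin˥"
        # with headers ["Mandarin", "Sinological-IPA"], generate_pron would
        # return roughly
        #   {"tags": ["Mandarin", "Sinological-IPA"], "ipa": "pʰin˥"}
        # i.e. the Sinological-IPA branch moves the value from "zh-pron"
        # to "ipa" while keeping the tag.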

        if isinstance(node, list):
            for item in node:
                parse_expanded_zh_pron(
                    item, parent_hdrs, specific_hdrs, unknown_header_tags
                )
            return
        if not isinstance(node, WikiNode):
            return
        if node.kind != NodeKind.LIST:
            for item in node.children:
                parse_expanded_zh_pron(
                    item, parent_hdrs, specific_hdrs, unknown_header_tags
                )
            return
        for item in node.children:
            assert isinstance(item, WikiNode)
            assert item.kind == NodeKind.LIST_ITEM
            base_item = list(
                x
                for x in item.children
                if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
            )
            text = clean_node(wxr, None, base_item)
            # print(f"{parent_hdrs} zhpron: {text}")  # XXX remove me
            text = re.sub(r"(?s)\(Note:.*?\)", "", text)
            # Kludge to clean up text like
            # '(Standard Chinese, erhua-ed) (旋兒/旋儿)' where
            # the hanzi are examples
            hanzi_m = re.match(r"\s*(\([^()]*\))\s*\(([^()]*)\)\s*$", text)
            if hanzi_m:
                if re.search("[\u4e00-\u9fff]", hanzi_m.group(2)):
                    text = hanzi_m.group(1)
            new_parent_hdrs = list(parent_hdrs)
            new_specific_hdrs = list(specific_hdrs)
            # look no further, here be dragons...

            if ": " in text or "：" in text:
                parts = re.split(r": |：", text)
                m = re.match(
                    r"\s*\((([^():]+)\s*(:|：)?\s*([^():]*))\)\s*$", text
                )
                # Matches lines with stuff like "(Hokkien: Xiamen, Quanzhou)"
                # that gets thrown into new_parent_hdrs
                if m:
                    new_parent_hdrs.append(m.group(2).strip())
                    for hdr in m.group(4).split(","):
                        new_specific_hdrs.append(hdr.strip())
                else:
                    # if "Zhangzhou" in text:
                    #     print("\nFOUND IN:", text, "\n")
                    #     print("PARTS: ", repr(parts))
                    # print(f"    PARTS: {parts}")
                    extra_tags = parts[0]
                    # Kludge to handle how (Hokkien: Locations) and
                    # IPA (Specific Location) interact; this is why
                    # specific_hdrs was introduced to the soup: to
                    # specify which are the actual hierarchical higher-
                    # level tags (Min'nan, Hokkien, etc.) that should
                    # always be present, and to use specific_hdrs for
                    # the list of misc sublocations and subdialects
                    # that can be overridden by more specific stuff
                    # later.
                    m = re.match(r"\s*IPA\s*\((.*)\)\s*$", extra_tags)
                    if m:
                        new_parent_hdrs.append("IPA")
                        new_specific_hdrs = [
                            s.strip() for s in m.group(1).split(",")
                        ]
                        extra_tags = extra_tags[m.end() :]

                    m = re.match(r"\s*\([^()]*,[^()]*\)\s*$", extra_tags)
                    if m:
                        extra_tags = extra_tags.strip()[1:-1]  # remove parens
                        new_parent_hdrs.extend(
                            s.strip() for s in extra_tags.split(",")
                        )
                    elif extra_tags:
                        new_parent_hdrs.append(extra_tags)

                v = ":".join(parts[1:])

                # check for phrases
                if ("," in (wxr.wtp.title or "")) and len(
                    v.split(" ")
                ) + v.count(",") == len(wxr.wtp.title or ""):
                    # This just captures exact matches where we have the
                    # pronunciation of the whole phrase and nothing else.
                    # Split on spaces; because we are not splitting next to
                    # a comma, we add the count of commas so that the total
                    # syncs up with the unicode string length of the
                    # original hanzi, where the comma is a separate
                    # character (unlike in the split list, where it is part
                    # of a space-separated string, like "teo⁴,").
                    vals = [v]
                    pron = generate_pron(
                        v, new_parent_hdrs, new_specific_hdrs
                    )

                    if pron:
                        pron["tags"] = list(sorted(pron["tags"]))
                        if pron not in data.get("sounds", ()):
                            data_append(data, "sounds", pron)
                elif "→" in v:
                    vals = re.split("→", v)
                    for v in vals:
                        pron = generate_pron(
                            v, new_parent_hdrs, new_specific_hdrs
                        )
                        if pron:
                            m = re.match(
                                r"([^()]+)\s*\(toneless"
                                r" final syllable variant\)\s*",
                                v,
                            )
                            if m:
                                pron["zh-pron"] = m.group(1).strip()
                                pron["tags"].append(
                                    "toneless-final-syllable-variant"
                                )

                            pron["tags"] = list(sorted(pron["tags"]))
                            if pron not in data.get("sounds", ()):
                                data_append(data, "sounds", pron)
                else:
                    # split alternative pronunciations on "," or " / "
                    vals = re.split(r"\s*,\s*|\s+/\s+", v)
                    new_vals = []
                    for v2 in vals:
                        if v2.startswith("/") and v2.endswith("/"):
                            # items like /kɛiŋ²¹³⁻⁵³ ^((Ø-))ŋuaŋ³³/
                            new_vals.append(v2)
                        else:
                            # otherwise split on parentheses
                            new_vals.extend(re.split(r"[()]", v2))
                    vals = new_vals
                    for v in vals:
                        pron = generate_pron(
                            v, new_parent_hdrs, new_specific_hdrs
                        )
                        if pron:
                            pron["tags"] = list(sorted(pron["tags"]))
                            if pron not in data.get("sounds", ()):
                                data_append(data, "sounds", pron)
            else:
                new_parent_hdrs.append(text)

            for x in item.children:
                if isinstance(x, WikiNode) and x.kind == NodeKind.LIST:
                    parse_expanded_zh_pron(
                        x, new_parent_hdrs, specific_hdrs, unknown_header_tags
                    )

    def parse_chinese_pron(
        contents: Union[list[Union[WikiNode, str]], WikiNode, str],
        unknown_header_tags: set[str],
    ) -> None:
        if isinstance(contents, list):
            for item in contents:
                parse_chinese_pron(item, unknown_header_tags)
            return
        if not isinstance(contents, WikiNode):
            return
        if contents.kind != NodeKind.TEMPLATE:
            for item in contents.children:
                parse_chinese_pron(item, unknown_header_tags)
            return
        if (
            len(contents.largs[0]) == 1
            and isinstance(contents.largs[0][0], str)
            and contents.largs[0][0].strip() == "zh-pron"
        ):
            src = wxr.wtp.node_to_wikitext(contents)
            expanded = wxr.wtp.expand(src, templates_to_expand={"zh-pron"})
            parsed = wxr.wtp.parse(expanded)
            parse_expanded_zh_pron(parsed, [], [], unknown_header_tags)
        else:
            for item in contents.children:
                parse_chinese_pron(item, unknown_header_tags)
            return

    if lang_code == "zh":
        unknown_header_tags: set[str] = set()
        parse_chinese_pron(contents, unknown_header_tags)
        for hdr in unknown_header_tags:
            wxr.wtp.debug(
                f"Zh-pron header not found in zh_pron_tags or tags: "
                f"{repr(hdr)}",
                sortid="pronunciations/296/20230324",
            )

    def flattened_tree(
        lines: list[Union[WikiNode, str]],
    ) -> Iterator[Union[WikiNode, str]]:
        assert isinstance(lines, list)
        for line in lines:
            yield from flattened_tree1(line)

    def flattened_tree1(
        node: Union[WikiNode, str],
    ) -> Iterator[Union[WikiNode, str]]:
        assert isinstance(node, (WikiNode, str))
        if isinstance(node, str):
            yield node
            return
        elif node.kind == NodeKind.LIST:
            for item in node.children:
                yield from flattened_tree1(item)
        elif node.kind == NodeKind.LIST_ITEM:
            new_children = []
            sublist = None
            for child in node.children:
                if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                    sublist = child
                else:
                    new_children.append(child)
            node.children = new_children
            node.sarg = "*"
            yield node
            if sublist:
                yield from flattened_tree1(sublist)
        else:
            yield node
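
    # Sketch of the flattening (hypothetical tree): a LIST whose LIST_ITEM
    # contains a nested sub-LIST is yielded as a flat sequence of LIST_ITEM
    # nodes, each with sarg "*" and its sublist detached, so that the main
    # loop below can treat every pronunciation line uniformly.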

    # XXX Do not use flattened_tree more than once here, for example for
    # debug printing... The underlying data is changed, and the separated
    # sublists disappear.

    # Kludge for templates that generate several lines, but haven't
    # been caught by earlier kludges...
    def split_cleaned_node_on_newlines(
        contents: list[Union[WikiNode, str]],
    ) -> Iterator[str]:
        for litem in flattened_tree(contents):
            ipa_text = clean_node(
                wxr,
                data,
                litem,
                template_fn=parse_pronunciation_template_fn,
                post_template_fn=parse_pron_post_template_fn,
            )
            for line in ipa_text.splitlines():
                yield line

    # have_pronunciations = False
    active_pos: Optional[str] = None

    for line in split_cleaned_node_on_newlines(contents):
        # print(f"{line=}")
        prefix: Optional[str] = None
        earlier_base_data: Optional[SoundData] = None
        if not line:
            continue

        split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line)
        for i, text in enumerate(split_templates):
            if not text:
                continue
            # clean up stars ('*') at the start of the line
            text = re.sub(r"^\**\s*", "", text).strip()
            if i == 0:
                # At the start of a line, check for stuff like "Noun:"
                # for active_pos; active_pos is a temporary data field
                # given to each saved SoundData entry which is later
                # used to sort the entries into their respective PoSes.
                m = re.match(r"\s*(\w+\s?\w*)\s*:?\s*", text)
                if m:
                    if (m_lower := m.group(1).lower()) in part_of_speech_map:
                        active_pos = part_of_speech_map[m_lower]["pos"]
                        text = text[m.end() :].strip()
            if not text:
                continue
            if i % 2 == 1:
                # After re.split with a capture group, the captured
                # template indices are at the odd indices of
                # split_templates; the even indices hold the (possibly
                # empty) text around the splitters.
                base_pron_data, first_prons = pron_templates[int(text)]
                if base_pron_data:
                    earlier_base_data = base_pron_data
                    # print(f"Set {earlier_base_data=}")
                elif earlier_base_data is not None:
                    # merge data from an earlier iteration of this loop
                    for pr in first_prons:
                        if "note" in pr and "note" in earlier_base_data:
                            pr["note"] += ";" + earlier_base_data.get(
                                "note", ""
                            )
                        elif "note" in earlier_base_data:
                            pr["note"] = earlier_base_data["note"]
                        if "topics" in earlier_base_data:
                            data_extend(
                                pr, "topics", earlier_base_data["topics"]
                            )
                        if "tags" in pr and "tags" in earlier_base_data:
                            pr["tags"].extend(earlier_base_data["tags"])
                        elif "tags" in earlier_base_data:
                            pr["tags"] = sorted(set(earlier_base_data["tags"]))
                for pr in first_prons:
                    if active_pos:
                        pr["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if pr not in data.get("sounds", ()):
                        data_append(data, "sounds", pr)
                # This pron-template cookie has now been handled
                continue

            if "IPA" in text:
                field = "ipa"
            else:
                # This is used for Rhymes, Homophones, etc.
                field = "other"

            # Check if it contains Japanese "Tokyo" pronunciation with
            # special syntax
            m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text)
            if m:
                pron: SoundData = {field: m.group(1)}  # type: ignore[misc]
                if active_pos:
                    pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                data_append(data, "sounds", pron)
                # have_pronunciations = True
                continue

            # Check if it contains Rhymes
            m = re.match(r"\s*Rhymes?: (.*)", text)
            if m:
                for ending in split_at_comma_semi(m.group(1)):
                    ending = ending.strip()
                    if ending:
                        pron = {"rhymes": ending}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True
                continue

            # Check if it contains homophones
            m = re.search(r"(?m)\bHomophones?: (.*)", text)
            if m:
                for w in split_at_comma_semi(m.group(1)):
                    w = w.strip()
                    if w:
                        pron = {"homophone": w}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True
                continue

            # Check if it contains Phonetic hangeul
            m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text)
            if m:
                seen = set()
                for w in m.group(1).split("/"):
                    w = w.strip()
                    if w and w not in seen:
                        seen.add(w)
                        pron = {"hangeul": w}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True

            m = re.search(r"\b(Syllabification|Hyphenation): ([^\s,]*)", text)
            if m:
                data_append(data, "hyphenation", m.group(2))
                # have_pronunciations = True

            # See if it contains a word prefix restricting which forms the
            # pronunciation applies to (see amica/Latin) and/or parenthesized
            # tags.
            m = re.match(
                r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text
            )
            if m:
                prefix = m.group(2) or ""
                tagstext = m.group(3)
                text = text[m.end() :]
            else:
                m = re.match(r"^[*#\s]*([-\w]+):\s+", text)
                if m:
                    prefix = m.group(1)
                    tagstext = ""
                    text = text[m.end() :]
                else:
                    # Spanish has tags before pronunciations,
                    # e.g. aceite/Spanish
                    m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text)
                    if m:
                        tagstext = m.group(1)
                        text = m.group(2)
                    else:
                        # No prefix. In this case, we inherit the prefix
                        # from the previous entry. This particularly
                        # applies to nested Audio files.
                        tagstext = ""
            if tagstext:
                earlier_base_data = {}
                parse_pronunciation_tags(wxr, tagstext, earlier_base_data)

            # Find romanizations from the pronunciation section (routinely
            # produced for Korean by {{ko-IPA}})
            for m in re.finditer(pron_romanization_re, text):
                prefix = m.group(1)
                w = m.group(2).strip()
                tag = pron_romanizations[prefix]
                form = {"form": w, "tags": tag.split()}
                data_append(data, "forms", form)

            # Find IPA pronunciations
            for m in re.finditer(
                r"(?m)/[^][\n/,]+?/" r"|" r"\[[^]\n0-9,/][^],/]*?\]", text
            ):
                v = m.group(0)
                # The regexp above can match file links. Skip them.
                if v.startswith("[[File:"):
                    continue
                if v == "/wiki.local/":
                    continue
                if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text:
                    m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text)
                    assert m
                    idx = int(m.group(1))
                    if idx >= len(audios):
                        continue
                    if not audios[idx].get("audio-ipa"):
                        audios[idx]["audio-ipa"] = v
                    if prefix:
                        audios[idx]["form"] = prefix
                else:
                    if earlier_base_data:
                        pron = deepcopy(earlier_base_data)
                        pron[field] = v
                    else:
                        pron = {field: v}  # type: ignore[misc]
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if prefix:
                        pron["form"] = prefix
                    if active_pos:
                        pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    data_append(data, "sounds", pron)
            # have_pronunciations = True

            # XXX what about {{hyphenation|...}} and {{hyph|...}}?
            # Those used to be stored under "hyphenation".

        # Add data that was collected in template_fn
        for audio in audios:
            if "audio" in audio:
                # Compute audio file URLs
                fn = audio["audio"]
                # Strip certain characters, e.g., left-to-right mark
                fn = re.sub(r"[\u200f\u200e]", "", fn)
                fn = fn.strip()
                fn = urllib.parse.unquote(fn)
                # First character is usually uppercased
                if re.match(r"^[a-z][a-z]+", fn):
                    fn = fn[0].upper() + fn[1:]
                if fn in wxr.config.redirects:
                    fn = wxr.config.redirects[fn]
                # File extension is lowercased
                # XXX some words seem to need this, some don't seem to
                # have this??? what is the exact rule?
                # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn)
                # Spaces are converted to underscores
                fn = re.sub(r"\s+", "_", fn)
                # Compute hash digest part
                h = hashlib.md5()
                hname = fn.encode("utf-8")
                h.update(hname)
                digest = h.hexdigest()
                # Quote filename for URL
                qfn = urllib.parse.quote(fn)
                # For safety when writing files
                qfn = qfn.replace("/", "__slash__")
                if re.search(r"(?i)\.(ogg|oga)$", fn):
                    ogg = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
                    )
                else:
                    ogg = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/transcoded/"
                        "{}/{}/{}/{}.ogg".format(
                            digest[:1], digest[:2], qfn, qfn
                        )
                    )
                if re.search(r"(?i)\.(mp3)$", fn):
                    mp3 = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
                    )
                else:
                    mp3 = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/transcoded/"
                        "{}/{}/{}/{}.mp3".format(
                            digest[:1], digest[:2], qfn, qfn
                        )
                    )
                audio["ogg_url"] = ogg
                audio["mp3_url"] = mp3
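
                # Hedged example of the URL scheme above (hypothetical
                # digest): for fn == "Foo.ogg" whose md5 hexdigest starts
                # with "ab", ogg_url becomes
                #   https://upload.wikimedia.org/wikipedia/commons/a/ab/Foo.ogg
                # while mp3_url points at the transcoded variant
                #   .../commons/transcoded/a/ab/Foo.ogg/Foo.ogg.mp3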
                if active_pos:
                    audio["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
            if audio not in data.get("sounds", ()):
                data_append(data, "sounds", audio)
        # if audios:
        #     have_pronunciations = True
        audios = []

    ## I have commented out the otherwise unused have_pronunciations
    ## toggles; uncomment them to use this debug print
    # if not have_pronunciations and not have_panel_templates:
    #     wxr.wtp.debug("no pronunciations found from pronunciation section",
    #                   sortid="pronunciations/533")