Coverage for src/wiktextract/extractor/en/pronunciation.py: 52% (510 statements)
coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

import hashlib
import re
import urllib.parse
from copy import deepcopy
from typing import Iterator, Optional, Union

from wikitextprocessor import NodeKind, TemplateNode, WikiNode

from ...clean import clean_value
from ...datautils import data_append, data_extend, split_at_comma_semi
from ...page import LEVEL_KINDS, clean_node, is_panel_template
from ...tags import valid_tags
from ...wxr_context import WiktextractContext
from .form_descriptions import classify_desc, parse_pronunciation_tags
from .parts_of_speech import part_of_speech_map
from .type_utils import SoundData, TemplateArgs, WordData
from .zh_pron_tags import ZH_PRON_TAGS

# Prefixes, tags, and regexp for finding romanizations from the pronunciation
# section
pron_romanizations = {
    " Revised Romanization ": "romanization revised",
    " Revised Romanization (translit.) ": "romanization revised transliteration",
    " McCune-Reischauer ": "McCune-Reischauer romanization",
    " McCune–Reischauer ": "McCune-Reischauer romanization",
    " Yale Romanization ": "Yale romanization",
}
pron_romanization_re = re.compile(
    "(?m)^("
    + "|".join(
        re.escape(x)
        for x in sorted(pron_romanizations.keys(), key=len, reverse=True)
    )
    + ")([^\n]+)"
)
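# Illustrative match (assumed {{ko-IPA}} output, not from the test suite):
# >>> m = pron_romanization_re.match(" Revised Romanization hangug-eo")
# >>> m.group(1), m.group(2)
# (' Revised Romanization ', 'hangug-eo')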

IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$"
IPA_EXTRACT_RE = re.compile(IPA_EXTRACT)
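# Illustrative groups (assumed input): matching
# "(Received Pronunciation) IPA⁽ᵏᵉʸ⁾: /fuː/ (nonstandard)" gives
# group 2 = "Received Pronunciation" (main qualifier),
# group 4 = "/fuː/ (nonstandard)" (whole body),
# group 5 = "/fuː/" and group 7 = "nonstandard" (body + post-qualifier).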


def extract_pron_template(
    wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str
) -> Optional[tuple[SoundData, list[SoundData]]]:
    """In post_template_fn, this is used to handle all enPR and IPA templates
    so that we can leave breadcrumbs in the expanded text that can be handled
    later. We return a `base_data` so that if there are two or more templates
    on the same line, like this:
        (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/
    then we can apply the base_data fields to the other templates, too, if
    needed.
    """
    cleaned = clean_value(wxr, expanded)
    # print(f"extract_pron_template input: {tname=} {expanded=}-> {cleaned=}")
    m = IPA_EXTRACT_RE.match(cleaned)
    if not m:
        wxr.wtp.error(
            f"Text cannot match IPA_EXTRACT_RE regex: "
            f"{cleaned=}, {tname=}, {targs=}",
            sortid="en/pronunciation/54",
        )
        return None
    # for i, group in enumerate(m.groups()):
    #     print(i + 1, repr(group))
    main_qual = m.group(2) or ""
    if "qq" in targs:
        # The template has been given a qualifier that applies to every
        # entry, but which also happens to appear at the end of the line,
        # where it can be confused with the post-qualifier of a single
        # entry in the style of "... /ipa3/ (foo) (bar)"; foo might not be
        # present, so bar could look like it applies only to `/ipa3/`.
        pron_body = m.group(5)
        post_qual = m.group(7)
    else:
        pron_body = m.group(4)
        post_qual = ""

    if not pron_body:
        wxr.wtp.error(
            f"Regex failed to find 'body' from {cleaned=}",
            sortid="en/pronunciation/81",
        )
        return None

    base_data: SoundData = {}
    if main_qual:
        parse_pronunciation_tags(wxr, main_qual, base_data)
    if post_qual:
        parse_pronunciation_tags(wxr, post_qual, base_data)
    # This base_data is used as the base copy for all entries from this
    # template, but it is also returned so that its contents may be applied
    # to other templates on the same line.
    # print(f"{base_data=}")

    sound_datas: list[SoundData] = []

    parts: list[list[str]] = [[]]
    inside = 0
    current: list[str] = []
    for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)):
        # Split the line on commas and semicolons outside of parens.
        # This gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)"
        # print(f"  {i=}, {p=}")
        comp = p.strip()
        if not p:
            continue
        if comp == "(":
            if not inside and i > 0:
                if stripped := "".join(current).strip():
                    parts[-1].append("".join(current).strip())  # type:ignore[arg-type]
            current = [p]
            inside += 1
            continue
        if comp == ")":
            inside -= 1
            if not inside:
                if stripped := "".join(current).strip():
                    current.append(p)
                    parts[-1].append("".join(current).strip())  # type:ignore[arg-type]
                    current = []
            continue
        if not inside and comp in (",", ";"):
            if stripped := "".join(current).strip():
                parts[-1].append(stripped)  # type:ignore[arg-type]
            current = []
            parts.append([])
            continue
        current.append(p)
    if current:
        parts[-1].append("".join(current).strip())
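    # Illustrative result (assumed input): for pron_body
    # "(UK) /fuː/, /fʊ/ (rare)" the loop above produces
    # parts == [["(UK)", "/fuː/"], ["/fʊ/", "(rare)"]].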

    # print(f">>>>>> {parts=}")
    new_parts: list[list[str]] = []
    for entry in parts:
        if not entry:
            continue
        new_entry: list[str] = []
        i1: int = entry[0].startswith("(") and entry[0].endswith(")")
        if i1:
            new_entry.append(entry[0][1:-1].strip())
        else:
            new_entry.append("")
        i2: int = (
            entry[-1].startswith("(")
            and entry[-1].endswith(")")
            and len(entry) > 1
        )
        if i2 == 0:
            i2 = len(entry)
        else:
            i2 = -1
        new_entry.append("".join(entry[i1:i2]).strip())
        if not new_entry[-1]:
            wxr.wtp.error(
                f"Missing IPA/enPR sound data between qualifiers? {entry=}",
                sortid="en/pronunciation/153",
            )
        if i2 == -1:
            new_entry.append(entry[-1][1:-1].strip())
        else:
            new_entry.append("")
        new_parts.append(new_entry)

    # print(f">>>>> {new_parts=}")
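    # Each new_parts entry is now a [pre-qualifier, body, post-qualifier]
    # triple; continuing the example above:
    # [["UK", "/fuː/", ""], ["", "/fʊ/", "rare"]].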

    for part in new_parts:
        sd = deepcopy(base_data)
        if part[0]:
            parse_pronunciation_tags(wxr, part[0], sd)
        if part[2]:
            parse_pronunciation_tags(wxr, part[2], sd)
        if tname == "enPR":
            sd["enpr"] = part[1]
        else:
            sd["ipa"] = part[1]
        sound_datas.append(sd)

    # print(f"BASE_DATA: {base_data}")
    # print(f"SOUND_DATAS: {sound_datas=}")

    return base_data, sound_datas


def parse_pronunciation(
    wxr: WiktextractContext,
    node: WikiNode,
    data: WordData,
    etym_data: WordData,
    have_etym: bool,
    base_data: WordData,
    lang_code: str,
) -> None:
    """Parses the pronunciation section from a language section on a
    page."""
    assert isinstance(node, WikiNode)
    if node.kind in LEVEL_KINDS:
        contents = node.children
    else:
        contents = [node]
    # Remove subsections, such as Usage notes. They may contain IPAchar
    # templates in running text, and we do not want to extract IPAs from
    # those.
    # Filter out only LEVEL_KINDS; the 'or' is doing the heavy lifting here:
    # let non-WikiNodes through, then let through WikiNodes that are
    # not LEVEL_KINDS.
    contents = [
        x
        for x in contents
        if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
    ]
    if not any(
        isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents
    ):
        # expand all templates
        new_contents: list[Union[str, WikiNode]] = []
        for lst in contents:
            if (
                isinstance(lst, TemplateNode)
                and isinstance(lst.largs[0][0], str)
                and lst.largs[0][0].strip() != "zh-pron"
            ):
                temp = wxr.wtp.node_to_wikitext(lst)
                temp = wxr.wtp.expand(temp)
                temp_parsed = wxr.wtp.parse(temp)
                new_contents.extend(temp_parsed.children)
            else:
                new_contents.append(lst)
        contents = new_contents

    if have_etym and data is base_data:
        data = etym_data
    pron_templates: list[tuple[SoundData, list[SoundData]]] = []
    audios = []
    have_panel_templates = False

    def parse_pronunciation_template_fn(
        name: str, ht: TemplateArgs
    ) -> Optional[str]:
        # _template_fn handles templates *before* they are expanded;
        # this allows for special handling before all the work needed
        # for expansion is done.
        nonlocal have_panel_templates
        if is_panel_template(wxr, name):
            have_panel_templates = True
            return ""
        if name == "audio":
            filename = ht.get(2) or ""
            desc = ht.get(3) or ""
            desc = clean_node(wxr, None, [desc])
            audio: SoundData = {"audio": filename.strip()}
            if desc:
                audio["text"] = desc
            m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc)
            skip = False
            if m:
                par = m.group(1)
                cls = classify_desc(par)
                if cls == "tags":
                    parse_pronunciation_tags(wxr, par, audio)
                else:
                    skip = True
            if skip:
                return ""
            audios.append(audio)
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
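        # E.g. the first {{audio}} on a page is replaced in the text by
        # "__AUDIO_IGNORE_THIS__0__", with its data stored in audios[0];
        # the cookie is resolved when IPA is matched to audio further below.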
265 if name == "audio-IPA": 265 ↛ 266line 265 didn't jump to line 266 because the condition on line 265 was never true
266 filename = ht.get(2) or ""
267 ipa = ht.get(3) or ""
268 dial = ht.get("dial")
269 audio = {"audio": filename.strip()}
270 if dial:
271 dial = clean_node(wxr, None, [dial])
272 audio["text"] = dial
273 if ipa:
274 audio["audio-ipa"] = ipa
275 audios.append(audio)
276 # The problem with these IPAs is that they often just describe
277 # what's in the sound file, rather than giving the pronunciation
278 # of the word alone. It is common for audio files to contain
279 # multiple pronunciations or articles in the same file, and then
280 # this IPA often describes what is in the file.
281 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
282 if name == "audio-pron": 282 ↛ 283line 282 didn't jump to line 283 because the condition on line 282 was never true
283 filename = ht.get(2) or ""
284 ipa = ht.get("ipa") or ""
285 dial = ht.get("dial")
286 country = ht.get("country")
287 audio = {"audio": filename.strip()}
288 if dial:
289 dial = clean_node(wxr, None, [dial])
290 audio["text"] = dial
291 parse_pronunciation_tags(wxr, dial, audio)
292 if country:
293 parse_pronunciation_tags(wxr, country, audio)
294 if ipa:
295 audio["audio-ipa"] = ipa
296 audios.append(audio)
297 # XXX do we really want to extract pronunciations from these?
298 # Or are they spurious / just describing what is in the
299 # audio file?
300 # if ipa:
301 # pron = {"ipa": ipa}
302 # if dial:
303 # parse_pronunciation_tags(wxr, dial, pron)
304 # if country:
305 # parse_pronunciation_tags(wxr, country, pron)
306 # data_append(data, "sounds", pron)
307 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
308 return None

    def parse_pron_post_template_fn(
        name: str, ht: TemplateArgs, text: str
    ) -> Optional[str]:
        # _post_template_fn handles templates *after* the work to expand
        # them has been done; this is exactly the same as _template_fn,
        # except with the additional expanded text as an input, and
        # possible side-effects from the expansion and recursion (like
        # calling other subtemplates that are handled in _template_fn).
        if is_panel_template(wxr, name):
            return ""
        if name in {
            "q",
            "qualifier",
            "sense",
            "a",
            "accent",
            "l",
            "link",
            "lb",
            "lbl",
            "label",
        }:
            # Kludge: when these templates expand to /.../ or [...],
            # replace the expansion by something safe. This is used
            # to filter spurious IPA-looking expansions that aren't really
            # IPAs. We probably don't care about these templates in the
            # contexts where they expand to something containing these.
            v = re.sub(r'href="[^"]*"', "", text)  # Ignore URLs
            v = re.sub(r'src="[^"]*"', "", v)
            if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v):
                # Note: replacing by empty results in Lua errors that we
                # would rather not have. For example, voi/Middle Vietnamese
                # uses {{a|{{l|vi|...}}}}, and the {{a|...}} will fail
                # if {{l|...}} returns empty.
                return "stripped-by-parse_pron_post_template_fn"
        if name in ("IPA", "enPR"):
            # Extract the data from IPA and enPR templates (same underlying
            # template) and replace them in-text with a magic cookie that
            # can later be used to refer to the data's index inside
            # pron_templates.
            if pron_t := extract_pron_template(wxr, name, ht, text):
                pron_templates.append(pron_t)
                return f"__PRON_TEMPLATE_{len(pron_templates)-1}__"
        return text
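
    # E.g. a line "* (UK) IPA⁽ᵏᵉʸ⁾: /fuː/" (assuming the whole expansion
    # came from one {{IPA}} template) becomes roughly "* __PRON_TEMPLATE_0__"
    # in the cleaned text, and the extracted data waits in pron_templates[0]
    # for the line loop further below.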

    def parse_expanded_zh_pron(
        node: WikiNode,
        parent_hdrs: list[str],
        specific_hdrs: list[str],
        unknown_header_tags: set[str],
    ) -> None:
        def generate_pron(
            v, new_parent_hdrs: list[str], new_specific_hdrs: list[str]
        ) -> Optional[SoundData]:
            pron: SoundData = {}
            pron["tags"] = []
            pron["zh-pron"] = v.strip()
            for hdr in new_parent_hdrs + new_specific_hdrs:
                hdr = hdr.strip()
                valid_hdr = re.sub(r"\s+", "-", hdr)
                if hdr in ZH_PRON_TAGS:
                    for tag in ZH_PRON_TAGS[hdr]:
                        if tag not in pron["tags"]:
                            pron["tags"].append(tag)
                elif valid_hdr in valid_tags:
                    if valid_hdr not in pron["tags"]:
                        pron["tags"].append(valid_hdr)
                else:
                    unknown_header_tags.add(hdr)
            # convert into normal IPA format if it has the IPA flag
            if "IPA" in pron["tags"]:
                pron["ipa"] = v
                del pron["zh-pron"]
                pron["tags"].remove("IPA")
            # convert into IPA but retain the Sinological-IPA tag
            elif "Sinological-IPA" in pron["tags"]:
                pron["ipa"] = v
                del pron["zh-pron"]

            if not (pron.get("zh-pron") or pron.get("ipa")):
                return None
            return pron
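
        # Illustrative sketch (header names assumed): if the accumulated
        # headers were ["Mandarin", "Sinological-IPA"] and both map to tags
        # of the same name, generate_pron("/ʈ͡ʂʊŋ⁵⁵/", ...) would return
        # {"ipa": "/ʈ͡ʂʊŋ⁵⁵/", "tags": ["Mandarin", "Sinological-IPA"]}.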
        if isinstance(node, list):
            for item in node:
                parse_expanded_zh_pron(
                    item, parent_hdrs, specific_hdrs, unknown_header_tags
                )
            return
        if not isinstance(node, WikiNode):
            return
        if node.kind != NodeKind.LIST:
            for item in node.children:
                parse_expanded_zh_pron(
                    item, parent_hdrs, specific_hdrs, unknown_header_tags
                )
            return
        for item in node.children:
            assert isinstance(item, WikiNode)
            assert item.kind == NodeKind.LIST_ITEM
            base_item = list(
                x
                for x in item.children
                if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
            )
            text = clean_node(wxr, None, base_item)
            # print(f"{parent_hdrs} zhpron: {text}")  # XXX remove me
            text = re.sub(r"(?s)\(Note:.*?\)", "", text)
            # Kludge to clean up text like
            # '(Standard Chinese, erhua-ed) (旋兒/旋儿)' where
            # the hanzi are examples
            hanzi_m = re.match(r"\s*(\([^()]*\))\s*\(([^()]*)\)\s*$", text)
            if hanzi_m:
                if re.search("[\u4e00-\u9fff]", hanzi_m.group(2)):
                    text = hanzi_m.group(1)
            new_parent_hdrs = list(parent_hdrs)
            new_specific_hdrs = list(specific_hdrs)
            # look no further, here be dragons...

429 if ": " in text or ":" in text:
430 parts = re.split(r": |:", text)
431 m = re.match(
432 r"\s*\((([^():]+)\s*(:|:)?\s*([^():]*))\)\s*$", text
433 )
                # Matches lines with stuff like "(Hokkien: Xiamen, Quanzhou)"
                # thrown into new_parent_hdrs
                if m:
                    new_parent_hdrs.append(m.group(2).strip())
                    for hdr in m.group(4).split(","):
                        new_specific_hdrs.append(hdr.strip())
                else:
                    # if "Zhangzhou" in text:
                    #     print("\nFOUND IN:", text, "\n")
                    #     print("PARTS: ", repr(parts))
                    # print(f"    PARTS: {parts}")
                    extra_tags = parts[0]
                    # Kludge to handle how (Hokkien: Locations) and
                    # IPA (Specific Location) interact; this is why
                    # specific_hdrs was introduced: to separate the actual
                    # hierarchical higher-level tags (Min'nan, Hokkien,
                    # etc.), which should always be present, from the list
                    # of misc sublocations and subdialects that can be
                    # overridden by more specific stuff later.
                    m = re.match(r"\s*IPA\s*\((.*)\)\s*$", extra_tags)
                    if m:
                        new_parent_hdrs.append("IPA")
                        new_specific_hdrs = [
                            s.strip() for s in m.group(1).split(",")
                        ]
                        extra_tags = extra_tags[m.end() :]

                    m = re.match(r"\s*\([^()]*,[^()]*\)\s*$", extra_tags)
                    if m:
                        extra_tags = extra_tags.strip()[1:-1]  # remove parens
                        new_parent_hdrs.extend(
                            s.strip() for s in extra_tags.split(",")
                        )
                    elif extra_tags:
                        new_parent_hdrs.append(extra_tags)
472 v = ":".join(parts[1:])
474 # check for phrases
475 if ("," in (wxr.wtp.title or "")) and len(
476 v.split(" ")
477 ) + v.count(",") == len(wxr.wtp.title or ""):
478 # This just captures exact matches where you have
479 # the pronunciation of the whole phrase and nothing
480 # else. Split on spaces, then because we're not
481 # splitting next to a comma we need to add the
482 # count of commas so that it synchs up with the
483 # unicode string length of the original hanzi,
484 # where the comma is a separate character (unlike
485 # in the split list, where it's part of a space-
486 # separated string, like "teo⁴,".
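                    # Illustrative count (assumed values): for the title
                    # "你好,世界" (5 characters including the comma) and
                    # v = "nǐ hǎo, shì jiè", len(v.split(" ")) is 4 and
                    # v.count(",") is 1, so 4 + 1 == 5 and the branch runs.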
                    vals = [v]
                    pron = generate_pron(
                        v, new_parent_hdrs, new_specific_hdrs
                    )

                    if pron:
                        pron["tags"] = list(sorted(pron["tags"]))
                        if pron not in data.get("sounds", ()):
                            data_append(data, "sounds", pron)
                elif "→" in v:
                    vals = re.split("→", v)
                    for v in vals:
                        pron = generate_pron(
                            v, new_parent_hdrs, new_specific_hdrs
                        )
                        if pron:
                            m = re.match(
                                r"([^()]+)\s*\(toneless"
                                r" final syllable variant\)\s*",
                                v,
                            )
                            if m:
                                pron["zh-pron"] = m.group(1).strip()
                                pron["tags"].append(
                                    "toneless-final-syllable-variant"
                                )

                            pron["tags"] = list(sorted(pron["tags"]))
                            if pron not in data.get("sounds", ()):
                                data_append(data, "sounds", pron)
                else:
                    # split alternative pronunciations separated
                    # by "," or " / "
                    vals = re.split(r"\s*,\s*|\s+/\s+", v)
                    new_vals = []
                    for v2 in vals:
                        if v2.startswith("/") and v2.endswith("/"):
                            # items like /kɛiŋ²¹³⁻⁵³ ^((Ø-))ŋuaŋ³³/
                            new_vals.append(v2)
                        else:
                            # split on parentheses otherwise
                            new_vals.extend(re.split(r"[()]", v2))
                    vals = new_vals
                    for v in vals:
                        pron = generate_pron(
                            v, new_parent_hdrs, new_specific_hdrs
                        )
                        if pron:
                            pron["tags"] = list(sorted(pron["tags"]))
                            if pron not in data.get("sounds", ()):
                                data_append(data, "sounds", pron)
            else:
                new_parent_hdrs.append(text)

            for x in item.children:
                if isinstance(x, WikiNode) and x.kind == NodeKind.LIST:
                    parse_expanded_zh_pron(
                        x, new_parent_hdrs, specific_hdrs, unknown_header_tags
                    )

    def parse_chinese_pron(
        contents: Union[list[Union[WikiNode, str]], WikiNode, str],
        unknown_header_tags: set[str],
    ) -> None:
        if isinstance(contents, list):
            for item in contents:
                parse_chinese_pron(item, unknown_header_tags)
            return
        if not isinstance(contents, WikiNode):
            return
        if contents.kind != NodeKind.TEMPLATE:
            for item in contents.children:
                parse_chinese_pron(item, unknown_header_tags)
            return
        if (
            len(contents.largs[0]) == 1
            and isinstance(contents.largs[0][0], str)
            and contents.largs[0][0].strip() == "zh-pron"
        ):
            src = wxr.wtp.node_to_wikitext(contents)
            expanded = wxr.wtp.expand(src, templates_to_expand={"zh-pron"})
            parsed = wxr.wtp.parse(expanded)
            parse_expanded_zh_pron(parsed, [], [], unknown_header_tags)
        else:
            for item in contents.children:
                parse_chinese_pron(item, unknown_header_tags)
        return

    if lang_code == "zh":
        unknown_header_tags: set[str] = set()
        parse_chinese_pron(contents, unknown_header_tags)
        for hdr in unknown_header_tags:
            wxr.wtp.debug(
                f"Zh-pron header not found in zh_pron_tags or tags: "
                f"{repr(hdr)}",
                sortid="pronunciations/296/20230324",
            )

    def flattened_tree(
        lines: list[Union[WikiNode, str]],
    ) -> Iterator[Union[WikiNode, str]]:
        assert isinstance(lines, list)
        for line in lines:
            yield from flattened_tree1(line)

    def flattened_tree1(
        node: Union[WikiNode, str],
    ) -> Iterator[Union[WikiNode, str]]:
        assert isinstance(node, (WikiNode, str))
        if isinstance(node, str):
            yield node
            return
        elif node.kind == NodeKind.LIST:
            for item in node.children:
                yield from flattened_tree1(item)
        elif node.kind == NodeKind.LIST_ITEM:
            new_children = []
            sublist = None
            for child in node.children:
                if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                    sublist = child
                else:
                    new_children.append(child)
            node.children = new_children
            node.sarg = "*"
            yield node
            if sublist:
                yield from flattened_tree1(sublist)
        else:
            yield node
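
    # Illustrative flattening (sketch): a LIST_ITEM for "* foo" that
    # contains a sublist "** bar" is yielded as two consecutive LIST_ITEM
    # nodes, first "foo" (sublist detached, sarg reset to "*"), then "bar".
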
    # XXX Do not use flattened_tree more than once here, for example for
    # debug printing... The underlying data is changed, and the separated
    # sublists disappear.

    # Kludge for templates that generate several lines, but haven't
    # been caught by earlier kludges...
    def split_cleaned_node_on_newlines(
        contents: list[Union[WikiNode, str]],
    ) -> Iterator[str]:
        for litem in flattened_tree(contents):
            ipa_text = clean_node(
                wxr,
                data,
                litem,
                template_fn=parse_pronunciation_template_fn,
                post_template_fn=parse_pron_post_template_fn,
            )
            for line in ipa_text.splitlines():
                yield line

    # have_pronunciations = False
    active_pos: Optional[str] = None

    for line in split_cleaned_node_on_newlines(contents):
        # print(f"{line=}")
        prefix: Optional[str] = None
        earlier_base_data: Optional[SoundData] = None
        if not line:
            continue

        split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line)
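        # E.g. "__PRON_TEMPLATE_0__, __PRON_TEMPLATE_1__" splits into
        # ["", "0", ", ", "1", ""]: the captured indices land at the odd
        # positions, which the i % 2 == 1 test below relies on.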
        for i, text in enumerate(split_templates):
            if not text:
                continue
            # clean up stars at the start of the line
            text = re.sub(r"^\**\s*", "", text).strip()
            if i == 0:
                # At the start of a line, check for stuff like "Noun:"
                # for active_pos; active_pos is a temporary data field
                # given to each saved SoundData entry which is later
                # used to sort the entries into their respective PoSes.
                m = re.match(r"\s*(\w+\s?\w*)\s*:?\s*", text)
                if m:
                    if (m_lower := m.group(1).lower()) in part_of_speech_map:
                        active_pos = part_of_speech_map[m_lower]["pos"]
                        text = text[m.end() :].strip()
            if not text:
                continue
            if i % 2 == 1:
                # re.split (with capture groups) splits the line so that
                # the captured splitters sit at every odd index; the even
                # entries are the (possibly empty) text around them.
                base_pron_data, first_prons = pron_templates[int(text)]
                if base_pron_data:
                    earlier_base_data = base_pron_data
                    # print(f"Set {earlier_base_data=}")
                elif earlier_base_data is not None:
                    # merge data from an earlier iteration of this loop
                    for pr in first_prons:
                        if "note" in pr and "note" in earlier_base_data:
                            pr["note"] += ";" + earlier_base_data.get(
                                "note", ""
                            )
                        elif "note" in earlier_base_data:
                            pr["note"] = earlier_base_data["note"]
                        if "topics" in earlier_base_data:
                            data_extend(
                                pr, "topics", earlier_base_data["topics"]
                            )
                        if "tags" in pr and "tags" in earlier_base_data:
                            pr["tags"].extend(earlier_base_data["tags"])
                        elif "tags" in earlier_base_data:
                            pr["tags"] = sorted(set(earlier_base_data["tags"]))
                for pr in first_prons:
                    if active_pos:
                        pr["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if pr not in data.get("sounds", ()):
                        data_append(data, "sounds", pr)
                # This bit is handled
                continue
699 if "IPA" in text:
700 field = "ipa"
701 else:
702 # This is used for Rhymes, Homophones, etc
703 field = "other"
705 # Check if it contains Japanese "Tokyo" pronunciation with
706 # special syntax
707 m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text)
708 if m: 708 ↛ 709line 708 didn't jump to line 709 because the condition on line 708 was never true
709 pron: SoundData = {field: m.group(1)} # type: ignore[misc]
710 if active_pos:
711 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key]
712 data_append(data, "sounds", pron)
713 # have_pronunciations = True
714 continue
716 # Check if it contains Rhymes
717 m = re.match(r"\s*Rhymes?: (.*)", text)
718 if m:
719 for ending in split_at_comma_semi(m.group(1)):
720 ending = ending.strip()
721 if ending: 721 ↛ 719line 721 didn't jump to line 719 because the condition on line 721 was always true
722 pron = {"rhymes": ending}
723 if active_pos: 723 ↛ 725line 723 didn't jump to line 725 because the condition on line 723 was always true
724 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key]
725 data_append(data, "sounds", pron)
726 # have_pronunciations = True
727 continue
729 # Check if it contains homophones
730 m = re.search(r"(?m)\bHomophones?: (.*)", text)
731 if m:
732 for w in split_at_comma_semi(m.group(1)):
733 w = w.strip()
734 if w: 734 ↛ 732line 734 didn't jump to line 732 because the condition on line 734 was always true
735 pron = {"homophone": w}
736 if active_pos: 736 ↛ 738line 736 didn't jump to line 738 because the condition on line 736 was always true
737 pron["pos"] = active_pos # type: ignore[typeddict-unknown-key]
738 data_append(data, "sounds", pron)
739 # have_pronunciations = True
740 continue

            # Check if it contains Phonetic hangeul
            m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text)
            if m:
                seen = set()
                for w in m.group(1).split("/"):
                    w = w.strip()
                    if w and w not in seen:
                        seen.add(w)
                        pron = {"hangeul": w}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True

            m = re.search(r"\b(Syllabification|Hyphenation): ([^\s,]*)", text)
            if m:
                data_append(data, "hyphenation", m.group(2))
                # have_pronunciations = True

            # See if it contains a word prefix restricting which forms the
            # pronunciation applies to (see amica/Latin) and/or parenthesized
            # tags.
            m = re.match(
                r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text
            )
            if m:
                prefix = m.group(2) or ""
                tagstext = m.group(3)
                text = text[m.end() :]
            else:
                m = re.match(r"^[*#\s]*([-\w]+):\s+", text)
                if m:
                    prefix = m.group(1)
                    tagstext = ""
                    text = text[m.end() :]
                else:
                    # Spanish has tags before pronunciations, e.g.
                    # aceite/Spanish
                    m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text)
                    if m:
                        tagstext = m.group(1)
                        text = m.group(2)
                    else:
                        # No prefix. In this case, we inherit the prefix
                        # from the previous entry. This particularly
                        # applies to nested Audio files.
                        tagstext = ""
            if tagstext:
                earlier_base_data = {}
                parse_pronunciation_tags(wxr, tagstext, earlier_base_data)

            # Find romanizations from the pronunciation section (routinely
            # produced for Korean by {{ko-IPA}})
            for m in re.finditer(pron_romanization_re, text):
                prefix = m.group(1)
                w = m.group(2).strip()
                tag = pron_romanizations[prefix]
                form = {"form": w, "tags": tag.split()}
                data_append(data, "forms", form)
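            # E.g. a line containing " Revised Romanization hangug-eo"
            # (assumed {{ko-IPA}} output) would append
            # {"form": "hangug-eo", "tags": ["romanization", "revised"]}.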

            # Find IPA pronunciations
            for m in re.finditer(
                r"(?m)/[^][\n/,]+?/" r"|" r"\[[^]\n0-9,/][^],/]*?\]", text
            ):
                v = m.group(0)
                # The regexp above can match file links. Skip them.
                if v.startswith("[[File:"):
                    continue
                if v == "/wiki.local/":
                    continue
                if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text:
                    m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text)
                    assert m
                    idx = int(m.group(1))
                    if not audios[idx].get("audio-ipa"):
                        audios[idx]["audio-ipa"] = v
                    if prefix:
                        audios[idx]["form"] = prefix
                else:
                    if earlier_base_data:
                        pron = deepcopy(earlier_base_data)
                        pron[field] = v
                    else:
                        pron = {field: v}  # type: ignore[misc]
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if prefix:
                        pron["form"] = prefix
                    if active_pos:
                        pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    data_append(data, "sounds", pron)
            # have_pronunciations = True

            # XXX what about {{hyphenation|...}} and {{hyph|...}}?
            # Those used to be stored under "hyphenation".

    # Add data that was collected in template_fn
    for audio in audios:
        if "audio" in audio:
            # Compute audio file URLs
            fn = audio["audio"]
            # Strip certain characters, e.g., left-to-right mark
            fn = re.sub(r"[\u200f\u200e]", "", fn)
            fn = fn.strip()
            fn = urllib.parse.unquote(fn)
            # First character is usually uppercased
            if re.match(r"^[a-z][a-z]+", fn):
                fn = fn[0].upper() + fn[1:]
            if fn in wxr.config.redirects:
                fn = wxr.config.redirects[fn]
            # File extension is lowercased
            # XXX some words seem to need this, some don't seem to
            # have this??? what is the exact rule?
            # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn)
            # Spaces are converted to underscores
            fn = re.sub(r"\s+", "_", fn)
            # Compute hash digest part
            h = hashlib.md5()
            hname = fn.encode("utf-8")
            h.update(hname)
            digest = h.hexdigest()
            # Quote filename for URL
            qfn = urllib.parse.quote(fn)
            # For safety when writing files
            qfn = qfn.replace("/", "__slash__")
            if re.search(r"(?i)\.(ogg|oga)$", fn):
                ogg = (
                    "https://upload.wikimedia.org/wikipedia/"
                    "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
                )
            else:
                ogg = (
                    "https://upload.wikimedia.org/wikipedia/"
                    "commons/transcoded/"
                    "{}/{}/{}/{}.ogg".format(
                        digest[:1], digest[:2], qfn, qfn
                    )
                )
            if re.search(r"(?i)\.(mp3)$", fn):
                mp3 = (
                    "https://upload.wikimedia.org/wikipedia/"
                    "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
                )
            else:
                mp3 = (
                    "https://upload.wikimedia.org/wikipedia/"
                    "commons/transcoded/"
                    "{}/{}/{}/{}.mp3".format(
                        digest[:1], digest[:2], qfn, qfn
                    )
                )
            audio["ogg_url"] = ogg
            audio["mp3_url"] = mp3
            if active_pos:
                audio["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
        if audio not in data.get("sounds", ()):
            data_append(data, "sounds", audio)
    # if audios:
    #     have_pronunciations = True
    audios = []
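
    # Illustrative sketch of the Commons URL scheme above (filename
    # assumed): for fn = "En-us-foo.ogg" and
    # d = hashlib.md5(fn.encode("utf-8")).hexdigest(), the direct URL is
    #   https://upload.wikimedia.org/wikipedia/commons/<d[:1]>/<d[:2]>/En-us-foo.ogg
    # while a non-Ogg original instead gets a transcoded URL ending in
    # <qfn>/<qfn>.ogg.
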
    ## I have commented out the otherwise unused have_pronunciations
    ## toggles; uncomment them to use this debug print
    # if not have_pronunciations and not have_panel_templates:
    #     wxr.wtp.debug("no pronunciations found from pronunciation section",
    #                   sortid="pronunciations/533")