Coverage for src/wiktextract/extractor/en/pronunciation.py: 80%
633 statements
import hashlib
import re
import urllib
from copy import deepcopy
from dataclasses import dataclass
from typing import Iterator

from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...clean import clean_value
from ...datautils import data_append, data_extend, split_at_comma_semi
from ...page import LEVEL_KINDS, clean_node, is_panel_template
from ...tags import valid_tags
from ...wxr_context import WiktextractContext
from ..share import create_audio_url_dict
from .form_descriptions import (
    classify_desc,
    decode_tags,
    parse_pronunciation_tags,
)
from .parts_of_speech import part_of_speech_map
from .type_utils import Hyphenation, SoundData, TemplateArgs, WordData

# Prefixes, tags, and regexp for finding romanizations from the pronunciation
# section
pron_romanizations = {
    " Revised Romanization ": "romanization revised",
    " Revised Romanization (translit.) ": "romanization revised transliteration",
    " McCune-Reischauer ": "McCune-Reischauer romanization",
    " McCune–Reischauer ": "McCune-Reischauer romanization",
    " Yale Romanization ": "Yale romanization",
}
pron_romanization_re = re.compile(
    "(?m)^("
    + "|".join(
        re.escape(x)
        for x in sorted(pron_romanizations.keys(), key=len, reverse=True)
    )
    + ")([^\n]+)"
)
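
# Illustrative sketch (hypothetical line of {{ko-IPA}}-style output): on
# " Revised Romanization gugeo", group(1) is the prefix
# " Revised Romanization " and group(2) is the romanized form "gugeo".
# Sorting the prefixes longest-first keeps " Revised Romanization (translit.) "
# from being shadowed by its shorter variant.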

IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$"
IPA_EXTRACT_RE = re.compile(IPA_EXTRACT)
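# A minimal worked example (hypothetical input): matching
# "(UK) IPA⁽ᵏᵉʸ⁾: /fuː/ (rare)" gives group(2) == "UK" (the qualifier for
# the whole line), group(4) == "/fuː/ (rare)", group(5) == "/fuː/" and
# group(7) == "rare" (a possible post-qualifier of the last entry).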


def extract_pron_template(
    wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str
) -> tuple[SoundData, list[SoundData]] | None:
    """In post_template_fn, this is used to handle all enPR and IPA templates
    so that we can leave breadcrumbs in the text that can be handled later.
    We return a `base_data` so that if there are two or more templates on
    the same line, like this:
    (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/
    then we can apply base_data fields to the other templates, too, if
    needed.
    """
    cleaned = clean_value(wxr, expanded)
    # print(f"extract_pron_template input: {tname=} {expanded=} -> {cleaned=}")
    m = IPA_EXTRACT_RE.match(cleaned)
    if not m:
        wxr.wtp.error(
            f"Text cannot match IPA_EXTRACT_RE regex: "
            f"{cleaned=}, {tname=}, {targs=}",
            sortid="en/pronunciation/54",
        )
        return None
    # for i, group in enumerate(m.groups()):
    #     print(i + 1, repr(group))
    main_qual = m.group(2) or ""
    if "qq" in targs:
        # The template has been given a qualifier that applies to every
        # entry, but which also happens to appear at the end, where it can
        # be confused with the post-qualifier of a single entry in the
        # style of "... /ipa3/ (foo) (bar)": foo might not be present, so
        # bar looks like it might apply only to `/ipa3/`.
        pron_body = m.group(5)
        post_qual = m.group(7)
    else:
        pron_body = m.group(4)
        post_qual = ""

    if not pron_body:
        wxr.wtp.error(
            f"Regex failed to find 'body' from {cleaned=}",
            sortid="en/pronunciation/81",
        )
        return None

    base_data: SoundData = {}
    if main_qual:
        parse_pronunciation_tags(wxr, main_qual, base_data)
    if post_qual:
        parse_pronunciation_tags(wxr, post_qual, base_data)
    # This base_data is used as the base copy for all entries from this
    # template, but it is also returned so that its contents may be applied
    # to other templates on the same line.
    # print(f"{base_data=}")

    sound_datas: list[SoundData] = []

    parts: list[list[str]] = [[]]
    inside = 0
    current: list[str] = []
    for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)):
        # Split the line on commas and semicolons outside of parens. This
        # gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)"
        # print(f"  {i=}, {p=}")
        comp = p.strip()
        if not p:
            continue
        if comp == "(":
            if not inside and i > 0:
                if stripped := "".join(current).strip():
                    parts[-1].append(stripped)  # type:ignore[arg-type]
            current = [p]
            inside += 1
            continue
        if comp == ")":
            inside -= 1
            if not inside:
                if stripped := "".join(current).strip():
                    current.append(p)
                    parts[-1].append(
                        "".join(current).strip()  # type:ignore[arg-type]
                    )
                    current = []
            continue
        if not inside and comp in (",", ";"):
            if stripped := "".join(current).strip():
                parts[-1].append(stripped)  # type:ignore[arg-type]
            current = []
            parts.append([])
            continue
        current.append(p)
    if current:
        parts[-1].append("".join(current).strip())

    # print(f">>>>>> {parts=}")
    new_parts: list[list[str]] = []
    for entry in parts:
        if not entry:
            continue
        new_entry: list[str] = []
        i1: int = entry[0].startswith("(") and entry[0].endswith(")")
        if i1:
            new_entry.append(entry[0][1:-1].strip())
        else:
            new_entry.append("")
        i2: int = (
            entry[-1].startswith("(")
            and entry[-1].endswith(")")
            and len(entry) > 1
        )
        if i2 == 0:
            i2 = len(entry)
        else:
            i2 = -1
        new_entry.append("".join(entry[i1:i2]).strip())
        if not new_entry[-1]:
            wxr.wtp.error(
                f"Missing IPA/enPR sound data between qualifiers? {entry=}",
                sortid="en/pronunciation/153",
            )
        if i2 == -1:
            new_entry.append(entry[-1][1:-1].strip())
        else:
            new_entry.append("")
        new_parts.append(new_entry)

    # print(f">>>>> {new_parts=}")
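    # A worked example (hypothetical body): pron_body "(UK) /fuː/, /fɒ/ (rare)"
    # is first split into parts == [["(UK)", "/fuː/"], ["/fɒ/", "(rare)"]]
    # and then normalized here into new_parts == [["UK", "/fuː/", ""],
    # ["", "/fɒ/", "rare"]], i.e. (main-qualifier, body, post-qualifier)
    # triplets; i1 and i2 are bools used as slice indices to cut the
    # parenthesized qualifiers out of the body.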

    for part in new_parts:
        sd = deepcopy(base_data)
        if part[0]:
            parse_pronunciation_tags(wxr, part[0], sd)
        if part[2]:
            parse_pronunciation_tags(wxr, part[2], sd)
        if tname == "enPR":
            sd["enpr"] = part[1]
        else:
            sd["ipa"] = part[1]
        sound_datas.append(sd)

    # print(f"BASE_DATA: {base_data}")
    # print(f"SOUND_DATAS: {sound_datas=}")

    return base_data, sound_datas


def parse_pronunciation(
    wxr: WiktextractContext,
    level_node: LevelNode,
    data: WordData,
    etym_data: WordData,
    have_etym: bool,
    base_data: WordData,
    lang_code: str,
) -> None:
    """Parses the pronunciation section from a language section on a
    page."""
    if level_node.kind in LEVEL_KINDS:
        contents = []
        for node in level_node.children:
            if isinstance(node, TemplateNode):
                if node.template_name == "th-pron":
                    extract_th_pron_template(wxr, data, node)
                elif node.template_name == "zh-pron":
                    extract_zh_pron_template(wxr, data, node)
                else:
                    contents.append(node)
            else:
                contents.append(node)
    else:
        contents = [level_node]
    # Remove subsections, such as Usage notes. They may contain IPAchar
    # templates in running text, and we do not want to extract IPAs from
    # those.
    # Filter out LEVEL_KINDS nodes; the 'or' does the heavy lifting here:
    # let through anything that is not a WikiNode, then let through
    # WikiNodes that are not LEVEL_KINDS.
    contents = [
        x
        for x in contents
        if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
    ]
    if not any(
        isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents
    ):
        # expand all templates
        new_contents: list[str | WikiNode] = []
        for lst in contents:
            if isinstance(lst, TemplateNode):
                temp = wxr.wtp.node_to_wikitext(lst)
                temp = wxr.wtp.expand(temp)
                temp_parsed = wxr.wtp.parse(temp)
                new_contents.extend(temp_parsed.children)
            else:
                new_contents.append(lst)
        contents = new_contents

    if have_etym and data is base_data:
        data = etym_data
    pron_templates: list[tuple[SoundData, list[SoundData]]] = []
    hyphenations: list[Hyphenation] = []
    audios = []
    have_panel_templates = False

    def parse_pronunciation_template_fn(
        name: str, ht: TemplateArgs
    ) -> str | None:
        """Handle pronunciation and hyphenation templates"""
        # _template_fn handles templates *before* they are expanded;
        # this allows for special handling before all the work needed
        # for expansion is done.
        nonlocal have_panel_templates
        if is_panel_template(wxr, name):
            have_panel_templates = True
            return ""
        if name == "audio":
            filename = ht.get(2) or ""
            desc = ht.get(3) or ""
            desc = clean_node(wxr, None, [desc])
            audio: SoundData = {"audio": filename.strip()}
            if desc:
                audio["text"] = desc
            m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc)
            skip = False
            if m:
                par = m.group(1)
                cls = classify_desc(par)
                if cls == "tags":
                    parse_pronunciation_tags(wxr, par, audio)
                else:
                    skip = True
            if skip:
                return ""
            audios.append(audio)
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name == "audio-IPA":
            filename = ht.get(2) or ""
            ipa = ht.get(3) or ""
            dial = ht.get("dial")
            audio = {"audio": filename.strip()}
            if dial:
                dial = clean_node(wxr, None, [dial])
                audio["text"] = dial
            if ipa:
                audio["audio-ipa"] = ipa
            audios.append(audio)
            # The problem with these IPAs is that they often just describe
            # what's in the sound file, rather than giving the pronunciation
            # of the word alone. It is common for audio files to contain
            # multiple pronunciations or articles in the same file, and then
            # this IPA often describes what is in the file.
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name == "audio-pron":
            filename = ht.get(2) or ""
            ipa = ht.get("ipa") or ""
            dial = ht.get("dial")
            country = ht.get("country")
            audio = {"audio": filename.strip()}
            if dial:
                dial = clean_node(wxr, None, [dial])
                audio["text"] = dial
                parse_pronunciation_tags(wxr, dial, audio)
            if country:
                parse_pronunciation_tags(wxr, country, audio)
            if ipa:
                audio["audio-ipa"] = ipa
            audios.append(audio)
            # XXX do we really want to extract pronunciations from these?
            # Or are they spurious / just describing what is in the
            # audio file?
            # if ipa:
            #     pron = {"ipa": ipa}
            #     if dial:
            #         parse_pronunciation_tags(wxr, dial, pron)
            #     if country:
            #         parse_pronunciation_tags(wxr, country, pron)
            #     data_append(data, "sounds", pron)
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name in ("hyph", "hyphenation"):
            # {{hyph|en|re|late|caption="Hyphenation UK:"}}
            # {{hyphenation|it|quiè|to||qui|è|to||quié|to||qui|é|to}}
            # and also nocaption=1
            caption = clean_node(wxr, None, ht.get("caption", ""))
            tagsets, _ = decode_tags(caption)
            # Flatten the tagsets into one; it would be really weird to have
            # several tagsets for a hyphenation caption
            tags = list(set(tag for tagset in tagsets for tag in tagset))
            # We'll just ignore any errors from the tags; they're not very
            # important for hyphenation
            tags = [tag for tag in tags if not tag.startswith("error")]
            hyph_sequences: list[list[str]] = [[]]
            for text in [
                t for (k, t) in ht.items() if (isinstance(k, int) and k >= 2)
            ]:
                if not text:
                    hyph_sequences.append([])
                else:
                    hyph_sequences[-1].append(clean_node(wxr, None, text))
            for seq in hyph_sequences:
                hyphenations.append(Hyphenation(parts=seq, tags=tags))
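            # Hedged worked example for the second template above: the
            # positional args from index 2 are "quiè", "to", "", "qui", "è",
            # "to", "", "quié", "to", "", "qui", "é", "to"; an empty arg
            # starts a new sequence, so hyph_sequences becomes
            # [["quiè", "to"], ["qui", "è", "to"], ["quié", "to"],
            # ["qui", "é", "to"]].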
            return ""
        return None

    def parse_pron_post_template_fn(
        name: str, ht: TemplateArgs, text: str
    ) -> str | None:
        # _post_template_fn handles templates *after* the work to expand
        # them has been done; this is exactly the same as _template_fn,
        # except with the additional expanded text as an input, and
        # possible side-effects from the expansion and recursion (like
        # calling other subtemplates that are handled in _template_fn).
        if is_panel_template(wxr, name):
            return ""
        if name in {
            "q",
            "qualifier",
            "sense",
            "a",
            "accent",
            "l",
            "link",
            "lb",
            "lbl",
            "label",
        }:
            # Kludge: when these templates expand to /.../ or [...],
            # replace the expansion by something safe. This is used
            # to filter spurious IPA-looking expansions that aren't really
            # IPAs. We probably don't care about these templates in the
            # contexts where they expand to something containing these.
            v = re.sub(r'href="[^"]*"', "", text)  # Ignore URLs
            v = re.sub(r'src="[^"]*"', "", v)
            if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v):
                # Note: replacing by empty results in Lua errors that we
                # would rather not have. For example, voi/Middle Vietnamese
                # uses {{a|{{l|vi|...}}}}, and the {{a|...}} will fail
                # if {{l|...}} returns empty.
                return "stripped-by-parse_pron_post_template_fn"
        if name in ("IPA", "enPR"):
            # Extract the data from IPA and enPR templates (same underlying
            # template) and replace them in-text with a magic cookie that
            # can later be used to refer to the data's index inside
            # pron_templates.
            if pron_t := extract_pron_template(wxr, name, ht, text):
                pron_templates.append(pron_t)
                return f"__PRON_TEMPLATE_{len(pron_templates) - 1}__"
        return text

    def flattened_tree(lines: list[WikiNode | str]) -> Iterator[WikiNode | str]:
        assert isinstance(lines, list)
        for line in lines:
            yield from flattened_tree1(line)

    def flattened_tree1(node: WikiNode | str) -> Iterator[WikiNode | str]:
        assert isinstance(node, (WikiNode, str))
        if isinstance(node, str):
            yield node
            return
        elif node.kind == NodeKind.LIST:
            for item in node.children:
                yield from flattened_tree1(item)
        elif node.kind == NodeKind.LIST_ITEM:
            new_children = []
            sublist = None
            for child in node.children:
                if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                    sublist = child
                else:
                    new_children.append(child)
            node.children = new_children
            node.sarg = "*"
            yield node
            if sublist:
                yield from flattened_tree1(sublist)
        else:
            yield node

    # XXX Do not use flattened_tree more than once here, for example for
    # debug printing... The underlying data is changed, and the separated
    # sublists disappear.
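    # Hedged sketch of the flattening (assumed wikitext shape): a nested list
    #   * /ipa1/
    #   ** (UK) /ipa2/
    # is yielded as the outer LIST_ITEM (its sublist detached, sarg reset to
    # "*") followed by the sublist's items, so every item comes out as one
    # flat line for the per-line loop below.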

    # Kludge for templates that generate several lines, but haven't
    # been caught by earlier kludges...
    def split_cleaned_node_on_newlines(
        contents: list[WikiNode | str],
    ) -> Iterator[str]:
        for litem in flattened_tree(contents):
            ipa_text = clean_node(
                wxr,
                data,
                litem,
                template_fn=parse_pronunciation_template_fn,
                post_template_fn=parse_pron_post_template_fn,
            )
            for line in ipa_text.splitlines():
                yield line

    # have_pronunciations = False
    active_pos: str | None = None

    for line in split_cleaned_node_on_newlines(contents):
        # print(f"{line=}")
        prefix: str | None = None
        earlier_base_data: SoundData | None = None
        if not line:
            continue

        split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line)
        for i, text in enumerate(split_templates):
            if not text:
                continue
            # clean up stars at the start of the line
            text = re.sub(r"^\**\s*", "", text).strip()
            if i == 0:
                # At the start of a line, check for stuff like "Noun:"
                # for active_pos; active_pos is a temporary data field
                # given to each saved SoundData entry which is later
                # used to sort the entries into their respective PoSes.
                m = re.match(r"\s*(\w+\s?\w*)\s*:?\s*", text)
                if m:
                    if (m_lower := m.group(1).lower()) in part_of_speech_map:
                        active_pos = part_of_speech_map[m_lower]["pos"]
                        text = text[m.end() :].strip()
            if not text:
                continue
            if i % 2 == 1:
                # re.split with a capture group interleaves the pieces:
                # odd-indexed entries are the captured splitters, while
                # even-indexed entries are empty strings or the text around
                # the splitters.
                base_pron_data, first_prons = pron_templates[int(text)]
                if base_pron_data:
                    earlier_base_data = base_pron_data
                    # print(f"Set {earlier_base_data=}")
                elif earlier_base_data is not None:
                    # merge data from an earlier iteration of this loop
                    for pr in first_prons:
                        if "note" in pr and "note" in earlier_base_data:
                            pr["note"] += ";" + earlier_base_data.get(
                                "note", ""
                            )
                        elif "note" in earlier_base_data:
                            pr["note"] = earlier_base_data["note"]
                        if "topics" in earlier_base_data:
                            data_extend(
                                pr, "topics", earlier_base_data["topics"]
                            )
                        if "tags" in pr and "tags" in earlier_base_data:
                            pr["tags"].extend(earlier_base_data["tags"])
                        elif "tags" in earlier_base_data:
                            pr["tags"] = sorted(set(earlier_base_data["tags"]))
                for pr in first_prons:
                    if active_pos:
                        pr["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if pr not in data.get("sounds", ()):
                        data_append(data, "sounds", pr)
                # This bit is handled
                continue

            if "IPA" in text:
                field = "ipa"
            else:
                # This is used for Rhymes, Homophones, etc.
                field = "other"

            # Check if it contains Japanese "Tokyo" pronunciation with
            # special syntax
            m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text)
            if m:
                pron: SoundData = {field: m.group(1)}  # type: ignore[misc]
                if active_pos:
                    pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                data_append(data, "sounds", pron)
                # have_pronunciations = True
                continue

            # Check if it contains Rhymes
            m = re.match(r"\s*Rhymes?: (.*)", text)
            if m:
                for ending in split_at_comma_semi(m.group(1)):
                    ending = ending.strip()
                    if ending:
                        pron = {"rhymes": ending}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True
                continue

            # Check if it contains homophones
            m = re.search(r"(?m)\bHomophones?: (.*)", text)
            if m:
                for w in split_at_comma_semi(m.group(1)):
                    w = w.strip()
                    if w:
                        pron = {"homophone": w}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True
                continue

            # Check if it contains Phonetic hangeul
            m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text)
            if m:
                seen = set()
                for w in m.group(1).split("/"):
                    w = w.strip()
                    if w and w not in seen:
                        seen.add(w)
                        pron = {"hangeul": w}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True

            # This regex-based hyphenation detection is left as a backup
            m = re.search(r"\b(Syllabification|Hyphenation): *([^\n.]*)", text)
            if m:
                data_append(data, "hyphenation", m.group(2))
                commaseparated = m.group(2).split(",")
                if len(commaseparated) > 1:
                    for h in commaseparated:
                        # The second delimiter below looks like a dash, but
                        # it's actually U+2027 (decimal 8231), the
                        # hyphenation point. Add more delimiters here if
                        # needed.
                        parts = re.split(r"-|‧", h.strip())
                        data_append(
                            data, "hyphenations", Hyphenation(parts=parts)
                        )
                else:
                    data_append(
                        data,
                        "hyphenations",
                        Hyphenation(parts=m.group(2).split(sep="-")),
                    )
            # have_pronunciations = True

            # See if it contains a word prefix restricting which forms the
            # pronunciation applies to (see amica/Latin) and/or parenthesized
            # tags.
            m = re.match(
                r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text
            )
            if m:
                prefix = m.group(2) or ""
                tagstext = m.group(3)
                text = text[m.end() :]
            else:
                m = re.match(r"^[*#\s]*([-\w]+):\s+", text)
                if m:
                    prefix = m.group(1)
                    tagstext = ""
                    text = text[m.end() :]
                else:
                    # Spanish has tags before pronunciations, e.g.
                    # aceite/Spanish
                    m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text)
                    if m:
                        tagstext = m.group(1)
                        text = m.group(2)
                    else:
                        # No prefix. In this case, we inherit the prefix
                        # from the previous entry. This particularly
                        # applies to nested Audio files.
                        tagstext = ""
            if tagstext:
                earlier_base_data = {}
                parse_pronunciation_tags(wxr, tagstext, earlier_base_data)

            # Find romanizations from the pronunciation section (routinely
            # produced for Korean by {{ko-IPA}})
            for m in re.finditer(pron_romanization_re, text):
                prefix = m.group(1)
                w = m.group(2).strip()
                tag = pron_romanizations[prefix]
                form = {"form": w, "tags": tag.split()}
                data_append(data, "forms", form)

            # Find IPA pronunciations
            for m in re.finditer(
                r"(?m)/[^][\n/,]+?/|\[[^]\n0-9,/][^],/]*?\]", text
            ):
                v = m.group(0)
                # The regexp above can match file links. Skip them.
                if v.startswith("[[File:"):
                    continue
                if v == "/wiki.local/":
                    continue
                if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text:
                    m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text)
                    assert m
                    idx = int(m.group(1))
                    if idx >= len(audios):
                        continue
                    if not audios[idx].get("audio-ipa"):
                        audios[idx]["audio-ipa"] = v
                    if prefix:
                        audios[idx]["form"] = prefix
                else:
                    if earlier_base_data:
                        pron = deepcopy(earlier_base_data)
                        pron[field] = v
                    else:
                        pron = {field: v}  # type: ignore[misc]
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if prefix:
                        pron["form"] = prefix
                    if active_pos:
                        pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    data_append(data, "sounds", pron)
                    # have_pronunciations = True

            # XXX what about {{hyphenation|...}} and {{hyph|...}}?
            # Those used to be stored under "hyphenation".

        # Add data that was collected in template_fn
        for audio in audios:
            if "audio" in audio:
                # Compute audio file URLs
                fn = audio["audio"]
                # Strip certain characters, e.g., left-to-right mark
                fn = re.sub(r"[\u200f\u200e]", "", fn)
                fn = fn.strip()
                fn = urllib.parse.unquote(fn)
                # First character is usually uppercased
                if re.match(r"^[a-z][a-z]+", fn):
                    fn = fn[0].upper() + fn[1:]
                if fn in wxr.config.redirects:
                    fn = wxr.config.redirects[fn]
                # File extension is lowercased
                # XXX some words seem to need this, some don't seem to
                # have this??? what is the exact rule?
                # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn)
                # Spaces are converted to underscores
                fn = re.sub(r"\s+", "_", fn)
                # Compute hash digest part
                h = hashlib.md5()
                hname = fn.encode("utf-8")
                h.update(hname)
                digest = h.hexdigest()
                # Quote filename for URL
                qfn = urllib.parse.quote(fn)
                # For safety when writing files
                qfn = qfn.replace("/", "__slash__")
                if re.search(r"(?i)\.(ogg|oga)$", fn):
                    ogg = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
                    )
                else:
                    ogg = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/transcoded/"
                        "{}/{}/{}/{}.ogg".format(
                            digest[:1], digest[:2], qfn, qfn
                        )
                    )
                if re.search(r"(?i)\.(mp3)$", fn):
                    mp3 = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
                    )
                else:
                    mp3 = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/transcoded/"
                        "{}/{}/{}/{}.mp3".format(
                            digest[:1], digest[:2], qfn, qfn
                        )
                    )
                audio["ogg_url"] = ogg
                audio["mp3_url"] = mp3
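                # Sketch of the Commons URL scheme (hypothetical filename):
                # for fn == "En-us-example.ogg" with an MD5 hex digest
                # starting "ab...", the direct URL is
                # .../commons/a/ab/En-us-example.ogg and the transcoded
                # fallback is
                # .../commons/transcoded/a/ab/En-us-example.ogg/En-us-example.ogg.mp3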
                if active_pos:
                    audio["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
            if audio not in data.get("sounds", ()):
                data_append(data, "sounds", audio)

        # if audios:
        #     have_pronunciations = True
        audios = []

        data_extend(data, "hyphenations", hyphenations)
        hyphenations = []

    ## I have commented out the otherwise unused have_pronunciations
    ## toggles; uncomment them to use this debug print
    # if not have_pronunciations and not have_panel_templates:
    #     wxr.wtp.debug("no pronunciations found from pronunciation section",
    #                   sortid="pronunciations/533")


@dataclass
class TableHeader:
    text: str
    rowspan: int


def extract_th_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    # https://en.wiktionary.org/wiki/Template:th-pron
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(standard) IPA"):
                    field = "ipa"
                elif header_str.startswith("Homophones"):
                    field = "homophone"
                elif header_str == "Audio":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    row_headers.append(TableHeader(header_str, rowspan))

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = create_audio_url_dict(filename)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append({"homophone": word})
                else:
                    raw_tag = ""
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                raw_tag = node_str.strip("[]")
                            elif len(sounds) > 0:
                                sounds[-1]["roman"] = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = {field: node_str}
                                if raw_tag != "":
                                    if raw_tag in valid_tags:
                                        data_append(sound, "tags", raw_tag)
                                    else:
                                        data_append(sound, "raw_tags", raw_tag)
                                for header in row_headers:
                                    if header.text.lower() in valid_tags:
                                        data_append(
                                            sound, "tags", header.text.lower()
                                        )
                                    else:
                                        data_append(
                                            sound, "raw_tags", header.text
                                        )
                                sounds.append(sound)

    clean_node(wxr, word_entry, expanded_node)
    data_extend(word_entry, "sounds", sounds)


def extract_zh_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    # https://en.wiktionary.org/wiki/Template:zh-pron
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    seen_lists = set()
    sounds = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node not in seen_lists:
            for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                sounds.extend(
                    extract_zh_pron_list_item(wxr, list_item, [], seen_lists)
                )
    clean_node(wxr, word_entry, expanded_node)
    data_extend(word_entry, "sounds", sounds)


def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[SoundData]:
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            link_str = clean_node(wxr, None, node.largs)
            node_str = clean_node(wxr, None, node)
            if link_str.startswith("File:"):
                sound = create_audio_url_dict(link_str.removeprefix("File:"))
                sound["raw_tags"] = current_tags[:]
                translate_zh_pron_raw_tags(sound)
                sounds.append(sound)
            elif node_str != "":
                current_tags.append(node_str)
        elif isinstance(node, HTMLNode):
            if node.tag == "small":
                if is_first_small_tag:
                    raw_tag_text = clean_node(
                        wxr,
                        None,
                        [
                            n
                            for n in node.children
                            if not (isinstance(n, HTMLNode) and n.tag == "sup")
                        ],
                    )
                    current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                elif len(sounds) > 0:
                    data_extend(
                        sounds[-1],
                        "raw_tags",
                        split_zh_pron_raw_tag(clean_node(wxr, None, node)),
                    )
                    translate_zh_pron_raw_tags(sounds[-1])
                is_first_small_tag = False
            elif node.tag == "span":
                sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
            elif (
                node.tag == "table"
                and len(current_tags) > 0
                and current_tags[-1] == "Homophones"
            ):
                sounds.extend(
                    extract_zh_pron_homophone_table(wxr, node, current_tags)
                )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            seen_lists.add(node)
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                sounds.extend(
                    extract_zh_pron_list_item(
                        wxr, child_list_item, current_tags, seen_lists
                    )
                )

    return sounds
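
# Hedged sketch of a typical {{zh-pron}} list shape (assumed rendering):
#   * Mandarin
#   ** (Standard)
#   *** Pinyin: ...
# Each nesting level contributes its label ("Mandarin", "Standard", ...) to
# current_tags, and the innermost spans become SoundData entries carrying the
# accumulated tags.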


def extract_zh_pron_homophone_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            span_str = clean_node(wxr, None, span_tag)
            if (
                span_str not in ["", "/"]
                and span_lang != ""
                and span_class in ["Hant", "Hans", "Hani"]
            ):
                sound = {"homophone": span_str, "raw_tags": raw_tags[:]}
                if span_class == "Hant":
                    data_append(sound, "tags", "Traditional-Chinese")
                elif span_class == "Hans":
                    data_append(sound, "tags", "Simplified-Chinese")
                translate_zh_pron_raw_tags(sound)
                sounds.append(sound)

    return sounds


def translate_zh_pron_raw_tags(sound: SoundData):
    from .zh_pron_tags import ZH_PRON_TAGS

    raw_tags = []
    for raw_tag in sound.get("raw_tags", []):
        if raw_tag in ZH_PRON_TAGS:
            tr_tag = ZH_PRON_TAGS[raw_tag]
            if isinstance(tr_tag, str):
                data_append(sound, "tags", tr_tag)
            elif isinstance(tr_tag, list) and tr_tag not in sound.get(
                "tags", []
            ):
                data_extend(sound, "tags", tr_tag)
        elif raw_tag in valid_tags:
            if raw_tag not in sound.get("tags", []):
                data_append(sound, "tags", raw_tag)
        elif raw_tag not in raw_tags:
            raw_tags.append(raw_tag)

    if len(raw_tags) > 0:
        sound["raw_tags"] = raw_tags
    elif "raw_tags" in sound:
        del sound["raw_tags"]
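
# Hedged example (the actual mappings live in ZH_PRON_TAGS, assumed here):
# for {"zh_pron": "...", "raw_tags": ["Mandarin", "somedialect"]}, "Mandarin"
# would move into "tags" if ZH_PRON_TAGS or valid_tags recognizes it, while
# an unrecognized string like "somedialect" stays behind in "raw_tags".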


def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    raw_tags = []
    if "(" not in raw_tag_text:
        for raw_tag in re.split(r",|:|;| and ", raw_tag_text):
            raw_tag = raw_tag.strip().removeprefix("incl. ").strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
    else:
        processed_offsets = []
        for match in re.finditer(r"\([^()]+\)", raw_tag_text):
            processed_offsets.append((match.start(), match.end()))
            raw_tags.extend(
                split_zh_pron_raw_tag(
                    raw_tag_text[match.start() + 1 : match.end() - 1]
                )
            )
        not_processed = ""
        last_end = 0
        for start, end in processed_offsets:
            not_processed += raw_tag_text[last_end:start]
            last_end = end
        not_processed += raw_tag_text[last_end:]
        if not_processed != raw_tag_text:
            raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags
        else:
            raw_tags.append(not_processed)

    return raw_tags
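
# Hedged worked example (hypothetical tag text):
# split_zh_pron_raw_tag("Mandarin (Beijing, incl. Taiwan)") splits the
# parenthesized part into ["Beijing", "Taiwan"] (dropping the "incl. "
# prefix), then recurses on the remainder "Mandarin ", returning
# ["Mandarin", "Beijing", "Taiwan"].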


def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    sounds = []
    small_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            if "IPA" in span_tag.attrs.get("class", ""):
                sound = {"ipa": zh_pron, "raw_tags": raw_tags[:]}
            else:
                sound = {"zh_pron": zh_pron, "raw_tags": raw_tags[:]}
            if roman != "":
                sound["roman"] = roman
            sounds.append(sound)
    if len(sounds) > 0:
        data_extend(sounds[-1], "raw_tags", small_tags)
    if phonetic_pron != "":
        sound = {
            "zh_pron": phonetic_pron,
            "raw_tags": raw_tags[:] + ["Phonetic"],
        }
        if roman != "":
            sound["roman"] = roman
        sounds.append(sound)
    for sound in sounds:
        translate_zh_pron_raw_tags(sound)
    return sounds


def split_zh_pron(zh_pron: str) -> list[str]:
    # Split by commas and other symbols that are outside parentheses
    parentheses = 0
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            (c in [",", ";", "→"] or (c == "/" and not zh_pron.startswith("/")))
            and parentheses == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        elif c == "(":
            parentheses += 1
            pron += c
        elif c == ")":
            parentheses -= 1
            pron += c
        else:
            pron += c

    if pron.strip() != "":
        pron_list.append(pron.strip())
    return pron_list
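
# Hedged worked example (hypothetical input):
# split_zh_pron("ā, á (x, y); à") == ["ā", "á (x, y)", "à"]: the comma inside
# the parentheses does not split, and "/" splits only when the whole string
# does not itself start with "/" (so a bare IPA string like "/a/" stays
# whole).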