Coverage for src / wiktextract / extractor / en / pronunciation.py: 80%
668 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-05-04 10:35 +0000
1import hashlib
2import re
3import urllib
4from copy import deepcopy
5from dataclasses import dataclass
6from typing import Iterator, Literal
8from wikitextprocessor import (
9 HTMLNode,
10 LevelNode,
11 NodeKind,
12 TemplateNode,
13 WikiNode,
14)
16from ...clean import clean_value
17from ...datautils import data_append, data_extend, split_at_comma_semi
18from ...page import LEVEL_KINDS, clean_node, is_panel_template
19from ...tags import valid_tags
20from ...wxr_context import WiktextractContext
21from ..share import create_audio_url_dict
22from .form_descriptions import (
23 classify_desc,
24 decode_tags,
25 parse_pronunciation_tags,
26)
27from .parts_of_speech import part_of_speech_map
28from .type_utils import Hyphenation, SoundData, TemplateArgs, WordData
# Prefixes, tags, and regexp for finding romanizations from the
# pronunciation section.
pron_romanizations = {
    " Revised Romanization ": "romanization revised",
    " Revised Romanization (translit.) ": "romanization revised transliteration",
    " McCune-Reischauer ": "McCune-Reischauer romanization",
    " McCune–Reischauer ": "McCune-Reischauer romanization",
    " Yale Romanization ": "Yale romanization",
}
# Longest prefixes first so the alternation prefers the most specific match.
_romanization_alternatives = "|".join(
    re.escape(prefix)
    for prefix in sorted(pron_romanizations, key=len, reverse=True)
)
# Matches a known romanization prefix at the start of a line, capturing the
# prefix and the rest of the line.
pron_romanization_re = re.compile(
    "(?m)^(" + _romanization_alternatives + ")([^\n]+)"
)

# Captures: (2) optional leading parenthesized qualifier, (3) template label,
# (4) the whole pronunciation body, (5) body without trailing parenthetical,
# (7) optional trailing parenthesized qualifier.
IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$"
IPA_EXTRACT_RE = re.compile(IPA_EXTRACT)
def extract_pron_template(
    wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str
) -> tuple[SoundData, list[SoundData]] | None:
    """In post_template_fn, this is used to handle all enPR and IPA templates
    so that we can leave breadcrumbs in the text that can later be handled
    there. We return a `base_data` so that if there are two
    or more templates on the same line, like this:
    (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/
    then we can apply base_data fields to other templates, too, if needed.

    Returns None (after logging an error) when the expanded text does not
    match IPA_EXTRACT_RE or no pronunciation body can be found.
    """
    cleaned = clean_value(wxr, expanded)
    # print(f"extract_pron_template input: {tname=} {expanded=}-> {cleaned=}")
    m = IPA_EXTRACT_RE.match(cleaned)
    if not m:
        wxr.wtp.error(
            f"Text cannot match IPA_EXTRACT_RE regex: "
            f"{cleaned=}, {tname=}, {targs=}",
            sortid="en/pronunciation/54",
        )
        return None
    # for i, group in enumerate(m.groups()):
    #     print(i + 1, repr(group))
    # Group 2 is the optional parenthesized qualifier before "IPA:"/"enPR:".
    main_qual = m.group(2) or ""
    if "qq" in targs:
        # If the template has been given a qualifier that applies to
        # every entry, but which also happens to appear at the end
        # which can be confused with the post-qualifier of a single
        # entry in the style of "... /ipa3/ (foo) (bar)", where foo
        # might not be present so the bar looks like it only might
        # apply to `/ipa3/`
        pron_body = m.group(5)
        post_qual = m.group(7)
    else:
        pron_body = m.group(4)
        post_qual = ""

    if not pron_body:
        wxr.wtp.error(
            f"Regex failed to find 'body' from {cleaned=}",
            sortid="en/pronunciation/81",
        )
        return None

    # Qualifiers outside the body apply to every entry on the line.
    base_data: SoundData = {}
    if main_qual:
        parse_pronunciation_tags(wxr, main_qual, base_data)
    if post_qual:
        parse_pronunciation_tags(wxr, post_qual, base_data)
    # This base_data is used as the base copy for all entries from this
    # template, but it is also returned so that its contents may be applied
    # to other templates on the same line.
    # print(f"{base_data=}")

    sound_datas: list[SoundData] = []

    # Tokenize the body, tracking paren nesting depth in `inside` so that
    # commas/semicolons inside parentheses do not split entries.
    parts: list[list[str]] = [[]]
    inside = 0
    current: list[str] = []
    for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)):
        # Split the line on commas and semicolons outside of parens. This
        # gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)"
        # print(f"  {i=}, {p=}")
        comp = p.strip()
        if not p:
            continue
        if comp == "(":
            if not inside and i > 0:
                # Flush whatever preceded the opening paren as its own piece.
                if stripped := "".join(current).strip():
                    parts[-1].append("".join(current).strip())  # type:ignore[arg-type]
            current = [p]
            inside += 1
            continue
        if comp == ")":
            inside -= 1
            if not inside:
                # Closing the outermost paren: emit "(...)" as one piece.
                if stripped := "".join(current).strip():
                    current.append(p)
                    parts[-1].append("".join(current).strip())  # type:ignore[arg-type]
                    current = []
            continue
        if not inside and comp in (",", ";"):
            # Top-level separator: finish the current entry, start a new one.
            if stripped := "".join(current).strip():
                parts[-1].append(stripped)  # type:ignore[arg-type]
            current = []
            parts.append([])
            continue
        current.append(p)
    if current:
        parts[-1].append("".join(current).strip())

    # print(f">>>>>> {parts=}")
    # Normalize each entry into [pre-qualifier, body, post-qualifier].
    new_parts: list[list[str]] = []
    for entry in parts:
        if not entry:
            continue
        new_entry: list[str] = []
        # i1 is a bool used as an int slice index below (True == 1): it is 1
        # when the entry starts with a "(...)" pre-qualifier piece.
        i1: int = entry[0].startswith("(") and entry[0].endswith(")")
        if i1:
            new_entry.append(entry[0][1:-1].strip())
        else:
            new_entry.append("")
        # Similarly, i2 starts as a bool: entry ends with a "(...)"
        # post-qualifier (and is not the same piece as the pre-qualifier).
        i2: int = (
            entry[-1].startswith("(")
            and entry[-1].endswith(")")
            and len(entry) > 1
        )
        if i2 == 0:
            i2 = len(entry)
        else:
            i2 = -1
        # entry[i1:i2] is the body between the optional qualifiers.
        new_entry.append("".join(entry[i1:i2]).strip())
        if not new_entry[-1]:
            # NOTE(review): "enPRO" looks like a typo for "enPR" in this log
            # message; left untouched here.
            wxr.wtp.error(
                f"Missing IPA/enPRO sound data between qualifiers?{entry=}",
                sortid="en/pronunciation/153",
            )
        if i2 == -1:
            new_entry.append(entry[-1][1:-1].strip())
        else:
            new_entry.append("")
        new_parts.append(new_entry)

    # print(f">>>>> {new_parts=}")
    # Build one SoundData per entry, layering entry-specific qualifiers on
    # top of a copy of the line-wide base_data.
    for part in new_parts:
        sd = deepcopy(base_data)
        if part[0]:
            parse_pronunciation_tags(wxr, part[0], sd)
        if part[2]:
            parse_pronunciation_tags(wxr, part[2], sd)
        if tname == "enPR":
            sd["enpr"] = part[1]
        else:
            sd["ipa"] = part[1]
        sound_datas.append(sd)

    # print(f"BASE_DATA: {base_data}")
    # print(f"SOUND_DATAS: {sound_datas=}")

    return base_data, sound_datas
def parse_pronunciation(
    wxr: WiktextractContext,
    level_node: LevelNode,
    data: WordData,
    etym_data: WordData,
    have_etym: bool,
    base_data: WordData,
    lang_code: str,
) -> None:
    """Parses the pronunciation section from a language section on a
    page.

    Extracted sounds are appended to ``data["sounds"]`` and hyphenations to
    ``data["hyphenations"]``; nothing is returned.  When `have_etym` is set
    and `data` is still the per-language `base_data`, results go into
    `etym_data` instead.  `lang_code` is currently unused in this function.
    """
    if level_node.kind in LEVEL_KINDS:
        # Collect the section's children, diverting {{th-pron}}/{{zh-pron}}
        # to their dedicated extractors.
        contents: list[str | WikiNode | TemplateNode] = []
        for node in level_node.children:
            if isinstance(node, TemplateNode):
                if node.template_name == "th-pron":
                    extract_th_pron_template(wxr, data, node)
                elif node.template_name == "zh-pron":
                    extract_zh_pron_template(wxr, data, node)
                else:
                    contents.append(node)
            else:
                contents.append(node)
    else:
        contents = [level_node]
    # Remove subsections, such as Usage notes. They may contain IPAchar
    # templates in running text, and we do not want to extract IPAs from
    # those.
    # Filter out only LEVEL_KINDS; 'or' is doing heavy lifting here
    # Slip through not-WikiNodes, then slip through WikiNodes that
    # are not LEVEL_KINDS.
    contents = [
        x
        for x in contents
        if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
    ]
    if not any(
        isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents
    ):
        # expand all templates
        new_contents: list[str | WikiNode | TemplateNode] = []
        for lst in contents:
            if isinstance(lst, TemplateNode):
                temp = wxr.wtp.node_to_wikitext(lst)
                temp = wxr.wtp.expand(temp)
                temp_parsed = wxr.wtp.parse(temp)
                new_contents.extend(temp_parsed.children)
            else:
                new_contents.append(lst)
        contents = new_contents

    if have_etym and data is base_data:
        data = etym_data
    # Collected by the template callbacks below and consumed by the
    # per-line loop at the bottom of this function.
    pron_templates: list[tuple[SoundData, list[SoundData]]] = []
    hyphenations: list[Hyphenation] = []
    audios: list[SoundData] = []
    have_panel_templates = False

    def parse_pronunciation_template_fn(
        name: str, ht: TemplateArgs
    ) -> str | None:
        """Handle pronunciation and hyphenation templates"""
        # _template_fn handles templates *before* they are expanded;
        # this allows for special handling before all the work needed
        # for expansion is done.
        nonlocal have_panel_templates
        if is_panel_template(wxr, name):
            have_panel_templates = True
            return ""
        if name == "audio":
            filename = ht.get(2) or ""
            audio: SoundData = {"audio": filename.strip()}
            dialect = ht.get("a", "")
            if "aa" in ht:
                dialect += ", " + ht.get("aa", "")
            if dialect:
                dialect = dialect.replace("<", "").replace(">", "")
                dialect = clean_node(wxr, None, [dialect])
                for part in split_at_comma_semi(dialect):
                    if "(" not in part:
                        parse_pronunciation_tags(wxr, part, audio)
                    else:
                        # Split out of brackets/parens before tag parsing.
                        for ppart in re.split(r"[][()]", part):
                            parse_pronunciation_tags(wxr, ppart, audio)
            desc = ht.get(3) or ""
            desc = clean_node(wxr, None, [desc])
            if desc:
                audio["text"] = desc
                m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc)
                skip = False
                if m:
                    par = m.group(1)
                    cls = classify_desc(par)
                    if cls == "tags":
                        parse_pronunciation_tags(wxr, par, audio)
                    else:
                        # Parenthesized text that is not tags: drop the
                        # whole audio entry.
                        skip = True
                if skip:
                    return ""
            audios.append(audio)
            # Leave a cookie in the text referencing this audio entry by
            # index so the IPA scan below can attach data to it.
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name == "audio-IPA":
            filename = ht.get(2) or ""
            ipa = ht.get(3) or ""
            dial = ht.get("dial")
            audio = {"audio": filename.strip()}
            if dial:
                dial = clean_node(wxr, None, [dial])
                audio["text"] = dial
            if ipa:
                audio["audio-ipa"] = ipa
            audios.append(audio)
            # The problem with these IPAs is that they often just describe
            # what's in the sound file, rather than giving the pronunciation
            # of the word alone. It is common for audio files to contain
            # multiple pronunciations or articles in the same file, and then
            # this IPA often describes what is in the file.
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name == "audio-pron":
            filename = ht.get(2) or ""
            ipa = ht.get("ipa") or ""
            dial = ht.get("dial")
            country = ht.get("country")
            audio = {"audio": filename.strip()}
            if dial:
                dial = clean_node(wxr, None, [dial])
                audio["text"] = dial
                parse_pronunciation_tags(wxr, dial, audio)
            if country:
                parse_pronunciation_tags(wxr, country, audio)
            if ipa:
                audio["audio-ipa"] = ipa
            audios.append(audio)
            # XXX do we really want to extract pronunciations from these?
            # Or are they spurious / just describing what is in the
            # audio file?
            # if ipa:
            #     pron = {"ipa": ipa}
            #     if dial:
            #         parse_pronunciation_tags(wxr, dial, pron)
            #     if country:
            #         parse_pronunciation_tags(wxr, country, pron)
            #     data_append(data, "sounds", pron)
            return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
        if name in ("hyph", "hyphenation"):
            # {{hyph|en|re|late|caption="Hyphenation UK:"}}
            # {{hyphenation|it|quiè|to||qui|è|to||quié|to||qui|é|to}}
            # and also nocaption=1
            caption = clean_node(wxr, None, ht.get("caption", ""))
            tagsets, _ = decode_tags(caption)
            # flatten the tagsets into one; it would be really weird to have
            # several tagsets for a hyphenation caption
            tags = sorted(set(tag for tagset in tagsets for tag in tagset))
            # We'll just ignore any errors from tags, it's not very important
            # for hyphenation
            tags = [tag for tag in tags if not tag.startswith("error")]
            # Empty positional args ("||") separate alternative hyphenations.
            hyph_sequences: list[list[str]] = [[]]
            for text in [
                t for (k, t) in ht.items() if (isinstance(k, int) and k >= 2)
            ]:
                if not text:
                    hyph_sequences.append([])
                else:
                    hyph_sequences[-1].append(clean_node(wxr, None, text))
            for seq in hyph_sequences:
                hyphenations.append(Hyphenation(parts=seq, tags=tags))
            return ""
        return None

    may_be_duplicates = False

    def parse_pron_post_template_fn(
        name: str, ht: TemplateArgs, text: str
    ) -> str | None:
        # _post_template_fn handles templates *after* the work to expand
        # them has been done; this is exactly the same as _template_fn,
        # except with the additional expanded text as an input, and
        # possible side-effects from the expansion and recursion (like
        # calling other subtemplates that are handled in _template_fn.
        nonlocal may_be_duplicates
        if is_panel_template(wxr, name):
            return ""
        if name in {
            "q",
            "qualifier",
            "sense",
            "a",
            "accent",
            "l",
            "link",
            "lb",
            "lbl",
            "label",
        }:
            # Kludge: when these templates expand to /.../ or [...],
            # replace the expansion by something safe. This is used
            # to filter spurious IPA-looking expansions that aren't really
            # IPAs. We probably don't care about these templates in the
            # contexts where they expand to something containing these.
            v = re.sub(r'href="[^"]*"', "", text)  # Ignore URLs
            v = re.sub(r'src="[^"]*"', "", v)
            if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v):
                # Note: replacing by empty results in Lua errors that we
                # would rather not have. For example, voi/Middle Vietnamese
                # uses {{a|{{l{{vi|...}}}}, and the {{a|...}} will fail
                # if {{l|...}} returns empty.
                return "stripped-by-parse_pron_post_template_fn"
        if name in ("IPA", "enPR"):
            # Extract the data from IPA and enPR templates (same underlying
            # template) and replace them in-text with magical cookie that
            # can be later used to refer to the data's index inside
            # pron_templates.
            if pron_t := extract_pron_template(wxr, name, ht, text):
                pron_templates.append(pron_t)
                return f"__PRON_TEMPLATE_{len(pron_templates) - 1}__"
        # Catch templates that generate duplicate sound data entries
        # here; if the text produces a big, toggleable section, the
        # "header" for that section might be duplicated. Add more conditions
        # if necessary.
        if text.startswith("<") and "vsToggleElement" in text:
            may_be_duplicates = True
        return text

    def flattened_tree(lines: list[WikiNode | str]) -> Iterator[WikiNode | str]:
        # Yield every node from `lines` with nested lists flattened.
        assert isinstance(lines, list)
        for line in lines:
            yield from flattened_tree1(line)

    def flattened_tree1(node: WikiNode | str) -> Iterator[WikiNode | str]:
        # Recursive helper: detaches sublists from list items and yields
        # them after the (mutated) parent item.
        assert isinstance(node, (WikiNode, str))
        if isinstance(node, str):
            yield node
            return
        elif node.kind == NodeKind.LIST:
            for item in node.children:
                yield from flattened_tree1(item)
        elif node.kind == NodeKind.LIST_ITEM:
            new_children = []
            sublist = None
            for child in node.children:
                if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                    sublist = child
                else:
                    new_children.append(child)
            # Mutates the node in place: strips its sublist and normalizes
            # the list marker.
            node.children = new_children
            node.sarg = "*"
            yield node
            if sublist:
                yield from flattened_tree1(sublist)
        else:
            yield node

    # XXX Do not use flattened_tree more than once here, for example for
    # debug printing... The underlying data is changed, and the separated
    # sublists disappear.

    # Kludge for templates that generate several lines, but haven't
    # been caught by earlier kludges...
    def split_cleaned_node_on_newlines(
        contents: list[WikiNode | str],
    ) -> Iterator[str]:
        # Clean each flattened node (triggering the template callbacks
        # above) and yield the result line by line.
        for litem in flattened_tree(contents):
            ipa_text = clean_node(
                wxr,
                data,
                litem,
                template_fn=parse_pronunciation_template_fn,
                post_template_fn=parse_pron_post_template_fn,
            )
            for line in ipa_text.splitlines():
                yield line

    # have_pronunciations = False
    active_pos: str | None = None

    for line in split_cleaned_node_on_newlines(contents):
        prefix: str | None = None
        earlier_base_data: SoundData | None = None
        if not line:
            continue

        # Odd indices of this split are the captured template indices
        # (the __PRON_TEMPLATE_n__ cookies); even indices are the text
        # around them.
        split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line)
        for i, text in enumerate(split_templates):
            if not text:
                continue
            # clean up starts at the start of the line
            text = re.sub(r"^\**\s*", "", text).strip()
            if i == 0:
                # At the start of a line, check for stuff like "Noun:"
                # for active_pos; active_pos is a temporary data field
                # given to each saved SoundData entry which is later
                # used to sort the entries into their respective PoSes.
                m = re.match(r"\s*(\w+\s?\w*)\s*:?\s*", text)
                if m:
                    if (m_lower := m.group(1).lower()) in part_of_speech_map:
                        active_pos = part_of_speech_map[m_lower]["pos"]
                        text = text[m.end() :].strip()
            if not text:
                continue
            if i % 2 == 1:
                # re.split (with capture groups) splits the lines so that
                # every odd index is a captured splitter (a template index);
                # even entries are either empty strings or the text around
                # the splitters.
                base_pron_data, first_prons = pron_templates[int(text)]
                if base_pron_data:
                    earlier_base_data = base_pron_data
                    # print(f"Set {earlier_base_data=}")
                elif earlier_base_data is not None:
                    # merge data from an earlier iteration of this loop
                    for pr in first_prons:
                        if "note" in pr and "note" in earlier_base_data:
                            pr["note"] += ";" + earlier_base_data.get(
                                "note", ""
                            )
                        elif "note" in earlier_base_data:
                            pr["note"] = earlier_base_data["note"]
                        if "topics" in earlier_base_data:
                            data_extend(
                                pr, "topics", earlier_base_data["topics"]
                            )
                        if "tags" in pr and "tags" in earlier_base_data:
                            pr["tags"].extend(earlier_base_data["tags"])
                        elif "tags" in earlier_base_data:
                            pr["tags"] = sorted(set(earlier_base_data["tags"]))
                for pr in first_prons:
                    if active_pos:
                        pr["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if pr not in data.get("sounds", ()):
                        data_append(data, "sounds", pr)
                # This bit is handled
                continue

            if "IPA" in text:
                field: Literal[
                    "audio",
                    "audio-ipa",
                    "enpr",
                    "form",
                    "hangeul",
                    "homophone",
                    "ipa",
                    "mp3_url",
                    "note",
                    "ogg_url",
                    "other",
                    "rhymes",
                    "tags",
                    "text",
                    "topics",
                    "zh-pron",
                ] = "ipa"
            else:
                # This is used for Rhymes, Homophones, etc
                field = "other"

            # Check if it contains Japanese "Tokyo" pronunciation with
            # special syntax
            pron: SoundData
            m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text)
            if m:
                pron = {field: m.group(1)}  # type: ignore[misc]
                if active_pos:
                    pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                data_append(data, "sounds", pron)
                # have_pronunciations = True
                continue

            # Check if it contains Rhymes
            m = re.match(r"\s*Rhymes?: (.*)", text)
            if m:
                for ending in split_at_comma_semi(m.group(1)):
                    ending = ending.strip()
                    if ending:
                        pron = {"rhymes": ending}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True
                continue

            # Check if it contains homophones
            m = re.search(r"(?m)\bHomophones?: (.*)", text)
            if m:
                for w in split_at_comma_semi(m.group(1)):
                    w = w.strip()
                    if w:
                        pron = {"homophone": w}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True
                continue

            # Check if it contains Phonetic hangeul
            m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text)
            if m:
                seen = set()
                for w in m.group(1).split("/"):
                    w = w.strip()
                    if w and w not in seen:
                        seen.add(w)
                        pron = {"hangeul": w}
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                        data_append(data, "sounds", pron)
                        # have_pronunciations = True

            # This regex-based hyphenation detection left as backup
            m = re.search(r"\b(Syllabification|Hyphenation): *([^\n.]*)", text)
            if m:
                data_append(data, "hyphenation", m.group(2))
                commaseparated = m.group(2).split(",")
                if len(commaseparated) > 1:
                    for h in commaseparated:
                        # That second characters looks like a dash but it's
                        # actually unicode decimal code 8231, hyphenation dash
                        # Add more delimiters here if needed.
                        parts = re.split(r"-|‧", h.strip())
                        data_append(
                            data, "hyphenations", Hyphenation(parts=parts)
                        )
                    # no-op placeholder
                    ...
                else:
                    data_append(
                        data,
                        "hyphenations",
                        Hyphenation(parts=m.group(2).split(sep="-")),
                    )
                # have_pronunciations = True

            # See if it contains a word prefix restricting which forms the
            # pronunciation applies to (see amica/Latin) and/or parenthesized
            # tags.
            m = re.match(
                r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text
            )
            if m:
                prefix = m.group(2) or ""
                tagstext = m.group(3)
                text = text[m.end() :]
            else:
                m = re.match(r"^[*#\s]*([-\w]+):\s+", text)
                if m:
                    prefix = m.group(1)
                    tagstext = ""
                    text = text[m.end() :]
                else:
                    # Spanish has tags before pronunciations, eg. aceite/Spanish
                    m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text)
                    if m:
                        tagstext = m.group(1)
                        text = m.group(2)
                    else:
                        # No prefix. In this case, we inherit prefix
                        # from previous entry. This particularly
                        # applies for nested Audio files.
                        tagstext = ""
            if tagstext:
                earlier_base_data = {}
                parse_pronunciation_tags(wxr, tagstext, earlier_base_data)

            # Find romanizations from the pronunciation section (routinely
            # produced for Korean by {{ko-IPA}})
            for m in re.finditer(pron_romanization_re, text):
                prefix = m.group(1)
                w = m.group(2).strip()
                tag = pron_romanizations[prefix]
                form = {"form": w, "tags": tag.split()}
                data_append(data, "forms", form)

            # Find IPA pronunciations
            for m in re.finditer(
                r"(?m)/[^][\n/,]+?/" r"|" r"\[[^]\n0-9,/][^],/]*?\]", text
            ):
                v = m.group(0)
                # The regexp above can match file links. Skip them.
                if v.startswith("[[File:"):
                    continue
                if v == "/wiki.local/":
                    continue
                if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text:
                    # Attach this IPA to the audio entry referenced by the
                    # cookie left in the text by the template callbacks.
                    m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text)
                    assert m
                    idx = int(m.group(1))
                    if idx >= len(audios):
                        continue
                    if not audios[idx].get("audio-ipa"):
                        audios[idx]["audio-ipa"] = v
                    if prefix:
                        audios[idx]["form"] = prefix
                else:
                    if earlier_base_data:
                        pron = deepcopy(earlier_base_data)
                        pron[field] = v
                    else:
                        pron = {field: v}  # type: ignore[misc]
                        if active_pos:
                            pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if prefix:
                        pron["form"] = prefix
                    if active_pos:
                        pron["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
                    if may_be_duplicates is True:
                        # Skip entries identical to an already-saved sound.
                        ok = True
                        for comp_sound in data.get("sounds", []):
                            # Python has dict comparison since 3.8
                            if pron == comp_sound:
                                ok = False
                                break
                        if ok:
                            data_append(data, "sounds", pron)
                    else:
                        data_append(data, "sounds", pron)
                    # have_pronunciations = True

        # XXX what about {{hyphenation|...}}, {{hyph|...}}
        # and those used to be stored under "hyphenation"

        # Add data that was collected in template_fn
        for audio in audios:
            if "audio" in audio:
                # Compute audio file URLs
                fn = audio["audio"]
                # Strip certain characters, e.g., left-to-right mark
                fn = re.sub(r"[\u200f\u200e]", "", fn)
                fn = fn.strip()
                fn = urllib.parse.unquote(fn)
                # First character is usually uppercased
                if re.match(r"^[a-z][a-z]+", fn):
                    fn = fn[0].upper() + fn[1:]
                if fn in wxr.config.redirects:
                    fn = wxr.config.redirects[fn]
                # File extension is lowercased
                # XXX some words seem to need this, some don't seem to
                # have this??? what is the exact rule?
                # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn)
                # Spaces are converted to underscores
                fn = re.sub(r"\s+", "_", fn)
                # Compute hash digest part (Wikimedia Commons uses the MD5
                # of the filename for its upload directory layout)
                h = hashlib.md5()
                hname = fn.encode("utf-8")
                h.update(hname)
                digest = h.hexdigest()
                # Quote filename for URL
                qfn = urllib.parse.quote(fn)
                # For safety when writing files
                qfn = qfn.replace("/", "__slash__")
                if re.search(r"(?i)\.(ogg|oga)$", fn):
                    ogg = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
                    )
                else:
                    # Non-ogg originals get a transcoded .ogg URL.
                    ogg = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/transcoded/"
                        "{}/{}/{}/{}.ogg".format(
                            digest[:1], digest[:2], qfn, qfn
                        )
                    )
                if re.search(r"(?i)\.(mp3)$", fn):
                    mp3 = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
                    )
                else:
                    mp3 = (
                        "https://upload.wikimedia.org/wikipedia/"
                        "commons/transcoded/"
                        "{}/{}/{}/{}.mp3".format(
                            digest[:1], digest[:2], qfn, qfn
                        )
                    )
                audio["ogg_url"] = ogg
                audio["mp3_url"] = mp3
                if active_pos:
                    audio["pos"] = active_pos  # type: ignore[typeddict-unknown-key]
            if audio not in data.get("sounds", ()):
                data_append(data, "sounds", audio)

        # if audios:
        #     have_pronunciations = True
        audios = []

        data_extend(data, "hyphenations", hyphenations)
        hyphenations = []

    ## I have commented out the otherwise unused have_pronunciation
    ## toggles; uncomment them to use this debug print
    # if not have_pronunciations and not have_panel_templates:
    #     wxr.wtp.debug("no pronunciations found from pronunciation section",
    #                   sortid="pronunciations/533")
def extract_th_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
) -> None:
    """Extract sound data from an expanded {{th-pron}} table.

    https://en.wiktionary.org/wiki/Template:th-pron

    The template renders an HTML table; row headers carry raw tags (with
    rowspan possibly covering several rows), and cells carry IPA, audio
    links, homophones, or romanizations.  Results are appended to
    ``word_entry["sounds"]``.
    """
    # Tracks a header cell's tags together with how many more rows its
    # rowspan still covers.
    @dataclass
    class TableHeader:
        raw_tags: list[str]
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Age the carried-over headers: keep only those whose rowspan
            # still extends into this row.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                # Well-known section headers select the data field for this
                # row; any other non-empty header contributes raw tags.
                if header_str.startswith("(standard) IPA"):
                    field = "ipa"
                elif header_str.startswith("Homophones"):
                    field = "homophone"
                elif header_str == "Audio":
                    field = "audio"
                elif header_str != "":
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    # Header text lines look like "{tag1; tag2}".
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = create_audio_url_dict(filename)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append({"homophone": word})
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            # "[tag, tag]" annotations apply to the next
                            # value span; other <small> text is treated as a
                            # romanization of the previous sound.
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                sounds[-1]["roman"] = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = {}
                                # Cell-level tags first, then header tags
                                # (headers are lowercased for tag lookup).
                                for raw_tag in raw_tags:
                                    if raw_tag in valid_tags:
                                        data_append(sound, "tags", raw_tag)
                                    else:
                                        data_append(sound, "raw_tags", raw_tag)
                                for header in row_headers:
                                    for raw_tag in header.raw_tags:
                                        if raw_tag.lower() in valid_tags:
                                            data_append(
                                                sound, "tags", raw_tag.lower()
                                            )
                                        else:
                                            data_append(
                                                sound, "raw_tags", raw_tag
                                            )
                                if "romanization" in sound.get("tags", []):
                                    field = "roman"
                                sound[field] = node_str
                                sounds.append(sound)

    # Collect any categories etc. from the expanded template into word_entry.
    clean_node(wxr, word_entry, expanded_node)
    data_extend(word_entry, "sounds", sounds)
def extract_zh_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    """Extract pronunciation data from a ``{{zh-pron}}`` template.

    https://en.wiktionary.org/wiki/Template:zh-pron

    The template is expanded and every list item in the expanded tree is
    handed to ``extract_zh_pron_list_item``; the resulting sound entries
    are appended to ``word_entry["sounds"]``.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    seen: set = set()
    collected = []
    for lst in expanded.find_child_recursively(NodeKind.LIST):
        # Nested lists already consumed while processing a parent item are
        # recorded in `seen` by extract_zh_pron_list_item; skip them here.
        if lst in seen:
            continue
        for item in lst.find_child(NodeKind.LIST_ITEM):
            collected.extend(extract_zh_pron_list_item(wxr, item, [], seen))
    # Run clean_node over the whole expansion so categories etc. are
    # attached to word_entry.
    clean_node(wxr, word_entry, expanded)
    data_extend(word_entry, "sounds", collected)
def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[SoundData]:
    """Extract sounds from one list item of an expanded {{zh-pron}} template.

    ``raw_tags`` are the tags accumulated from ancestor list items; a copy
    is taken so siblings are not affected.  Child lists are processed
    recursively and added to ``seen_lists`` so the caller does not visit
    them a second time.  Returns the sound entries found in this item and
    its descendants.
    """
    current_tags = raw_tags[:]
    sounds = []
    # Only the first <small> in an item holds dialect/variety tags; later
    # <small> nodes annotate the preceding sound entry instead.
    is_first_small_tag = True
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            link_str = clean_node(wxr, None, node.largs)
            node_str = clean_node(wxr, None, node)
            if link_str.startswith("File:"):
                # Audio file link.
                sound = create_audio_url_dict(link_str.removeprefix("File:"))
                sound["raw_tags"] = current_tags[:]
                translate_zh_pron_raw_tags(sound)
                sounds.append(sound)
            elif node_str != "":
                # Non-file links (e.g. dialect names) act as raw tags for
                # the rest of this item.
                current_tags.append(node_str)
        elif isinstance(node, HTMLNode):
            if node.tag == "small":
                if is_first_small_tag:
                    # Tag text for this item; <sup> children (footnote
                    # markers) are excluded before cleaning.
                    raw_tag_text = clean_node(
                        wxr,
                        None,
                        [
                            n
                            for n in node.children
                            if not (isinstance(n, HTMLNode) and n.tag == "sup")
                        ],
                    )
                    current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                elif len(sounds) > 0:
                    # Subsequent <small> nodes qualify the last sound.
                    data_extend(
                        sounds[-1],
                        "raw_tags",
                        split_zh_pron_raw_tag(clean_node(wxr, None, node)),
                    )
                    translate_zh_pron_raw_tags(sounds[-1])
                is_first_small_tag = False
            elif node.tag == "span":
                # Pronunciation data proper lives in <span> nodes.
                sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
            elif (
                node.tag == "table"
                and len(current_tags) > 0
                and current_tags[-1] == "Homophones"
            ):
                # A table directly after a "Homophones" label holds
                # homophone words.
                sounds.extend(
                    extract_zh_pron_homophone_table(wxr, node, current_tags)
                )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Recurse into child lists, passing down the accumulated tags,
            # and mark them seen for the top-level caller.
            seen_lists.add(node)
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                sounds.extend(
                    extract_zh_pron_list_item(
                        wxr, child_list_item, current_tags, seen_lists
                    )
                )

    return sounds
def extract_zh_pron_homophone_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    """Extract homophone entries from a {{zh-pron}} "Homophones" table.

    Only word spans carrying a ``lang`` attribute and a Han-script class
    ("Hant", "Hans" or "Hani") are kept; "Hant"/"Hans" additionally mark
    the entry as Traditional-/Simplified-Chinese.
    """
    results = []
    for cell in table.find_html_recursively("td"):
        for span in cell.find_html("span"):
            span_class = span.attrs.get("class", "")
            span_lang = span.attrs.get("lang", "")
            text = clean_node(wxr, None, span)
            # Skip separators ("/"), empty spans, and non-Han-script spans.
            if (
                text in ["", "/"]
                or span_lang == ""
                or span_class not in ["Hant", "Hans", "Hani"]
            ):
                continue
            sound = {"homophone": text, "raw_tags": raw_tags[:]}
            if span_class == "Hant":
                data_append(sound, "tags", "Traditional-Chinese")
            elif span_class == "Hans":
                data_append(sound, "tags", "Simplified-Chinese")
            translate_zh_pron_raw_tags(sound)
            results.append(sound)

    return results
def translate_zh_pron_raw_tags(sound: SoundData):
    """Translate ``sound["raw_tags"]`` into canonical tags in place.

    Each raw tag is looked up first in ZH_PRON_TAGS (value may be a single
    tag or a list of tags), then in the generic ``valid_tags`` set.
    Unrecognized tags are kept, deduplicated, under ``raw_tags``; the key
    is dropped entirely when everything was translated.
    """
    from .zh_pron_tags import ZH_PRON_TAGS

    remaining = []
    for raw_tag in sound.get("raw_tags", []):
        if raw_tag in ZH_PRON_TAGS:
            translated = ZH_PRON_TAGS[raw_tag]
            if isinstance(translated, str):
                data_append(sound, "tags", translated)
            elif isinstance(translated, list) and translated not in sound.get(
                "tags", []
            ):
                data_extend(sound, "tags", translated)
            continue
        if raw_tag in valid_tags:
            if raw_tag not in sound.get("tags", []):
                data_append(sound, "tags", raw_tag)
        elif raw_tag not in remaining:
            remaining.append(raw_tag)

    if len(remaining) > 0:
        sound["raw_tags"] = remaining
    elif "raw_tags" in sound:
        del sound["raw_tags"]
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a {{zh-pron}} tag string into individual raw tags.

    Plain text is split on ",", ":", ";" and the word "and", with an
    optional "incl. " prefix removed from each piece.  Parenthesized
    groups are split recursively; tags found outside the parentheses come
    first in the result, followed by the tags from inside them.
    """
    if "(" not in raw_tag_text:
        pieces = re.split(r",|:|;| and ", raw_tag_text)
        stripped = (p.strip().removeprefix("incl. ").strip() for p in pieces)
        return [p for p in stripped if p != ""]

    inner_tags = []
    spans = []
    for match in re.finditer(r"\([^()]+\)", raw_tag_text):
        spans.append((match.start(), match.end()))
        # Recurse on the text between the parentheses.
        inner_tags.extend(
            split_zh_pron_raw_tag(
                raw_tag_text[match.start() + 1 : match.end() - 1]
            )
        )
    # Reassemble whatever was not inside a parenthesized group.
    outside = ""
    previous_end = 0
    for start, end in spans:
        outside += raw_tag_text[previous_end:start]
        previous_end = end
    outside += raw_tag_text[previous_end:]
    if outside != raw_tag_text:
        return split_zh_pron_raw_tag(outside) + inner_tags
    # No balanced group matched (stray "("): keep the text as one tag.
    inner_tags.append(outside)
    return inner_tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    """Extract sound entries from one pronunciation <span> of {{zh-pron}}.

    The span may contain the pronunciation text itself, a nested ``*-Latn``
    span with a romanization, a trailing <small> with extra tags, and a
    "[Phonetic: ...]" variant.  Returns one SoundData per pronunciation
    found, each carrying a copy of ``raw_tags``.
    """
    sounds = []
    small_tags = []  # tags from a <small> child; applied to the last sound
    pron_nodes = []  # children that make up the pronunciation text proper
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            # Latin-script nested span holds the romanization, usually in
            # parentheses.
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            # Everything after this marker is the phonetic variant; stop
            # treating children as the main pronunciation.
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # Spans with an IPA class carry IPA notation; others carry a
            # romanized/phonetic zh_pron string.
            if "IPA" in span_tag.attrs.get("class", ""):
                sound = {"ipa": zh_pron, "raw_tags": raw_tags[:]}
            else:
                sound = {"zh_pron": zh_pron, "raw_tags": raw_tags[:]}
            if roman != "":
                sound["roman"] = roman
            sounds.append(sound)
    if len(sounds) > 0:
        # <small> tags qualify only the last pronunciation in the span.
        data_extend(sounds[-1], "raw_tags", small_tags)
    if phonetic_pron != "":
        sound = {
            "zh_pron": phonetic_pron,
            "raw_tags": raw_tags[:] + ["Phonetic"],
        }
        if roman != "":
            sound["roman"] = roman
        sounds.append(sound)
    for sound in sounds:
        translate_zh_pron_raw_tags(sound)
    return sounds
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→", and "/" — the last only when the whole
    string does not itself start with "/" (i.e. is not /.../-delimited IPA
    notation).  Separators nested inside parentheses never split.  Empty
    parts are dropped and every returned part is whitespace-stripped.
    """
    depth = 0  # current parenthesis nesting level
    parts: list[str] = []
    current = ""
    # "/" acts as a separator only when the string is not IPA-style /.../.
    slash_splits = not zh_pron.startswith("/")
    for ch in zh_pron:
        if (
            (ch in (",", ";", "→") or (ch == "/" and slash_splits))
            and depth == 0
            and len(current.strip()) > 0
        ):
            parts.append(current.strip())
            current = ""
            continue
        if ch == "(":
            depth += 1
        elif ch == ")":
            depth -= 1
        # Separator chars that did not split (inside parens, or with no
        # accumulated text yet) are kept as ordinary characters.
        current += ch

    # Fix: strip the trailing part too, matching the parts appended above
    # (previously the last element kept its surrounding whitespace).
    if current.strip() != "":
        parts.append(current.strip())
    return parts