Coverage for src/wiktextract/extractor/en/pronunciation.py: 82%
775 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-05-29 08:54 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-05-29 08:54 +0000
1import hashlib
2import re
3import urllib
4from copy import deepcopy
5from dataclasses import dataclass
6from typing import Iterator, Literal, NamedTuple
8from wikitextprocessor import (
9 HTMLNode,
10 LevelNode,
11 NodeKind,
12 TemplateNode,
13 WikiNode,
14)
16from ...clean import clean_value
17from ...datautils import data_append, data_extend, split_at_comma_semi
18from ...page import LEVEL_KINDS, clean_node, is_panel_template
19from ...tags import valid_tags
20from ...wxr_context import WiktextractContext
21from ..share import create_audio_url_dict
22from .form_descriptions import (
23 classify_desc,
24 decode_tags,
25 parse_pronunciation_tags,
26)
27from .parts_of_speech import part_of_speech_map
28from .type_utils import Hyphenation, SoundData, TemplateArgs, WordData
# Romanization labels that can appear in the pronunciation section, mapped
# to the space-separated tags recorded for the form that follows the label.
# The surrounding spaces are part of each label and must match exactly.
pron_romanizations = {
    " Revised Romanization ": "romanization revised",
    " Revised Romanization (translit.) ": (
        "romanization revised transliteration"
    ),
    " McCune-Reischauer ": "McCune-Reischauer romanization",
    " McCune–Reischauer ": "McCune-Reischauer romanization",
    " Yale Romanization ": "Yale romanization",
}
# Matches one of the labels above at the start of a line (group 1) followed
# by the rest of that line (group 2).  Longer labels are tried first so that
# "Revised Romanization (translit.)" is not shadowed by its shorter prefix.
pron_romanization_re = re.compile(
    "(?m)^({})([^\n]+)".format(
        "|".join(
            map(re.escape, sorted(pron_romanizations, key=len, reverse=True))
        )
    )
)
# Shape of a cleaned, expanded IPA/enPR template:
#   group 2: optional leading "(qualifier)"
#   group 3: the "IPA⁽ᵏᵉʸ⁾" or "enPR" label
#   group 4: everything after "label: " (body plus any trailing qualifier)
#   group 5: the body without a trailing " (qualifier)"
#   group 7: text of that trailing qualifier, if any
IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$"
IPA_EXTRACT_RE = re.compile(IPA_EXTRACT)
class PronunciationPosMatch(NamedTuple):
    """Outcome of looking for part-of-speech labels inside a piece of text."""

    # Part-of-speech values that were recognised, without duplicates.
    pos_values: list[str]
    # The pieces that were not recognised, joined back with ", ".
    residual: str
class PronunciationPosPrefix(NamedTuple):
    """A part-of-speech marker found at the start of a pronunciation line."""

    # Part-of-speech values named by the marker.
    pos_values: list[str]
    # What is left of the line once the marker has been removed.
    text: str
    # True when the marker should also apply to the lines that follow.
    is_persistent: bool
# Templates whose arguments are inspected for part-of-speech labels when they
# occur in a pronunciation section.
PRON_POS_TEMPLATE_NAMES = {
    # fmt: off
    "q", "qualifier", "qual", "i",
    "sense",
    "a", "accent",
    "lb", "lbl", "label",
    # fmt: on
}
def normalize_pronunciation_pos_label(label: str) -> str:
    """Reduce a free-form label to the form used for part-of-speech lookup.

    The label is lower-cased and its whitespace collapsed, a trailing
    parenthesised explanation is dropped, surrounding whitespace, parentheses
    and colons are stripped, and a final "sense"/"senses" word is removed.
    """
    collapsed = re.sub(r"\s+", " ", label.strip().lower())
    # "noun (barren areas)" -> "noun": the explanation is not part of the
    # part-of-speech name.
    without_gloss = re.sub(r"\s*\([^)]*\)\s*$", "", collapsed).strip()
    trimmed = without_gloss.strip(" \t\n\r():")
    return re.sub(r"\s+senses?$", "", trimmed).strip()
def split_pronunciation_pos_text(text: str) -> PronunciationPosMatch:
    """Separate part-of-speech labels from everything else in ``text``.

    The text is cut at commas, semicolons and the words "and"/"or".  Each
    piece whose normalized form is a key of ``part_of_speech_map`` contributes
    that entry's "pos" value (once); every other piece is kept verbatim and
    returned joined with ", " as the residual.
    """
    recognised: list[str] = []
    leftovers: list[str] = []
    for raw_piece in re.split(r"\s*(?:[,;]|\band\b|\bor\b)\s*", text):
        piece = raw_piece.strip()
        if not piece:
            continue
        key = normalize_pronunciation_pos_label(piece)
        if key not in part_of_speech_map:
            leftovers.append(piece)
            continue
        pos = part_of_speech_map[key]["pos"]
        if pos not in recognised:
            recognised.append(pos)
    return PronunciationPosMatch(recognised, ", ".join(leftovers))
def set_sound_pos(sound: SoundData, pos_values: list[str] | None) -> None:
    """Store a copy of ``pos_values`` under ``sound["pos"]``.

    Nothing is written when ``pos_values`` is ``None`` or empty, so an
    existing "pos" entry is left as it is in that case.
    """
    if pos_values:
        # Copy so later changes to the caller's list do not leak into the
        # sound entry.
        sound["pos"] = [*pos_values]  # type: ignore[typeddict-unknown-key]
def parse_pronunciation_tags_with_pos(
    wxr: WiktextractContext, text: str, sound: SoundData
) -> list[str]:
    """Apply qualifier text to ``sound``, handling part-of-speech labels.

    Part-of-speech labels found in ``text`` are stored on ``sound`` through
    ``set_sound_pos``; whatever remains is passed on to
    ``parse_pronunciation_tags``.  The recognised part-of-speech values are
    returned.
    """
    pos_values, residual = split_pronunciation_pos_text(text)
    set_sound_pos(sound, pos_values)
    if residual:
        parse_pronunciation_tags(wxr, residual, sound)
    return pos_values
def extract_pos_prefix(text: str) -> PronunciationPosPrefix | None:
    """Detect a part-of-speech marker at the start of a pronunciation line.

    Three shapes are recognised, in this order:

    * the whole text is nothing but part-of-speech labels and is not wrapped
      in parentheses: persistent marker, no remaining text;
    * ``labels: rest``: persistent marker, ``rest`` is the remaining text;
    * ``(labels) rest``: non-persistent marker, ``rest`` is the remaining
      text.

    In every case the labels must all be part-of-speech names; otherwise the
    shape is rejected.  Returns ``None`` when no shape applies.
    """

    def only_pos(candidate: str) -> list[str] | None:
        # The recognised values if `candidate` holds part-of-speech labels
        # and nothing else, otherwise None.
        found = split_pronunciation_pos_text(candidate)
        if found.pos_values and not found.residual:
            return found.pos_values
        return None

    trimmed = text.strip()
    wrapped_in_parens = trimmed.startswith("(") and trimmed.endswith(")")
    # Fully parenthesised text is left to the "(labels) rest" shape below so
    # that it is not treated as persistent.
    if not wrapped_in_parens:
        if pos_values := only_pos(text):
            return PronunciationPosPrefix(pos_values, "", True)

    with_colon = re.match(r"\s*([^:()]+?)\s*:\s*(.*)$", text)
    if with_colon:
        if pos_values := only_pos(with_colon.group(1)):
            return PronunciationPosPrefix(
                pos_values, with_colon.group(2).strip(), True
            )

    with_parens = re.match(r"\s*\(([^()]*)\)\s*(.*)$", text)
    if with_parens:
        if pos_values := only_pos(with_parens.group(1)):
            return PronunciationPosPrefix(
                pos_values, with_parens.group(2).strip(), False
            )

    return None
def extract_pronunciation_pos_template(
    wxr: WiktextractContext,
    name: str,
    ht: TemplateArgs,
    lang_code: str,
) -> PronunciationPosMatch:
    """Collect part-of-speech labels from a qualifier-like template's args.

    For "a"/"accent"/"lb"/"lbl"/"label" the positional arguments from 2
    upwards are examined; if there are none and argument 1 differs from
    ``lang_code``, argument 1 is examined instead.  For every other template
    name all positional arguments from 1 upwards are examined.

    Each argument is cleaned with ``clean_node`` and split with
    ``split_pronunciation_pos_text``; the recognised values (without
    duplicates) and the ", "-joined residuals are returned.
    """
    skips_first_arg = name in {"a", "accent", "lb", "lbl", "label"}
    first_index = 2 if skips_first_arg else 1
    pos_args = [
        value
        for key, value in ht.items()
        if isinstance(key, int) and key >= first_index
    ]
    if skips_first_arg and not pos_args and ht.get(1) != lang_code:
        pos_args = [ht.get(1, "")]

    pos_values: list[str] = []
    residual: list[str] = []
    for arg in pos_args:
        found = split_pronunciation_pos_text(clean_node(wxr, None, [arg]))
        pos_values.extend(
            pos
            for pos in dict.fromkeys(found.pos_values)
            if pos not in pos_values
        )
        if found.residual:
            residual.append(found.residual)
    return PronunciationPosMatch(pos_values, ", ".join(residual))
def extract_pron_template(
    wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str
) -> tuple[SoundData, list[SoundData]] | None:
    """Turn one expanded IPA/enPR template into structured sound entries.

    Called from post_template_fn so that the template can be replaced by a
    breadcrumb in the text and resolved later.  Returns ``(base_data,
    sound_datas)``: ``base_data`` holds what was parsed from the qualifiers
    applying to the whole template, and is returned separately so that a
    line such as
      (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/
    can share it with the other templates on that line.  ``sound_datas`` has
    one entry per pronunciation, each starting as a copy of ``base_data``.

    Returns ``None`` (after logging an error) when the expanded text does not
    match ``IPA_EXTRACT_RE`` or has no pronunciation body.
    """
    cleaned = clean_value(wxr, expanded)
    m = IPA_EXTRACT_RE.match(cleaned)
    if m is None:
        wxr.wtp.error(
            f"Text cannot match IPA_EXTRACT_RE regex: "
            f"{cleaned=}, {tname=}, {targs=}",
            sortid="en/pronunciation/54",
        )
        return None

    main_qual = m.group(2) or ""
    if "qq" in targs:
        # The template carries a qualifier meant for every entry, but it is
        # rendered at the very end, where it is indistinguishable from the
        # post-qualifier of the last entry ("... /ipa3/ (foo) (bar)" with
        # foo possibly absent).  Take it off the body and treat it as a
        # template-wide qualifier.
        pron_body = m.group(5)
        post_qual = m.group(7)
    else:
        pron_body = m.group(4)
        post_qual = ""

    if not pron_body:
        wxr.wtp.error(
            f"Regex failed to find 'body' from {cleaned=}",
            sortid="en/pronunciation/81",
        )
        return None

    # Template-wide data: the starting point for every entry below, and also
    # handed back to the caller for reuse on the same line.
    base_data: SoundData = {}
    for qualifier in (main_qual, post_qual):
        if qualifier:
            parse_pronunciation_tags(wxr, qualifier, base_data)

    # Pass 1: cut the body at commas/semicolons that are outside parentheses.
    # Each group ends up as a list of chunks, where a parenthesised chunk is
    # kept whole, e.g. ["(main-qualifier)", "/phon/", "(post-qualifier)"].
    groups: list[list[str]] = [[]]
    depth = 0
    pending: list[str] = []
    for index, token in enumerate(
        re.split(r"(\s*,|;|\(|\)\s*)", pron_body)
    ):
        if not token:
            continue
        bare = token.strip()
        if bare == "(":
            if depth == 0 and index > 0:
                if text_so_far := "".join(pending).strip():
                    groups[-1].append(text_so_far)
            pending = [token]
            depth += 1
        elif bare == ")":
            depth -= 1
            if depth == 0 and "".join(pending).strip():
                pending.append(token)
                groups[-1].append("".join(pending).strip())
                pending = []
        elif depth == 0 and bare in (",", ";"):
            if text_so_far := "".join(pending).strip():
                groups[-1].append(text_so_far)
            pending = []
            groups.append([])
        else:
            pending.append(token)
    if pending:
        groups[-1].append("".join(pending).strip())

    # Pass 2: reduce every group to [pre-qualifier, pronunciation,
    # post-qualifier], using "" for a missing qualifier.
    new_parts: list[list[str]] = []
    for entry in groups:
        if not entry:
            continue
        head, tail = entry[0], entry[-1]
        has_pre_qual = head.startswith("(") and head.endswith(")")
        # A lone chunk is never a post-qualifier, even if parenthesised.
        has_post_qual = (
            len(entry) > 1 and tail.startswith("(") and tail.endswith(")")
        )
        start = 1 if has_pre_qual else 0
        stop = -1 if has_post_qual else len(entry)
        pronunciation = "".join(entry[start:stop]).strip()
        if not pronunciation:
            wxr.wtp.error(
                f"Missing IPA/enPRO sound data between qualifiers?{entry=}",
                sortid="en/pronunciation/153",
            )
        new_parts.append(
            [
                head[1:-1].strip() if has_pre_qual else "",
                pronunciation,
                tail[1:-1].strip() if has_post_qual else "",
            ]
        )

    # Pass 3: build one sound entry per pronunciation.
    sound_datas: list[SoundData] = []
    for pre_qual, pronunciation, entry_post_qual in new_parts:
        sd = deepcopy(base_data)
        if pre_qual:
            parse_pronunciation_tags(wxr, pre_qual, sd)
        if entry_post_qual:
            parse_pronunciation_tags(wxr, entry_post_qual, sd)
        if tname == "enPR":
            sd["enpr"] = pronunciation
        else:
            sd["ipa"] = pronunciation
        sound_datas.append(sd)

    return base_data, sound_datas
325def parse_pronunciation(
326 wxr: WiktextractContext,
327 level_node: LevelNode,
328 data: WordData,
329 etym_data: WordData,
330 have_etym: bool,
331 base_data: WordData,
332 lang_code: str,
333) -> None:
334 """Parses the pronunciation section from a language section on a
335 page."""
336 if level_node.kind in LEVEL_KINDS: 336 ↛ 349line 336 didn't jump to line 349 because the condition on line 336 was always true
337 contents: list[str | WikiNode | TemplateNode] = []
338 for node in level_node.children:
339 if isinstance(node, TemplateNode):
340 if node.template_name == "th-pron":
341 extract_th_pron_template(wxr, data, node)
342 elif node.template_name == "zh-pron":
343 extract_zh_pron_template(wxr, data, node)
344 else:
345 contents.append(node)
346 else:
347 contents.append(node)
348 else:
349 contents = [level_node]
350 # Remove subsections, such as Usage notes. They may contain IPAchar
351 # templates in running text, and we do not want to extract IPAs from
352 # those.
353 # Filter out only LEVEL_KINDS; 'or' is doing heavy lifting here
354 # Slip through not-WikiNodes, then slip through WikiNodes that
355 # are not LEVEL_KINDS.
356 contents = [
357 x
358 for x in contents
359 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
360 ]
361 if not any(
362 isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents
363 ):
364 # expand all templates
365 new_contents: list[str | WikiNode | TemplateNode] = []
366 for lst in contents:
367 if isinstance(lst, TemplateNode):
368 temp = wxr.wtp.node_to_wikitext(lst)
369 temp = wxr.wtp.expand(temp)
370 temp_parsed = wxr.wtp.parse(temp)
371 new_contents.extend(temp_parsed.children)
372 else:
373 new_contents.append(lst)
374 contents = new_contents
376 if have_etym and data is base_data: 376 ↛ 377line 376 didn't jump to line 377 because the condition on line 376 was never true
377 data = etym_data
378 pron_templates: list[tuple[SoundData, list[SoundData]]] = []
379 pron_pos_markers: list[list[str]] = []
380 hyphenations: list[Hyphenation] = []
381 audios: list[SoundData] = []
382 have_panel_templates = False
384 def parse_pronunciation_template_fn(
385 name: str, ht: TemplateArgs
386 ) -> str | None:
387 """Handle pronunciation and hyphenation templates"""
388 # _template_fn handles templates *before* they are expanded;
389 # this allows for special handling before all the work needed
390 # for expansion is done.
391 nonlocal have_panel_templates
392 if is_panel_template(wxr, name):
393 have_panel_templates = True
394 return ""
395 if name == "audio":
396 filename = ht.get(2) or ""
397 audio: SoundData = {"audio": filename.strip()}
398 dialect = ht.get("a", "")
399 if "aa" in ht: 399 ↛ 400line 399 didn't jump to line 400 because the condition on line 399 was never true
400 dialect += ", " + ht.get("aa", "")
401 if dialect:
402 dialect = dialect.replace("<", "").replace(">", "")
403 dialect = clean_node(wxr, None, [dialect])
404 for part in split_at_comma_semi(dialect):
405 if "(" not in part:
406 parse_pronunciation_tags(wxr, part, audio)
407 else:
408 for ppart in re.split(r"[][()]", part):
409 parse_pronunciation_tags(wxr, ppart, audio)
410 desc = ht.get(3) or ""
411 desc = clean_node(wxr, None, [desc])
412 if desc: 412 ↛ 413line 412 didn't jump to line 413 because the condition on line 412 was never true
413 audio["text"] = desc
414 m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc)
415 skip = False
416 if m: 416 ↛ 417line 416 didn't jump to line 417 because the condition on line 416 was never true
417 par = m.group(1)
418 cls = classify_desc(par)
419 if cls == "tags":
420 parse_pronunciation_tags(wxr, par, audio)
421 else:
422 skip = True
423 if skip: 423 ↛ 424line 423 didn't jump to line 424 because the condition on line 423 was never true
424 return ""
425 audios.append(audio)
426 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
427 if name == "audio-IPA": 427 ↛ 428line 427 didn't jump to line 428 because the condition on line 427 was never true
428 filename = ht.get(2) or ""
429 ipa = ht.get(3) or ""
430 dial = ht.get("dial")
431 audio = {"audio": filename.strip()}
432 if dial:
433 dial = clean_node(wxr, None, [dial])
434 audio["text"] = dial
435 if ipa:
436 audio["audio-ipa"] = ipa
437 audios.append(audio)
438 # The problem with these IPAs is that they often just describe
439 # what's in the sound file, rather than giving the pronunciation
440 # of the word alone. It is common for audio files to contain
441 # multiple pronunciations or articles in the same file, and then
442 # this IPA often describes what is in the file.
443 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
444 if name == "audio-pron":
445 filename = ht.get(2) or ""
446 ipa = ht.get("ipa") or ""
447 dial = ht.get("dial")
448 country = ht.get("country")
449 audio = {"audio": filename.strip()}
450 if dial: 450 ↛ 454line 450 didn't jump to line 454 because the condition on line 450 was always true
451 dial = clean_node(wxr, None, [dial])
452 audio["text"] = dial
453 parse_pronunciation_tags(wxr, dial, audio)
454 if country: 454 ↛ 456line 454 didn't jump to line 456 because the condition on line 454 was always true
455 parse_pronunciation_tags(wxr, country, audio)
456 if ipa: 456 ↛ 458line 456 didn't jump to line 458 because the condition on line 456 was always true
457 audio["audio-ipa"] = ipa
458 audios.append(audio)
459 # XXX do we really want to extract pronunciations from these?
460 # Or are they spurious / just describing what is in the
461 # audio file?
462 # if ipa:
463 # pron = {"ipa": ipa}
464 # if dial:
465 # parse_pronunciation_tags(wxr, dial, pron)
466 # if country:
467 # parse_pronunciation_tags(wxr, country, pron)
468 # data_append(data, "sounds", pron)
469 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
470 if name in ("hyph", "hyphenation"):
471 # {{hyph|en|re|late|caption="Hyphenation UK:"}}
472 # {{hyphenation|it|quiè|to||qui|è|to||quié|to||qui|é|to}}
473 # and also nocaption=1
474 caption = clean_node(wxr, None, ht.get("caption", ""))
475 tagsets, _ = decode_tags(caption)
476 # flatten the tagsets into one; it would be really weird to have
477 # several tagsets for a hyphenation caption
478 tags = sorted(set(tag for tagset in tagsets for tag in tagset))
479 # We'll just ignore any errors from tags, it's not very important
480 # for hyphenation
481 tags = [tag for tag in tags if not tag.startswith("error")]
482 hyph_sequences: list[list[str]] = [[]]
483 for text in [
484 t for (k, t) in ht.items() if (isinstance(k, int) and k >= 2)
485 ]:
486 if not text:
487 hyph_sequences.append([])
488 else:
489 hyph_sequences[-1].append(clean_node(wxr, None, text))
490 for seq in hyph_sequences:
491 hyphenations.append(Hyphenation(parts=seq, tags=tags))
492 return ""
493 return None
495 may_be_duplicates = False
497 def parse_pron_post_template_fn(
498 name: str, ht: TemplateArgs, text: str
499 ) -> str | None:
500 # _post_template_fn handles templates *after* the work to expand
501 # them has been done; this is exactly the same as _template_fn,
502 # except with the additional expanded text as an input, and
503 # possible side-effects from the expansion and recursion (like
504 # calling other subtemplates that are handled in _template_fn.
505 nonlocal may_be_duplicates
506 if is_panel_template(wxr, name): 506 ↛ 507line 506 didn't jump to line 507 because the condition on line 506 was never true
507 return ""
508 if name in PRON_POS_TEMPLATE_NAMES:
509 pos_match = extract_pronunciation_pos_template(
510 wxr, name, ht, lang_code
511 )
512 if pos_match.pos_values:
513 pron_pos_markers.append(pos_match.pos_values)
514 marker = (
515 f"__PRON_POS_MARKER_{len(pron_pos_markers) - 1}__"
516 )
517 if pos_match.residual: 517 ↛ 518line 517 didn't jump to line 518 because the condition on line 517 was never true
518 return f"{marker} ({pos_match.residual})"
519 return marker
520 if name in {
521 *PRON_POS_TEMPLATE_NAMES,
522 "l",
523 "link",
524 }:
525 # Kludge: when these templates expand to /.../ or [...],
526 # replace the expansion by something safe. This is used
527 # to filter spurious IPA-looking expansions that aren't really
528 # IPAs. We probably don't care about these templates in the
529 # contexts where they expand to something containing these.
530 v = re.sub(r'href="[^"]*"', "", text) # Ignore URLs
531 v = re.sub(r'src="[^"]*"', "", v)
532 if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v):
533 # Note: replacing by empty results in Lua errors that we
534 # would rather not have. For example, voi/Middle Vietnamese
535 # uses {{a|{{l{{vi|...}}}}, and the {{a|...}} will fail
536 # if {{l|...}} returns empty.
537 return "stripped-by-parse_pron_post_template_fn"
538 if name in ("IPA", "enPR"):
539 # Extract the data from IPA and enPR templates (same underlying
540 # template) and replace them in-text with magical cookie that
541 # can be later used to refer to the data's index inside
542 # pron_templates.
543 if pron_t := extract_pron_template(wxr, name, ht, text):
544 pron_templates.append(pron_t)
545 return f"__PRON_TEMPLATE_{len(pron_templates) - 1}__"
546 # Catch templates that generate duplicate sound data entries
547 # here; if the text produces a big, toggleable section, the
548 # "header" for that section might be duplicated. Add more conditions
549 # if necessary.
550 if text.startswith("<") and "vsToggleElement" in text: 550 ↛ 551line 550 didn't jump to line 551 because the condition on line 550 was never true
551 may_be_duplicates = True
552 return text
554 def flattened_tree(lines: list[WikiNode | str]) -> Iterator[WikiNode | str]:
555 assert isinstance(lines, list)
556 for line in lines:
557 yield from flattened_tree1(line)
559 def flattened_tree1(node: WikiNode | str) -> Iterator[WikiNode | str]:
560 assert isinstance(node, (WikiNode, str))
561 if isinstance(node, str):
562 yield node
563 return
564 elif node.kind == NodeKind.LIST:
565 for item in node.children:
566 yield from flattened_tree1(item)
567 elif node.kind == NodeKind.LIST_ITEM:
568 new_children = []
569 sublist = None
570 for child in node.children:
571 if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
572 sublist = child
573 else:
574 new_children.append(child)
575 node.children = new_children
576 node.sarg = "*"
577 yield node
578 if sublist:
579 yield from flattened_tree1(sublist)
580 else:
581 yield node
583 # XXX Do not use flattened_tree more than once here, for example for
584 # debug printing... The underlying data is changed, and the separated
585 # sublists disappear.
587 # Kludge for templates that generate several lines, but haven't
588 # been caught by earlier kludges...
589 def split_cleaned_node_on_newlines(
590 contents: list[WikiNode | str],
591 ) -> Iterator[str]:
592 for litem in flattened_tree(contents):
593 ipa_text = clean_node(
594 wxr,
595 data,
596 litem,
597 template_fn=parse_pronunciation_template_fn,
598 post_template_fn=parse_pron_post_template_fn,
599 )
600 for line in ipa_text.splitlines():
601 yield line
603 # have_pronunciations = False
604 active_pos: list[str] | None = None
606 for line in split_cleaned_node_on_newlines(contents):
607 prefix: str | None = None
608 earlier_base_data: SoundData | None = None
609 line_pos: list[str] | None = None
610 current_group_sounds: list[SoundData] = []
611 line_has_sound = False
612 if not line: 612 ↛ 613line 612 didn't jump to line 613 because the condition on line 612 was never true
613 continue
615 split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line)
616 for i, text in enumerate(split_templates):
617 if not text:
618 continue
619 # clean up starts at the start of the line
620 text = re.sub(r"^\**\s*", "", text).strip()
621 if i == 0:
622 # At the start of a line, check for stuff like "Noun:"
623 # for active_pos; active_pos is a temporary data field
624 # given to each saved SoundData entry which is later
625 # used to sort the entries into their respective PoSes.
626 if pos_prefix := extract_pos_prefix(text):
627 text = pos_prefix.text
628 line_pos = pos_prefix.pos_values
629 if pos_prefix.is_persistent:
630 active_pos = pos_prefix.pos_values
631 if not text:
632 continue
634 m = re.search(r"__PRON_POS_MARKER_(\d+)__", text)
635 while m:
636 if current_group_sounds and re.search(
637 r"[,;]", text[: m.start()]
638 ):
639 current_group_sounds = []
640 pos_values = pron_pos_markers[int(m.group(1))]
641 if current_group_sounds:
642 for sound in current_group_sounds:
643 set_sound_pos(sound, pos_values)
644 line_pos = pos_values
645 text = text[: m.start()] + text[m.end() :]
646 m = re.search(r"__PRON_POS_MARKER_(\d+)__", text)
647 text = text.strip()
648 if not text:
649 continue
651 if i % 2 == 1:
652 # re.split (with capture groups) splits the lines so that
653 # every even entry is a captured splitter; odd lines are either
654 # empty strings or stuff around the splitters.
655 base_pron_data, first_prons = pron_templates[int(text)]
656 if base_pron_data:
657 earlier_base_data = base_pron_data
658 # print(f"Set {earlier_base_data=}")
659 elif earlier_base_data is not None:
660 # merge data from an earlier iteration of this loop
661 for pr in first_prons:
662 if "note" in pr and "note" in earlier_base_data: 662 ↛ 663line 662 didn't jump to line 663 because the condition on line 662 was never true
663 pr["note"] += ";" + earlier_base_data.get(
664 "note", ""
665 )
666 elif "note" in earlier_base_data: 666 ↛ 667line 666 didn't jump to line 667 because the condition on line 666 was never true
667 pr["note"] = earlier_base_data["note"]
668 if "topics" in earlier_base_data: 668 ↛ 669line 668 didn't jump to line 669 because the condition on line 668 was never true
669 data_extend(
670 pr, "topics", earlier_base_data["topics"]
671 )
672 if "tags" in pr and "tags" in earlier_base_data: 672 ↛ 673line 672 didn't jump to line 673 because the condition on line 672 was never true
673 pr["tags"].extend(earlier_base_data["tags"])
674 elif "tags" in earlier_base_data: 674 ↛ 661line 674 didn't jump to line 661 because the condition on line 674 was always true
675 pr["tags"] = sorted(set(earlier_base_data["tags"]))
676 for pr in first_prons:
677 set_sound_pos(pr, line_pos or active_pos)
678 if pr not in data.get("sounds", ()): 678 ↛ 680line 678 didn't jump to line 680 because the condition on line 678 was always true
679 data_append(data, "sounds", pr)
680 current_group_sounds.append(pr)
681 line_has_sound = True
682 # This bit is handled
683 continue
685 if "IPA" in text:
686 field: Literal[
687 "audio",
688 "audio-ipa",
689 "enpr",
690 "form",
691 "hangeul",
692 "homophone",
693 "ipa",
694 "mp3_url",
695 "note",
696 "ogg_url",
697 "other",
698 "rhymes",
699 "tags",
700 "text",
701 "topics",
702 "zh-pron",
703 ] = "ipa"
704 else:
705 # This is used for Rhymes, Homophones, etc
706 field = "other"
708 # Check if it contains Japanese "Tokyo" pronunciation with
709 # special syntax
710 m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text)
711 if m: 711 ↛ 712line 711 didn't jump to line 712 because the condition on line 711 was never true
712 pron: SoundData = {field: m.group(1)} # type: ignore[misc]
713 set_sound_pos(pron, line_pos or active_pos)
714 data_append(data, "sounds", pron)
715 current_group_sounds.append(pron)
716 line_has_sound = True
717 # have_pronunciations = True
718 continue
720 # Check if it contains Rhymes
721 m = re.match(r"\s*Rhymes?: (.*)", text)
722 if m:
723 for ending in split_at_comma_semi(m.group(1)):
724 ending = ending.strip()
725 if ending: 725 ↛ 723line 725 didn't jump to line 723 because the condition on line 725 was always true
726 pron = {"rhymes": ending}
727 set_sound_pos(pron, line_pos or active_pos)
728 data_append(data, "sounds", pron)
729 current_group_sounds.append(pron)
730 line_has_sound = True
731 # have_pronunciations = True
732 continue
734 # Check if it contains homophones
735 m = re.search(r"(?m)\bHomophones?: (.*)", text)
736 if m:
737 for w in split_at_comma_semi(m.group(1)):
738 w = w.strip()
739 if w: 739 ↛ 737line 739 didn't jump to line 737 because the condition on line 739 was always true
740 pron = {"homophone": w}
741 set_sound_pos(pron, line_pos or active_pos)
742 data_append(data, "sounds", pron)
743 current_group_sounds.append(pron)
744 line_has_sound = True
745 # have_pronunciations = True
746 continue
748 # Check if it contains Phonetic hangeul
749 m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text)
750 if m: 750 ↛ 751line 750 didn't jump to line 751 because the condition on line 750 was never true
751 seen = set()
752 for w in m.group(1).split("/"):
753 w = w.strip()
754 if w and w not in seen:
755 seen.add(w)
756 pron = {"hangeul": w}
757 set_sound_pos(pron, line_pos or active_pos)
758 data_append(data, "sounds", pron)
759 current_group_sounds.append(pron)
760 line_has_sound = True
761 # have_pronunciations = True
763 # This regex-based hyphenation detection left as backup
764 m = re.search(r"\b(Syllabification|Hyphenation): *([^\n.]*)", text)
765 if m:
766 data_append(data, "hyphenation", m.group(2))
767 commaseparated = m.group(2).split(",")
768 if len(commaseparated) > 1: 768 ↛ 779line 768 didn't jump to line 779 because the condition on line 768 was always true
769 for h in commaseparated:
770 # That second characters looks like a dash but it's
771 # actually unicode decimal code 8231, hyphenation dash
772 # Add more delimiters here if needed.
773 parts = re.split(r"-|‧", h.strip())
774 data_append(
775 data, "hyphenations", Hyphenation(parts=parts)
776 )
777 ...
778 else:
779 data_append(
780 data,
781 "hyphenations",
782 Hyphenation(parts=m.group(2).split(sep="-")),
783 )
784 # have_pronunciations = True
786 # See if it contains a word prefix restricting which forms the
787 # pronunciation applies to (see amica/Latin) and/or parenthesized
788 # tags.
789 m = re.match(
790 r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text
791 )
792 if m:
793 prefix = m.group(2) or ""
794 tagstext = m.group(3)
795 text = text[m.end() :]
796 else:
797 m = re.match(r"^[*#\s]*([-\w]+):\s+", text)
798 if m:
799 prefix = m.group(1)
800 tagstext = ""
801 text = text[m.end() :]
802 else:
803 # Spanish has tags before pronunciations, eg. aceite/Spanish
804 m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text)
805 if m: 805 ↛ 806line 805 didn't jump to line 806 because the condition on line 805 was never true
806 tagstext = m.group(1)
807 text = m.group(2)
808 else:
809 # No prefix. In this case, we inherit prefix
810 # from previous entry. This particularly
811 # applies for nested Audio files.
812 tagstext = ""
813 if tagstext:
814 earlier_base_data = {}
815 parse_pronunciation_tags_with_pos(
816 wxr, tagstext, earlier_base_data
817 )
819 # Find romanizations from the pronunciation section (routinely
820 # produced for Korean by {{ko-IPA}})
821 for m in re.finditer(pron_romanization_re, text): 821 ↛ 822line 821 didn't jump to line 822 because the loop on line 821 never started
822 prefix = m.group(1)
823 w = m.group(2).strip()
824 tag = pron_romanizations[prefix]
825 form = {"form": w, "tags": tag.split()}
826 data_append(data, "forms", form)
828 # Find IPA pronunciations
829 for m in re.finditer(
830 r"(?m)/[^][\n/,]+?/" r"|" r"\[[^]\n0-9,/][^],/]*?\]", text
831 ):
832 v = m.group(0)
833 # The regexp above can match file links. Skip them.
834 if v.startswith("[[File:"): 834 ↛ 835line 834 didn't jump to line 835 because the condition on line 834 was never true
835 continue
836 if v == "/wiki.local/": 836 ↛ 837line 836 didn't jump to line 837 because the condition on line 836 was never true
837 continue
838 if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text: 838 ↛ 839line 838 didn't jump to line 839 because the condition on line 838 was never true
839 m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text)
840 assert m
841 idx = int(m.group(1))
842 if idx >= len(audios):
843 continue
844 if not audios[idx].get("audio-ipa"):
845 audios[idx]["audio-ipa"] = v
846 if prefix:
847 audios[idx]["form"] = prefix
848 else:
849 if earlier_base_data:
850 pron = deepcopy(earlier_base_data)
851 pron[field] = v
852 else:
853 pron = {field: v} # type: ignore[misc]
854 if prefix:
855 pron["form"] = prefix
856 if "pos" not in pron: 856 ↛ 858line 856 didn't jump to line 858 because the condition on line 856 was always true
857 set_sound_pos(pron, line_pos or active_pos)
858 if may_be_duplicates is True: 858 ↛ 859line 858 didn't jump to line 859 because the condition on line 858 was never true
859 ok = True
860 for comp_sound in data.get("sounds", []):
861 # Python has dict comparison since 3.8
862 if pron == comp_sound:
863 ok = False
864 break
865 if ok:
866 data_append(data, "sounds", pron)
867 else:
868 data_append(data, "sounds", pron)
869 current_group_sounds.append(pron)
870 line_has_sound = True
871 # have_pronunciations = True
872 if current_group_sounds and re.search(r"[,;]", text):
873 current_group_sounds = []
874 if line_pos and not line_has_sound:
875 active_pos = line_pos
877 # XXX what about {{hyphenation|...}}, {{hyph|...}}
878 # and those used to be stored under "hyphenation"
880 # Add data that was collected in template_fn
881 for audio in audios:
882 if "audio" in audio: 882 ↛ 938line 882 didn't jump to line 938 because the condition on line 882 was always true
883 # Compute audio file URLs
884 fn = audio["audio"]
885 # Strip certain characters, e.g., left-to-right mark
886 fn = re.sub(r"[\u200f\u200e]", "", fn)
887 fn = fn.strip()
888 fn = urllib.parse.unquote(fn)
889 # First character is usually uppercased
890 if re.match(r"^[a-z][a-z]+", fn):
891 fn = fn[0].upper() + fn[1:]
892 if fn in wxr.config.redirects: 892 ↛ 893line 892 didn't jump to line 893 because the condition on line 892 was never true
893 fn = wxr.config.redirects[fn]
894 # File extension is lowercased
895 # XXX some words seem to need this, some don't seem to
896 # have this??? what is the exact rule?
897 # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn)
898 # Spaces are converted to underscores
899 fn = re.sub(r"\s+", "_", fn)
900 # Compute hash digest part
901 h = hashlib.md5()
902 hname = fn.encode("utf-8")
903 h.update(hname)
904 digest = h.hexdigest()
905 # Quote filename for URL
906 qfn = urllib.parse.quote(fn)
907 # For safety when writing files
908 qfn = qfn.replace("/", "__slash__")
909 if re.search(r"(?i)\.(ogg|oga)$", fn):
910 ogg = (
911 "https://upload.wikimedia.org/wikipedia/"
912 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
913 )
914 else:
915 ogg = (
916 "https://upload.wikimedia.org/wikipedia/"
917 "commons/transcoded/"
918 "{}/{}/{}/{}.ogg".format(
919 digest[:1], digest[:2], qfn, qfn
920 )
921 )
922 if re.search(r"(?i)\.(mp3)$", fn): 922 ↛ 923line 922 didn't jump to line 923 because the condition on line 922 was never true
923 mp3 = (
924 "https://upload.wikimedia.org/wikipedia/"
925 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
926 )
927 else:
928 mp3 = (
929 "https://upload.wikimedia.org/wikipedia/"
930 "commons/transcoded/"
931 "{}/{}/{}/{}.mp3".format(
932 digest[:1], digest[:2], qfn, qfn
933 )
934 )
935 audio["ogg_url"] = ogg
936 audio["mp3_url"] = mp3
937 set_sound_pos(audio, line_pos or active_pos)
938 if audio not in data.get("sounds", ()):
939 data_append(data, "sounds", audio)
941 # if audios:
942 # have_pronunciations = True
943 audios = []
945 data_extend(data, "hyphenations", hyphenations)
946 hyphenations = []
948 ## I have commented out the otherwise unused have_pronunciation
949 ## toggles; uncomment them to use this debug print
950 # if not have_pronunciations and not have_panel_templates:
951 # wxr.wtp.debug("no pronunciations found from pronunciation section",
952 # sortid="pronunciations/533")
def extract_th_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    """Extract sound entries from an expanded ``th-pron`` template.

    The template is expanded and every ``<tr>`` of every ``<table>`` is
    walked.  ``<th>`` cells either select the field the row's ``<td>``
    cells are stored under ("ipa", "homophone", "audio") or contribute
    raw tags that stay active for as many rows as their ``rowspan``.
    The collected sounds are appended to ``word_entry["sounds"]``.
    """

    # https://en.wiktionary.org/wiki/Template:th-pron
    @dataclass
    class TableHeader:
        raw_tags: list[str]
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"

            # Keep only headers that still span into this row.
            carried_headers = []
            for old_header in row_headers:
                if old_header.rowspan <= 1:
                    continue
                old_header.rowspan -= 1
                carried_headers.append(old_header)
            row_headers = carried_headers

            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(standard) IPA"):
                    field = "ipa"
                    continue
                if header_str.startswith("Homophones"):
                    field = "homophone"
                    continue
                if header_str == "Audio":
                    field = "audio"
                    continue
                if header_str == "":
                    continue
                # Any other non-empty header is a tag header.
                rowspan_str = th_tag.attrs.get("rowspan", "1")
                if re.fullmatch(r"\d+", rowspan_str):
                    rowspan = int(rowspan_str)
                else:
                    rowspan = 1
                header_tags = [
                    piece.strip()
                    for line in header_str.splitlines()
                    for piece in line.strip("{}\n ").split(";")
                    if piece.strip() != ""
                ]
                row_headers.append(TableHeader(header_tags, rowspan))

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sounds.append(create_audio_url_dict(filename))
                    continue
                if field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append({"homophone": word})
                    continue

                cell_tags = []
                for html_node in td_tag.find_child_recursively(NodeKind.HTML):
                    if html_node.tag == "small":
                        small_text = clean_node(wxr, None, html_node)
                        if small_text.startswith("[") and small_text.endswith(
                            "]"
                        ):
                            # "[tag, tag]" qualifies the spans that follow.
                            for piece in small_text.strip("[]").split(","):
                                piece = piece.strip()
                                if piece != "":
                                    cell_tags.append(piece)
                        elif sounds:
                            # Any other <small> text is stored as the
                            # "roman" value of the latest sound.
                            sounds[-1]["roman"] = small_text
                        continue
                    if html_node.tag != "span":
                        continue

                    span_text = clean_node(wxr, None, html_node)
                    span_lang = html_node.attrs.get("lang", "")
                    span_class = html_node.attrs.get("class", "")
                    if span_text == "" or not (
                        span_lang == "th" or span_class in ["IPA", "tr"]
                    ):
                        continue

                    sound = {}
                    for cell_tag in cell_tags:
                        key = "tags" if cell_tag in valid_tags else "raw_tags"
                        data_append(sound, key, cell_tag)
                    for row_header in row_headers:
                        for header_tag in row_header.raw_tags:
                            lowered = header_tag.lower()
                            if lowered in valid_tags:
                                data_append(sound, "tags", lowered)
                            else:
                                data_append(sound, "raw_tags", header_tag)
                    # A "romanization" tag switches the field for the rest
                    # of this row.
                    if "romanization" in sound.get("tags", []):
                        field = "roman"
                    sound[field] = span_text
                    sounds.append(sound)

    # NOTE(review): presumably lets clean_node record page-level data
    # (e.g. categories) on word_entry -- confirm against clean_node.
    clean_node(wxr, word_entry, expanded_node)
    data_extend(word_entry, "sounds", sounds)
def extract_zh_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    """Extract sound entries from an expanded ``zh-pron`` template.

    Every list found in the expansion is processed item by item, except
    lists that ``extract_zh_pron_list_item`` has already recorded as
    visited while recursing into nested lists.  The resulting sounds are
    appended to ``word_entry["sounds"]``.
    """
    # https://en.wiktionary.org/wiki/Template:zh-pron
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    visited_lists = set()
    collected = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node in visited_lists:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, list_item, [], visited_lists)
            )
    # NOTE(review): presumably lets clean_node record page-level data
    # (e.g. categories) on word_entry -- confirm against clean_node.
    clean_node(wxr, word_entry, expanded_node)
    data_extend(word_entry, "sounds", collected)
def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[SoundData]:
    """Collect sounds from one ``zh-pron`` list item and its nested lists.

    ``raw_tags`` are the tags inherited from enclosing list items; they are
    copied, never mutated.  Link text and the first ``<small>`` element add
    to the tags used for this item and its children; later ``<small>``
    elements qualify only the most recently found sound.  Nested lists are
    added to ``seen_lists`` and processed recursively.
    """
    active_tags = list(raw_tags)
    found = []
    small_seen = False
    for child in list_item.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            target = clean_node(wxr, None, child.largs)
            label = clean_node(wxr, None, child)
            if target.startswith("File:"):
                audio = create_audio_url_dict(target.removeprefix("File:"))
                audio["raw_tags"] = list(active_tags)
                translate_zh_pron_raw_tags(audio)
                found.append(audio)
            elif label != "":
                active_tags.append(label)
            continue

        if isinstance(child, HTMLNode):
            if child.tag == "small":
                if not small_seen:
                    # <sup> children are left out of the tag text.
                    without_sup = [
                        n
                        for n in child.children
                        if not (isinstance(n, HTMLNode) and n.tag == "sup")
                    ]
                    tag_text = clean_node(wxr, None, without_sup)
                    active_tags.extend(split_zh_pron_raw_tag(tag_text))
                elif found:
                    data_extend(
                        found[-1],
                        "raw_tags",
                        split_zh_pron_raw_tag(clean_node(wxr, None, child)),
                    )
                    translate_zh_pron_raw_tags(found[-1])
                small_seen = True
            elif child.tag == "span":
                found.extend(extract_zh_pron_span(wxr, child, active_tags))
            elif (
                child.tag == "table"
                and active_tags
                and active_tags[-1] == "Homophones"
            ):
                found.extend(
                    extract_zh_pron_homophone_table(wxr, child, active_tags)
                )
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            seen_lists.add(child)
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                found.extend(
                    extract_zh_pron_list_item(
                        wxr, nested_item, active_tags, seen_lists
                    )
                )

    return found
def extract_zh_pron_homophone_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    """Build homophone sound entries from the spans of a homophone table.

    A ``<span>`` directly under a ``<td>`` counts when it has a non-empty
    ``lang`` attribute, a class of "Hant", "Hans" or "Hani", and text other
    than "" or "/".  Each entry gets a copy of ``raw_tags`` plus a script
    tag for the "Hant" and "Hans" classes.
    """
    script_tags = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    homophones = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            text = clean_node(wxr, None, span_tag)
            if (
                text in ["", "/"]
                or span_lang == ""
                or span_class not in ["Hant", "Hans", "Hani"]
            ):
                continue
            entry = {"homophone": text, "raw_tags": list(raw_tags)}
            if span_class in script_tags:
                data_append(entry, "tags", script_tags[span_class])
            translate_zh_pron_raw_tags(entry)
            homophones.append(entry)

    return homophones
def translate_zh_pron_raw_tags(sound: SoundData):
    """Move recognised entries of ``sound["raw_tags"]`` into ``sound["tags"]``.

    Each raw tag is handled in this order:

    * if it is a key of ``ZH_PRON_TAGS``, the mapped tag (a string) or tags
      (a list) are added to ``sound["tags"]``;
    * otherwise, if it is in ``valid_tags``, it is added as is;
    * otherwise it stays a raw tag.

    Tags already present in ``sound["tags"]`` are not added again.
    Afterwards ``sound["raw_tags"]`` holds only the untranslated tags,
    without duplicates, or is removed when none remain.  The dict is
    modified in place.
    """
    from .zh_pron_tags import ZH_PRON_TAGS

    untranslated = []
    for raw_tag in sound.get("raw_tags", []):
        if raw_tag in ZH_PRON_TAGS:
            tr_tag = ZH_PRON_TAGS[raw_tag]
            if isinstance(tr_tag, str):
                if tr_tag not in sound.get("tags", []):
                    data_append(sound, "tags", tr_tag)
            elif isinstance(tr_tag, list):
                # Compare element by element: testing the whole list for
                # membership in a list of strings is always false, so it
                # never filtered anything.
                new_tags = []
                for tag in tr_tag:
                    if tag not in sound.get("tags", []) and tag not in new_tags:
                        new_tags.append(tag)
                if new_tags:
                    data_extend(sound, "tags", new_tags)
        elif raw_tag in valid_tags:
            if raw_tag not in sound.get("tags", []):
                data_append(sound, "tags", raw_tag)
        elif raw_tag not in untranslated:
            untranslated.append(raw_tag)

    if len(untranslated) > 0:
        sound["raw_tags"] = untranslated
    elif "raw_tags" in sound:
        del sound["raw_tags"]
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a qualifier string into individual raw tags.

    Text without parentheses is split on ",", ":", ";" and " and "; each
    piece is stripped, a leading "incl. " is removed, and empty pieces are
    dropped.  With parentheses, each innermost "(...)" group is split
    recursively and removed from the text; the tags of the remaining text
    come first, followed by the tags found inside the groups.  Text that
    contains "(" but no non-empty balanced group is returned unchanged as a
    single tag.
    """
    if "(" not in raw_tag_text:
        pieces = (
            piece.strip().removeprefix("incl. ").strip()
            for piece in re.split(r",|:|;| and ", raw_tag_text)
        )
        return [piece for piece in pieces if piece != ""]

    inner_tags = []
    outside_parts = []
    cursor = 0
    for group in re.finditer(r"\([^()]+\)", raw_tag_text):
        # Drop the surrounding parentheses before splitting the content.
        inner_tags.extend(split_zh_pron_raw_tag(group.group(0)[1:-1]))
        outside_parts.append(raw_tag_text[cursor : group.start()])
        cursor = group.end()
    outside_parts.append(raw_tag_text[cursor:])
    remainder = "".join(outside_parts)

    if remainder == raw_tag_text:
        # Nothing was removed, so recursing would never terminate.
        return inner_tags + [remainder]
    return split_zh_pron_raw_tag(remainder) + inner_tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    """Extract sound entries from one pronunciation ``<span>``.

    Children are sorted into: ``<small>`` qualifier tags, a romanization
    (a ``<span>`` whose ``lang`` contains "-Latn"), a "[Phonetic:" section
    (everything after that marker), and the pronunciation text itself.
    The pronunciation text is split with ``split_zh_pron``; each part
    becomes a sound keyed "ipa" when the span's class contains "IPA" and
    "zh_pron" otherwise.  The ``<small>`` tags are added to the last such
    sound only.  Every sound starts with a copy of ``raw_tags``, and all
    sounds are passed through ``translate_zh_pron_raw_tags``.
    """
    qualifier_tags = []
    pron_nodes = []
    roman = ""
    phonetic_pron = ""
    for index, child in enumerate(span_tag.children):
        is_html = isinstance(child, HTMLNode)
        if is_html and child.tag == "small":
            qualifier_tags = split_zh_pron_raw_tag(clean_node(wxr, None, child))
        elif (
            is_html
            and child.tag == "span"
            and "-Latn" in child.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, child).strip("() ")
        elif isinstance(child, str) and child.strip() == "[Phonetic:":
            # Everything after the marker is the phonetic pronunciation.
            rest = span_tag.children[index + 1 :]
            phonetic_pron = clean_node(wxr, None, rest).strip("] ")
            break
        else:
            pron_nodes.append(child)

    pron_key = "ipa" if "IPA" in span_tag.attrs.get("class", "") else "zh_pron"
    results = []
    for part in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        part = part.strip("[]: ")
        if part == "":
            continue
        entry = {pron_key: part, "raw_tags": list(raw_tags)}
        if roman != "":
            entry["roman"] = roman
        results.append(entry)

    if results:
        data_extend(results[-1], "raw_tags", qualifier_tags)

    if phonetic_pron != "":
        phonetic_entry = {
            "zh_pron": phonetic_pron,
            "raw_tags": list(raw_tags) + ["Phonetic"],
        }
        if roman != "":
            phonetic_entry["roman"] = roman
        results.append(phonetic_entry)

    for entry in results:
        translate_zh_pron_raw_tags(entry)
    return results
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    The separators are ",", ";" and "→", plus "/" unless the whole string
    starts with "/" (where "/" is part of the value, e.g. ``/ipa/``).
    Separators inside parentheses are kept as part of the segment.

    Every returned segment is stripped of surrounding whitespace, and empty
    segments (leading, trailing or consecutive separators) are dropped, so
    a separator never ends up glued onto the following segment.
    """
    separators = {",", ";", "→"}
    if not zh_pron.startswith("/"):
        separators.add("/")

    depth = 0
    pron_list = []
    current = []
    for c in zh_pron:
        if depth == 0 and c in separators:
            segment = "".join(current).strip()
            if segment != "":
                pron_list.append(segment)
            current = []
            continue
        if c == "(":
            depth += 1
        elif c == ")":
            depth -= 1
        current.append(c)

    segment = "".join(current).strip()
    if segment != "":
        pron_list.append(segment)
    return pron_list