Coverage for src/wiktextract/extractor/en/pronunciation.py: 83%
850 statements
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
1import hashlib
2import re
3import urllib
4from copy import deepcopy
5from dataclasses import dataclass
6from typing import Iterator, Literal, NamedTuple
8from wikitextprocessor import (
9 HTMLNode,
10 LevelNode,
11 NodeKind,
12 TemplateNode,
13 WikiNode,
14)
16from ...clean import clean_value
17from ...datautils import data_append, data_extend, split_at_comma_semi
18from ...page import LEVEL_KINDS, clean_node, is_panel_template
19from ...tags import valid_tags
20from ...wxr_context import WiktextractContext
21from ..share import create_audio_url_dict
22from .form_descriptions import (
23 classify_desc,
24 decode_tags,
25 parse_pronunciation_tags,
26)
27from .parts_of_speech import part_of_speech_map
28from .type_utils import Hyphenation, SoundData, TemplateArgs, WordData
# Tuple of part-of-speech values attached to a piece of pronunciation data.
PronunciationPoses = tuple[str, ...]
# Line prefixes that introduce a romanization in the pronunciation section,
# mapped to the tags recorded for that romanization.
pron_romanizations = {
    " Revised Romanization ": "romanization revised",
    " Revised Romanization (translit.) ": (
        "romanization revised transliteration"
    ),
    " McCune-Reischauer ": "McCune-Reischauer romanization",
    " McCune–Reischauer ": "McCune-Reischauer romanization",
    " Yale Romanization ": "Yale romanization",
}
# Matches one of the prefixes above at the start of a line (group 1) followed
# by the rest of that line (group 2).  Longer prefixes are tried first so
# that a prefix is never shadowed by a shorter one it starts with.
pron_romanization_re = re.compile(
    "(?m)^("
    + "|".join(
        map(re.escape, sorted(pron_romanizations, key=len, reverse=True))
    )
    + ")([^\n]+)"
)
# Pattern for the cleaned expansion of an IPA/enPR template.  Groups:
#   1: optional leading "(qualifier) " including parentheses and space
#   2: the text inside that leading qualifier
#   3: the label, "IPA⁽ᵏᵉʸ⁾" or "enPR"
#   4: everything after "label: " (body plus any trailing qualifier)
#   5: the body without the trailing qualifier (matched non-greedily)
#   6: optional trailing " (qualifier)" including parentheses
#   7: the text inside that trailing qualifier
IPA_EXTRACT = r"^(\((.+)\) )?(IPA⁽ᵏᵉʸ⁾|enPR): ((.+?)( \(([^(]+)\))?\s*)$"
IPA_EXTRACT_RE = re.compile(IPA_EXTRACT)
class PronunciationPosMatch(NamedTuple):
    """Result of looking for part-of-speech labels in qualifier text."""

    # POS values found in the text; empty when no POS label was recognized.
    pos_values: PronunciationPoses
    # Text left over after the POS labels have been taken out.
    residual: str
class PronunciationPosPrefix(NamedTuple):
    """A part-of-speech label found at the start of a pronunciation line."""

    # POS values named by the prefix.
    pos_values: PronunciationPoses
    # Remainder of the line after the prefix has been removed.
    text: str
    # True for bare ("Noun") and colon ("Noun: ...") prefixes, False for
    # parenthesized ones ("(noun) ...").
    is_persistent: bool
class FlattenedListNode(NamedTuple):
    """A node from a flattened list tree paired with its list depth."""

    # The node itself; either a WikiNode or a plain string.
    node: WikiNode | str
    # List nesting depth associated with the node.
    list_depth: int
# Names of qualifier/label-style templates whose arguments are inspected for
# part-of-speech labels.
PRON_POS_TEMPLATE_NAMES = {
    "q",
    "qualifier",
    "qual",
    "i",
    "sense",
    "a",
    "accent",
    "lb",
    "lbl",
    "label",
}
# Maps every label in part_of_speech_map to the "pos" value of its entry.
PRON_POS_BY_LABEL = {
    label: pos_data["pos"] for label, pos_data in part_of_speech_map.items()
}
# Matches text that ends in a known POS label, optionally preceded by other
# text and whitespace.  Named groups: "residual" (the optional leading text)
# and "label" (the POS label).  Labels are tried longest-first.
PRON_POS_LABEL_RE = re.compile(
    r"^(?:(?P<residual>.+?)\s+)?(?P<label>"
    + "|".join(
        re.escape(label)
        for label in sorted(PRON_POS_BY_LABEL, key=len, reverse=True)
    )
    + r")$"
)
def normalize_pronunciation_pos_label(label: str) -> str:
    """Return *label* in the canonical form used for POS-label lookups.

    The text is lower-cased, runs of whitespace are collapsed to a single
    space, a trailing parenthesized explanation is dropped, surrounding
    whitespace, parentheses and colons are stripped, and a trailing
    "sense"/"senses" word is removed.
    """
    collapsed = re.sub(r"\s+", " ", label.strip().lower())
    # An explanatory suffix such as "noun (barren areas)" must not take
    # part in the comparison against part-of-speech names.
    without_suffix = re.sub(r"\s*\([^)]*\)\s*$", "", collapsed).strip()
    trimmed = without_suffix.strip(" \t\n\r():")
    return re.sub(r"\s+senses?$", "", trimmed).strip()
def split_pronunciation_pos_text(text: str) -> PronunciationPosMatch:
    """Separate part-of-speech labels from the rest of qualifier text.

    Returns the distinct POS values in order of appearance together with
    the leftover pieces joined by ", ".  When no POS label is found, the
    POS tuple is empty and the residual is the stripped original text.
    """
    found_pos: list[str] = []
    leftovers: list[str] = []
    for chunk in split_pronunciation_pos_parts(text):
        pos, rest = pronunciation_pos_from_part(chunk)
        if pos and pos not in found_pos:
            found_pos.append(pos)
        # A POS-bearing chunk may still carry ordinary pronunciation tags
        # in front of the label, e.g. "attributive adjective"; chunks
        # without a POS are kept whole.
        if rest:
            leftovers.append(rest)
    if found_pos:
        return PronunciationPosMatch(tuple(found_pos), ", ".join(leftovers))
    # Nothing was a POS label: hand the original text back untouched for
    # normal pronunciation tag/note parsing.
    return PronunciationPosMatch((), text.strip())
def pronunciation_pos_from_part(part: str) -> tuple[str | None, str]:
    """Try to read a POS label out of a single qualifier chunk.

    Returns ``(pos, residual)``.  ``pos`` is None when the chunk holds no
    usable POS label, in which case ``residual`` is the chunk unchanged.
    """
    normalized = normalize_pronunciation_pos_label(part)
    try:
        # The whole chunk is a POS label.
        return PRON_POS_BY_LABEL[normalized], ""
    except KeyError:
        pass
    # Otherwise look for tag text followed by a POS label:
    #   "attributive adjective" -> ("adj", "attributive")
    #   "attributive proper noun" -> ("name", "attributive")
    # The pattern tries longer labels first, so multi-word labels such as
    # "proper noun" win over their suffixes.
    match = PRON_POS_LABEL_RE.match(normalized)
    if match is None:
        return None, part
    residual = (match["residual"] or "").rstrip(" ,;:")
    # Leading text is only accepted when it classifies as tags.
    if residual and classify_desc(residual) != "tags":
        return None, part
    return PRON_POS_BY_LABEL[match["label"]], residual
def split_pronunciation_pos_parts(text: str) -> list[str]:
    """Break qualifier text into chunks that may each hold a POS label.

    Commas and semicolons always separate chunks.  A chunk is additionally
    split on "and"/"or" only when at least one of the resulting pieces
    yields a POS label, so that prose notes stay intact.
    """
    chunks: list[str] = []
    for raw in re.split(r"[,;]", text):
        piece = raw.strip()
        if not piece:
            continue
        pieces = re.split(r"\s+(?:and|or)\s+", piece)
        splits_on_pos = len(pieces) > 1 and any(
            pronunciation_pos_from_part(candidate)[0] for candidate in pieces
        )
        if splits_on_pos:
            chunks.extend(pieces)
        else:
            chunks.append(piece)
    return chunks
def set_sound_pos(
    sound: SoundData, pos_values: PronunciationPoses | None
) -> PronunciationPoses | None:
    """Store *pos_values* on *sound* and report the POS the sound carries.

    A non-empty *pos_values* is written to ``sound["pos"]`` and returned.
    Otherwise *sound* is left alone and its existing "pos" value is
    returned, or None when it has none.
    """
    if not pos_values:
        return sound.get("pos")  # type: ignore[typeddict-item]
    sound["pos"] = pos_values  # type: ignore[typeddict-unknown-key]
    return pos_values
def common_sound_pos(
    pos_candidates: set[PronunciationPoses],
) -> PronunciationPoses | None:
    """Return the single POS tuple in *pos_candidates*, if there is one.

    None is returned when the set is empty or holds more than one
    candidate.
    """
    if len(pos_candidates) == 1:
        (only_candidate,) = pos_candidates
        return only_candidate
    return None
def merge_pronunciation_tag_data(
    sound: SoundData, tag_data: SoundData
) -> None:
    """Fold the tags, topics and note of *tag_data* into *sound*.

    Tags and topics already present on *sound* are not added again.  A
    note is stored as-is when *sound* has none; otherwise it is appended
    after "; " unless it already equals one of the ";"-separated parts of
    the existing note.
    """
    for tag in tag_data.get("tags", []):
        if tag not in sound.get("tags", []):
            data_append(sound, "tags", tag)
    for topic in tag_data.get("topics", []):
        if topic not in sound.get("topics", []):
            data_append(sound, "topics", topic)

    new_note = tag_data.get("note")
    if not new_note:
        return
    current_note = sound.get("note")
    if not current_note:
        sound["note"] = new_note
        return
    known_notes = [piece.strip() for piece in current_note.split(";")]
    if new_note not in known_notes:
        sound["note"] = f"{current_note}; {new_note}"
def parse_pronunciation_tags_with_pos(
    wxr: WiktextractContext, text: str, sound: SoundData
) -> PronunciationPoses:
    """Parse qualifier *text* into *sound*, handling POS labels separately.

    POS labels found in *text* are stored on *sound* via set_sound_pos;
    whatever text remains is parsed as ordinary pronunciation tags and
    merged into *sound*.  Returns the POS values found (possibly empty).
    """
    pos_values, residual = split_pronunciation_pos_text(text)
    set_sound_pos(sound, pos_values)
    if not residual:
        return pos_values
    # Parse into a scratch dict first so the result can be merged without
    # duplicating tags, topics or notes already on the sound.
    parsed: SoundData = {}
    parse_pronunciation_tags(wxr, residual, parsed)
    merge_pronunciation_tag_data(sound, parsed)
    return pos_values
def extract_pos_prefix(text: str) -> PronunciationPosPrefix | None:
    """Detect a part-of-speech label at the start of a pronunciation line.

    Three shapes are recognized, in this order:

    * the whole text is a POS label and is not wrapped in parentheses
      ("Noun") -- persistent, no remaining text;
    * "label: rest" -- persistent;
    * "(label) rest" -- not persistent.

    In every case the candidate must consist of POS labels only, with no
    residual text.  Returns None when no shape applies.
    """

    def pure_pos(candidate: str) -> PronunciationPoses | None:
        # POS values of the candidate, but only if nothing else is in it.
        found = split_pronunciation_pos_text(candidate)
        if found.pos_values and not found.residual:
            return found.pos_values
        return None

    stripped = text.strip()
    is_parenthesized = stripped.startswith("(") and stripped.endswith(")")
    if not is_parenthesized and (pos := pure_pos(text)):
        return PronunciationPosPrefix(pos, "", True)

    if colon_match := re.match(r"\s*([^:()]+?)\s*:\s*(.*)$", text):
        if pos := pure_pos(colon_match.group(1)):
            return PronunciationPosPrefix(
                pos, colon_match.group(2).strip(), True
            )

    if paren_match := re.match(r"\s*\(([^()]*)\)\s*(.*)$", text):
        if pos := pure_pos(paren_match.group(1)):
            return PronunciationPosPrefix(
                pos, paren_match.group(2).strip(), False
            )

    return None
def extract_pronunciation_pos_template(
    wxr: WiktextractContext,
    name: str,
    ht: TemplateArgs,
    lang_code: str,
) -> PronunciationPosMatch:
    """Collect POS labels from the positional arguments of a template.

    For "a", "accent", "lb", "lbl" and "label" the arguments from position
    2 onwards are examined; if there are none and argument 1 differs from
    *lang_code*, argument 1 is examined instead.  For every other template
    name all positional arguments from position 1 are examined.

    Returns the distinct POS values found and the remaining text of all
    arguments joined by ", ".
    """
    skips_first_arg = name in {"a", "accent", "lb", "lbl", "label"}
    first_index = 2 if skips_first_arg else 1
    pos_args = [
        value
        for key, value in ht.items()
        if isinstance(key, int) and key >= first_index
    ]
    if skips_first_arg and not pos_args and ht.get(1) != lang_code:
        pos_args = [ht.get(1, "")]

    collected_pos: list[str] = []
    leftovers: list[str] = []
    for arg in pos_args:
        found = split_pronunciation_pos_text(clean_node(wxr, None, [arg]))
        for pos in found.pos_values:
            if pos not in collected_pos:
                collected_pos.append(pos)
        if found.residual:
            leftovers.append(found.residual)
    return PronunciationPosMatch(tuple(collected_pos), ", ".join(leftovers))
def extract_pron_template(
    wxr: WiktextractContext, tname: str, targs: TemplateArgs, expanded: str
) -> tuple[SoundData, list[SoundData]] | None:
    """Extract sound data from the expansion of an IPA or enPR template.

    In post_template_fn, this is used to handle all enPR and IPA templates
    so that we can leave breadcrumbs in the text that can later be handled
    there. We return a `base_data` so that if there are two
    or more templates on the same line, like this:
    (Tags for the whole line, really) enPR: foo, IPA(keys): /foo/
    then we can apply base_data fields to other templates, too, if needed.

    Returns a tuple ``(base_data, sound_datas)``: ``base_data`` holds what
    was parsed from the qualifiers that apply to the whole template, and
    ``sound_datas`` has one entry per pronunciation found in the body.
    Returns None, after reporting an error through ``wxr.wtp.error``, when
    the cleaned expansion does not match IPA_EXTRACT_RE or the matched body
    is empty.
    """
    cleaned = clean_value(wxr, expanded)
    # print(f"extract_pron_template input: {tname=} {expanded=}-> {cleaned=}")
    m = IPA_EXTRACT_RE.match(cleaned)
    if not m:
        wxr.wtp.error(
            f"Text cannot match IPA_EXTRACT_RE regex: "
            f"{cleaned=}, {tname=}, {targs=}",
            sortid="en/pronunciation/54",
        )
        return None
    # for i, group in enumerate(m.groups()):
    #     print(i + 1, repr(group))
    # Group 2 is the text of the leading "(qualifier) ", if any.
    main_qual = m.group(2) or ""
    if "qq" in targs:
        # The template was given a qualifier that applies to every entry
        # but is rendered at the very end, where it could be mistaken for
        # the post-qualifier of the last entry only, as in
        # "... /ipa3/ (foo) (bar)" with "(foo)" absent.  Take the body
        # without that trailing qualifier (group 5) and treat the
        # qualifier text (group 7) as template-wide.
        pron_body = m.group(5)
        post_qual = m.group(7)
    else:
        # Without "qq" the whole remainder (group 4) is the body; a
        # trailing qualifier stays in it and is handled per entry below.
        pron_body = m.group(4)
        post_qual = ""

    if not pron_body:
        wxr.wtp.error(
            f"Regex failed to find 'body' from {cleaned=}",
            sortid="en/pronunciation/81",
        )
        return None

    base_data: SoundData = {}
    if main_qual:
        parse_pronunciation_tags_with_pos(wxr, main_qual, base_data)
    if post_qual:
        parse_pronunciation_tags_with_pos(wxr, post_qual, base_data)
    # This base_data is used as the base copy for all entries from this
    # template, but it is also returned so that its contents may be applied
    # to other templates on the same line.
    # print(f"{base_data=}")

    sound_datas: list[SoundData] = []

    # `parts` holds one list per comma/semicolon-separated entry; each entry
    # is a sequence of text segments and parenthesized "(...)" segments.
    parts: list[list[str]] = [[]]
    # Current parenthesis nesting depth.
    inside = 0
    # Pieces of the segment currently being assembled.
    current: list[str] = []
    for i, p in enumerate(re.split(r"(\s*,|;|\(|\)\s*)", pron_body)):
        # Split the line on commas and semicolons outside of parens. This
        # gives us lines with "(main-qualifier) /phon/ (post-qualifier, maybe)"
        # print(f"  {i=}, {p=}")
        comp = p.strip()
        if not p:
            continue
        if comp == "(":
            # An opening paren at depth 0 ends the segment before it.
            # NOTE(review): `current` is reset to [p] even when already
            # inside parentheses, so text collected before a nested "("
            # is discarded -- confirm nested parentheses are not expected.
            if not inside and i > 0:
                if stripped := "".join(current).strip():
                    parts[-1].append("".join(current).strip())  # type:ignore[arg-type]
            current = [p]
            inside += 1
            continue
        if comp == ")":
            inside -= 1
            # Only the paren that brings the depth back to 0 closes the
            # parenthesized segment; other ")" pieces are dropped.
            if not inside:
                if stripped := "".join(current).strip():
                    current.append(p)
                    parts[-1].append("".join(current).strip())  # type:ignore[arg-type]
                    current = []
            continue
        if not inside and comp in (",", ";"):
            # A separator outside parentheses finishes the current entry
            # and starts a new one.
            if stripped := "".join(current).strip():
                parts[-1].append(stripped)  # type:ignore[arg-type]
            current = []
            parts.append([])
            continue
        current.append(p)
    # Flush whatever is left after the last split piece.
    if current:
        parts[-1].append("".join(current).strip())

    # print(f">>>>>> {parts=}")
    # Normalize every entry to [main-qualifier, pronunciation,
    # post-qualifier], using "" for a missing qualifier.
    new_parts: list[list[str]] = []
    for entry in parts:
        if not entry:
            continue
        new_entry: list[str] = []
        # i1 is a bool used as the start index of the slice below: 1 skips
        # a leading parenthesized segment, 0 keeps the first segment.
        i1: int = entry[0].startswith("(") and entry[0].endswith(")")
        if i1:
            new_entry.append(entry[0][1:-1].strip())
        else:
            new_entry.append("")
        # A trailing parenthesized segment counts as post-qualifier only
        # when the entry has more than one segment.
        i2: int = (
            entry[-1].startswith("(")
            and entry[-1].endswith(")")
            and len(entry) > 1
        )
        # Turn i2 into the end index of the slice: -1 leaves out the
        # trailing qualifier, len(entry) keeps everything.
        if i2 == 0:
            i2 = len(entry)
        else:
            i2 = -1
        new_entry.append("".join(entry[i1:i2]).strip())
        if not new_entry[-1]:
            wxr.wtp.error(
                f"Missing IPA/enPRO sound data between qualifiers?{entry=}",
                sortid="en/pronunciation/153",
            )
        if i2 == -1:
            new_entry.append(entry[-1][1:-1].strip())
        else:
            new_entry.append("")
        new_parts.append(new_entry)

    # print(f">>>>> {new_parts=}")

    for part in new_parts:
        # Every entry starts from its own copy of the template-wide data.
        sd = deepcopy(base_data)
        if part[0]:
            parse_pronunciation_tags_with_pos(wxr, part[0], sd)
        if part[2]:
            parse_pronunciation_tags_with_pos(wxr, part[2], sd)
        # The pronunciation is stored under "enpr" for the enPR template
        # and under "ipa" for any other template name.
        if tname == "enPR":
            sd["enpr"] = part[1]
        else:
            sd["ipa"] = part[1]
        sound_datas.append(sd)

    # print(f"BASE_DATA: {base_data}")
    # print(f"SOUND_DATAS: {sound_datas=}")

    return base_data, sound_datas
419def parse_pronunciation(
420 wxr: WiktextractContext,
421 level_node: LevelNode,
422 data: WordData,
423 etym_data: WordData,
424 have_etym: bool,
425 base_data: WordData,
426 lang_code: str,
427) -> None:
428 """Parses the pronunciation section from a language section on a
429 page."""
430 if level_node.kind in LEVEL_KINDS: 430 ↛ 443line 430 didn't jump to line 443 because the condition on line 430 was always true
431 contents: list[str | WikiNode | TemplateNode] = []
432 for node in level_node.children:
433 if isinstance(node, TemplateNode):
434 if node.template_name == "th-pron":
435 extract_th_pron_template(wxr, data, node)
436 elif node.template_name == "zh-pron":
437 extract_zh_pron_template(wxr, data, node)
438 else:
439 contents.append(node)
440 else:
441 contents.append(node)
442 else:
443 contents = [level_node]
444 # Remove subsections, such as Usage notes. They may contain IPAchar
445 # templates in running text, and we do not want to extract IPAs from
446 # those.
447 # Filter out only LEVEL_KINDS; 'or' is doing heavy lifting here
448 # Slip through not-WikiNodes, then slip through WikiNodes that
449 # are not LEVEL_KINDS.
450 contents = [
451 x
452 for x in contents
453 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
454 ]
455 if not any(
456 isinstance(x, WikiNode) and x.kind == NodeKind.LIST for x in contents
457 ):
458 # expand all templates
459 new_contents: list[str | WikiNode | TemplateNode] = []
460 for lst in contents:
461 if isinstance(lst, TemplateNode):
462 temp = wxr.wtp.node_to_wikitext(lst)
463 temp = wxr.wtp.expand(temp)
464 temp_parsed = wxr.wtp.parse(temp)
465 new_contents.extend(temp_parsed.children)
466 else:
467 new_contents.append(lst)
468 contents = new_contents
470 if have_etym and data is base_data: 470 ↛ 471line 470 didn't jump to line 471 because the condition on line 470 was never true
471 data = etym_data
472 pron_templates: list[tuple[SoundData, list[SoundData]]] = []
473 pron_pos_markers: list[PronunciationPoses] = []
474 hyphenations: list[Hyphenation] = []
475 audios: list[SoundData] = []
476 have_panel_templates = False
478 def parse_pronunciation_template_fn(
479 name: str, ht: TemplateArgs
480 ) -> str | None:
481 """Handle pronunciation and hyphenation templates"""
482 # _template_fn handles templates *before* they are expanded;
483 # this allows for special handling before all the work needed
484 # for expansion is done.
485 nonlocal have_panel_templates
486 if is_panel_template(wxr, name):
487 have_panel_templates = True
488 return ""
489 if name == "audio":
490 filename = ht.get(2) or ""
491 audio: SoundData = {"audio": filename.strip()}
492 dialect = ht.get("a", "")
493 if "aa" in ht: 493 ↛ 494line 493 didn't jump to line 494 because the condition on line 493 was never true
494 dialect += ", " + ht.get("aa", "")
495 if dialect:
496 dialect = dialect.replace("<", "").replace(">", "")
497 dialect = clean_node(wxr, None, [dialect])
498 for part in split_at_comma_semi(dialect):
499 if "(" not in part:
500 parse_pronunciation_tags(wxr, part, audio)
501 else:
502 for ppart in re.split(r"[][()]", part):
503 parse_pronunciation_tags(wxr, ppart, audio)
504 desc = ht.get(3) or ""
505 desc = clean_node(wxr, None, [desc])
506 if desc: 506 ↛ 507line 506 didn't jump to line 507 because the condition on line 506 was never true
507 audio["text"] = desc
508 m = re.search(r"\((([^()]|\([^()]*\))*)\)", desc)
509 skip = False
510 if m: 510 ↛ 511line 510 didn't jump to line 511 because the condition on line 510 was never true
511 par = m.group(1)
512 cls = classify_desc(par)
513 if cls == "tags":
514 parse_pronunciation_tags(wxr, par, audio)
515 else:
516 skip = True
517 if skip: 517 ↛ 518line 517 didn't jump to line 518 because the condition on line 517 was never true
518 return ""
519 audios.append(audio)
520 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
521 if name == "audio-IPA": 521 ↛ 522line 521 didn't jump to line 522 because the condition on line 521 was never true
522 filename = ht.get(2) or ""
523 ipa = ht.get(3) or ""
524 dial = ht.get("dial")
525 audio = {"audio": filename.strip()}
526 if dial:
527 dial = clean_node(wxr, None, [dial])
528 audio["text"] = dial
529 if ipa:
530 audio["audio-ipa"] = ipa
531 audios.append(audio)
532 # The problem with these IPAs is that they often just describe
533 # what's in the sound file, rather than giving the pronunciation
534 # of the word alone. It is common for audio files to contain
535 # multiple pronunciations or articles in the same file, and then
536 # this IPA often describes what is in the file.
537 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
538 if name == "audio-pron":
539 filename = ht.get(2) or ""
540 ipa = ht.get("ipa") or ""
541 dial = ht.get("dial")
542 country = ht.get("country")
543 audio = {"audio": filename.strip()}
544 if dial: 544 ↛ 548line 544 didn't jump to line 548 because the condition on line 544 was always true
545 dial = clean_node(wxr, None, [dial])
546 audio["text"] = dial
547 parse_pronunciation_tags(wxr, dial, audio)
548 if country: 548 ↛ 550line 548 didn't jump to line 550 because the condition on line 548 was always true
549 parse_pronunciation_tags(wxr, country, audio)
550 if ipa: 550 ↛ 552line 550 didn't jump to line 552 because the condition on line 550 was always true
551 audio["audio-ipa"] = ipa
552 audios.append(audio)
553 # XXX do we really want to extract pronunciations from these?
554 # Or are they spurious / just describing what is in the
555 # audio file?
556 # if ipa:
557 # pron = {"ipa": ipa}
558 # if dial:
559 # parse_pronunciation_tags(wxr, dial, pron)
560 # if country:
561 # parse_pronunciation_tags(wxr, country, pron)
562 # data_append(data, "sounds", pron)
563 return "__AUDIO_IGNORE_THIS__" + str(len(audios) - 1) + "__"
564 if name in ("hyph", "hyphenation"):
565 # {{hyph|en|re|late|caption="Hyphenation UK:"}}
566 # {{hyphenation|it|quiè|to||qui|è|to||quié|to||qui|é|to}}
567 # and also nocaption=1
568 caption = clean_node(wxr, None, ht.get("caption", ""))
569 tagsets, _ = decode_tags(caption)
570 # flatten the tagsets into one; it would be really weird to have
571 # several tagsets for a hyphenation caption
572 tags = sorted(set(tag for tagset in tagsets for tag in tagset))
573 # We'll just ignore any errors from tags, it's not very important
574 # for hyphenation
575 tags = [tag for tag in tags if not tag.startswith("error")]
576 hyph_sequences: list[list[str]] = [[]]
577 for text in [
578 t for (k, t) in ht.items() if (isinstance(k, int) and k >= 2)
579 ]:
580 if not text:
581 hyph_sequences.append([])
582 else:
583 hyph_sequences[-1].append(clean_node(wxr, None, text))
584 for seq in hyph_sequences:
585 hyphenations.append(Hyphenation(parts=seq, tags=tags))
586 return ""
587 return None
589 may_be_duplicates = False
591 def parse_pron_post_template_fn(
592 name: str, ht: TemplateArgs, text: str
593 ) -> str | None:
594 # _post_template_fn handles templates *after* the work to expand
595 # them has been done; this is exactly the same as _template_fn,
596 # except with the additional expanded text as an input, and
597 # possible side-effects from the expansion and recursion (like
598 # calling other subtemplates that are handled in _template_fn.
599 nonlocal may_be_duplicates
600 if is_panel_template(wxr, name): 600 ↛ 601line 600 didn't jump to line 601 because the condition on line 600 was never true
601 return ""
602 if name in PRON_POS_TEMPLATE_NAMES:
603 pos_match = extract_pronunciation_pos_template(
604 wxr, name, ht, lang_code
605 )
606 if pos_match.pos_values:
607 pron_pos_markers.append(pos_match.pos_values)
608 marker = (
609 f"__PRON_POS_MARKER_{len(pron_pos_markers) - 1}__"
610 )
611 if pos_match.residual: 611 ↛ 612line 611 didn't jump to line 612 because the condition on line 611 was never true
612 return f"{marker} ({pos_match.residual})"
613 return marker
614 if name in {
615 *PRON_POS_TEMPLATE_NAMES,
616 "l",
617 "link",
618 }:
619 # Kludge: when these templates expand to /.../ or [...],
620 # replace the expansion by something safe. This is used
621 # to filter spurious IPA-looking expansions that aren't really
622 # IPAs. We probably don't care about these templates in the
623 # contexts where they expand to something containing these.
624 v = re.sub(r'href="[^"]*"', "", text) # Ignore URLs
625 v = re.sub(r'src="[^"]*"', "", v)
626 v = clean_value(wxr, v)
627 if re.search(r"/[^/,]+?/|\[[^]0-9,/][^],/]*?\]", v):
628 # Note: replacing by empty results in Lua errors that we
629 # would rather not have. For example, voi/Middle Vietnamese
630 # uses {{a|{{l{{vi|...}}}}, and the {{a|...}} will fail
631 # if {{l|...}} returns empty.
632 return "stripped-by-parse_pron_post_template_fn"
633 if name in ("IPA", "enPR"):
634 # Extract the data from IPA and enPR templates (same underlying
635 # template) and replace them in-text with magical cookie that
636 # can be later used to refer to the data's index inside
637 # pron_templates.
638 if pron_t := extract_pron_template(wxr, name, ht, text):
639 pron_templates.append(pron_t)
640 return f"__PRON_TEMPLATE_{len(pron_templates) - 1}__"
641 # Catch templates that generate duplicate sound data entries
642 # here; if the text produces a big, toggleable section, the
643 # "header" for that section might be duplicated. Add more conditions
644 # if necessary.
645 if text.startswith("<") and "vsToggleElement" in text: 645 ↛ 646line 645 didn't jump to line 646 because the condition on line 645 was never true
646 may_be_duplicates = True
647 return text
649 def flattened_tree(
650 lines: list[WikiNode | str],
651 ) -> Iterator[FlattenedListNode]:
652 assert isinstance(lines, list)
653 for line in lines:
654 yield from flattened_tree1(line, 0)
656 def flattened_tree1(
657 node: WikiNode | str, list_depth: int
658 ) -> Iterator[FlattenedListNode]:
659 assert isinstance(node, (WikiNode, str))
660 if isinstance(node, str):
661 yield FlattenedListNode(node, list_depth)
662 return
663 elif node.kind == NodeKind.LIST:
664 for item in node.children:
665 yield from flattened_tree1(item, list_depth)
666 elif node.kind == NodeKind.LIST_ITEM:
667 item_depth = (
668 len(node.sarg) if isinstance(node.sarg, str) else list_depth
669 )
670 new_children = []
671 sublist = None
672 for child in node.children:
673 if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
674 sublist = child
675 else:
676 new_children.append(child)
677 node.children = new_children
678 node.sarg = "*"
679 yield FlattenedListNode(node, item_depth)
680 if sublist:
681 yield from flattened_tree1(sublist, item_depth)
682 else:
683 yield FlattenedListNode(node, list_depth)
685 # XXX Do not use flattened_tree more than once here, for example for
686 # debug printing... The underlying data is changed, and the separated
687 # sublists disappear.
689 # Kludge for templates that generate several lines, but haven't
690 # been caught by earlier kludges...
691 def split_cleaned_node_on_newlines(
692 contents: list[WikiNode | str],
693 ) -> Iterator[tuple[str, int]]:
694 for flattened in flattened_tree(contents):
695 ipa_text = clean_node(
696 wxr,
697 data,
698 flattened.node,
699 template_fn=parse_pronunciation_template_fn,
700 post_template_fn=parse_pron_post_template_fn,
701 )
702 for line in ipa_text.splitlines():
703 yield line, flattened.list_depth
705 # have_pronunciations = False
706 active_pos: PronunciationPoses | None = None
707 # POS values from parent pronunciation lines by original list depth.
708 # Audio-only child lines can inherit from a parent pronunciation line,
709 # but same-depth audio lines must not inherit from a preceding IPA.
710 pronunciation_pos_stack: list[tuple[int, PronunciationPoses]] = []
712 def parent_pronunciation_pos(
713 list_depth: int,
714 ) -> PronunciationPoses | None:
715 if not pronunciation_pos_stack:
716 return None
717 parent_depth, parent_pos = pronunciation_pos_stack[-1]
718 return parent_pos if parent_depth < list_depth else None
720 for line, list_depth in split_cleaned_node_on_newlines(contents):
721 prefix: str | None = None
722 earlier_base_data: SoundData | None = None
723 line_pos: PronunciationPoses | None = None
724 current_group_sounds: list[SoundData] = []
725 # POS values seen on sounds extracted from this physical line. A
726 # single candidate can seed adjacent audio-only child lines; multiple
727 # POS-marked sounds on one line are too ambiguous for inheritance.
728 line_sound_pos_candidates: set[PronunciationPoses] = set()
729 line_has_sound = False
730 if not line: 730 ↛ 731line 730 didn't jump to line 731 because the condition on line 730 was never true
731 continue
732 while (
733 pronunciation_pos_stack
734 and pronunciation_pos_stack[-1][0] >= list_depth
735 ):
736 pronunciation_pos_stack.pop()
738 split_templates = re.split(r"__PRON_TEMPLATE_(\d+)__", line)
739 for i, text in enumerate(split_templates):
740 if not text:
741 continue
742 # clean up starts at the start of the line
743 text = re.sub(r"^\**\s*", "", text).strip()
744 if i == 0:
745 # At the start of a line, check for stuff like "Noun:"
746 # or "(verb)" for POS labels that apply to this line or
747 # structurally nested pronunciation lines.
748 # These labels feed the inheritance state that later sets the
749 # temporary sound["pos"] field used to route pronunciation
750 # data into matching POS sections.
751 if pos_prefix := extract_pos_prefix(text):
752 text = pos_prefix.text
753 line_pos = pos_prefix.pos_values
754 if pos_prefix.is_persistent:
755 active_pos = pos_prefix.pos_values
756 if not text:
757 continue
759 m = re.search(r"__PRON_POS_MARKER_(\d+)__", text)
760 while m:
761 if current_group_sounds and re.search(
762 r"[,;]", text[: m.start()]
763 ):
764 current_group_sounds = []
765 pos_values = pron_pos_markers[int(m.group(1))]
766 if current_group_sounds:
767 for sound in current_group_sounds:
768 set_sound_pos(sound, pos_values)
769 line_sound_pos_candidates.add(pos_values)
770 line_pos = pos_values
771 text = text[: m.start()] + text[m.end() :]
772 m = re.search(r"__PRON_POS_MARKER_(\d+)__", text)
773 text = text.strip()
774 if not text:
775 continue
776 # POS inheritance for normal pronunciation data:
777 # 1. line_pos: explicit POS marker on this line, e.g.
778 # "* {{q|noun}} {{IPA|...}}".
779 # 2. parent_pronunciation_pos: structurally inherited from a
780 # parent list item, e.g. "* {{q|noun}}" then "** {{IPA|...}}".
781 # 3. active_pos: support for "* Noun:" followed by
782 # "* {{IPA|...}}"; broad, so it stays after structural data.
783 inherited_pos = (
784 line_pos or parent_pronunciation_pos(list_depth) or active_pos
785 )
787 if i % 2 == 1:
788 # re.split (with capture groups) splits the lines so that
789 # every even entry is a captured splitter; odd lines are either
790 # empty strings or stuff around the splitters.
791 base_pron_data, first_prons = pron_templates[int(text)]
792 if base_pron_data:
793 earlier_base_data = base_pron_data
794 # print(f"Set {earlier_base_data=}")
795 elif earlier_base_data is not None:
796 # merge data from an earlier iteration of this loop
797 for pr in first_prons:
798 if "note" in pr and "note" in earlier_base_data: 798 ↛ 799line 798 didn't jump to line 799 because the condition on line 798 was never true
799 pr["note"] += ";" + earlier_base_data.get(
800 "note", ""
801 )
802 elif "note" in earlier_base_data: 802 ↛ 803line 802 didn't jump to line 803 because the condition on line 802 was never true
803 pr["note"] = earlier_base_data["note"]
804 if "topics" in earlier_base_data: 804 ↛ 805line 804 didn't jump to line 805 because the condition on line 804 was never true
805 data_extend(
806 pr, "topics", earlier_base_data["topics"]
807 )
808 if "tags" in pr and "tags" in earlier_base_data: 808 ↛ 809line 808 didn't jump to line 809 because the condition on line 808 was never true
809 pr["tags"].extend(earlier_base_data["tags"])
810 elif "tags" in earlier_base_data: 810 ↛ 797line 810 didn't jump to line 797 because the condition on line 810 was always true
811 pr["tags"] = sorted(set(earlier_base_data["tags"]))
812 for pr in first_prons:
813 if sound_pos := set_sound_pos(
814 pr,
815 None if "pos" in pr else inherited_pos,
816 ):
817 line_sound_pos_candidates.add(sound_pos)
818 if pr not in data.get("sounds", ()):
819 data_append(data, "sounds", pr)
820 current_group_sounds.append(pr)
821 line_has_sound = True
822 # This bit is handled
823 continue
825 if "IPA" in text:
826 field: Literal[
827 "audio",
828 "audio-ipa",
829 "enpr",
830 "form",
831 "hangeul",
832 "homophone",
833 "ipa",
834 "mp3_url",
835 "note",
836 "ogg_url",
837 "other",
838 "rhymes",
839 "tags",
840 "text",
841 "topics",
842 "zh-pron",
843 ] = "ipa"
844 else:
845 # This is used for Rhymes, Homophones, etc
846 field = "other"
848 # Check if it contains Japanese "Tokyo" pronunciation with
849 # special syntax
850 m = re.search(r"(?m)\(Tokyo\) +([^ ]+) +\[", text)
851 if m: 851 ↛ 852line 851 didn't jump to line 852 because the condition on line 851 was never true
852 pron: SoundData = {field: m.group(1)} # type: ignore[misc]
853 if sound_pos := set_sound_pos(pron, inherited_pos):
854 line_sound_pos_candidates.add(sound_pos)
855 data_append(data, "sounds", pron)
856 current_group_sounds.append(pron)
857 line_has_sound = True
858 # have_pronunciations = True
859 continue
861 # Check if it contains Rhymes
862 m = re.match(r"\s*Rhymes?: (.*)", text)
863 if m:
864 for ending in split_at_comma_semi(m.group(1)):
865 ending = ending.strip()
866 if ending: 866 ↛ 864line 866 didn't jump to line 864 because the condition on line 866 was always true
867 pron = {"rhymes": ending}
868 if sound_pos := set_sound_pos(pron, inherited_pos):
869 line_sound_pos_candidates.add(sound_pos)
870 data_append(data, "sounds", pron)
871 current_group_sounds.append(pron)
872 line_has_sound = True
873 # have_pronunciations = True
874 continue
876 # Check if it contains homophones
877 m = re.search(r"(?m)\bHomophones?: (.*)", text)
878 if m:
879 for w in split_at_comma_semi(m.group(1)):
880 w = w.strip()
881 if w: 881 ↛ 879line 881 didn't jump to line 879 because the condition on line 881 was always true
882 pron = {"homophone": w}
883 if sound_pos := set_sound_pos(pron, inherited_pos):
884 line_sound_pos_candidates.add(sound_pos)
885 data_append(data, "sounds", pron)
886 current_group_sounds.append(pron)
887 line_has_sound = True
888 # have_pronunciations = True
889 continue
891 # Check if it contains Phonetic hangeul
892 m = re.search(r"(?m)\bPhonetic hange?ul: \[([^]]+)\]", text)
893 if m: 893 ↛ 894line 893 didn't jump to line 894 because the condition on line 893 was never true
894 seen = set()
895 for w in m.group(1).split("/"):
896 w = w.strip()
897 if w and w not in seen:
898 seen.add(w)
899 pron = {"hangeul": w}
900 if sound_pos := set_sound_pos(pron, inherited_pos):
901 line_sound_pos_candidates.add(sound_pos)
902 data_append(data, "sounds", pron)
903 current_group_sounds.append(pron)
904 line_has_sound = True
905 # have_pronunciations = True
907 # This regex-based hyphenation detection left as backup
908 m = re.search(r"\b(Syllabification|Hyphenation): *([^\n.]*)", text)
909 if m:
910 data_append(data, "hyphenation", m.group(2))
911 commaseparated = m.group(2).split(",")
912 if len(commaseparated) > 1: 912 ↛ 923line 912 didn't jump to line 923 because the condition on line 912 was always true
913 for h in commaseparated:
914 # That second characters looks like a dash but it's
915 # actually unicode decimal code 8231, hyphenation dash
916 # Add more delimiters here if needed.
917 parts = re.split(r"-|‧", h.strip())
918 data_append(
919 data, "hyphenations", Hyphenation(parts=parts)
920 )
921 ...
922 else:
923 data_append(
924 data,
925 "hyphenations",
926 Hyphenation(parts=m.group(2).split(sep="-")),
927 )
928 # have_pronunciations = True
930 # See if it contains a word prefix restricting which forms the
931 # pronunciation applies to (see amica/Latin) and/or parenthesized
932 # tags.
933 m = re.match(
934 r"^[*#\s]*(([-\w]+):\s+)?\((([^()]|\([^()]*\))*?)\)", text
935 )
936 if m:
937 prefix = m.group(2) or ""
938 tagstext = m.group(3)
939 text = text[m.end() :]
940 else:
941 m = re.match(r"^[*#\s]*([-\w]+):\s+", text)
942 if m:
943 prefix = m.group(1)
944 tagstext = ""
945 text = text[m.end() :]
946 else:
947 # Spanish has tags before pronunciations, eg. aceite/Spanish
948 m = re.match(r".*:\s+\(([^)]*)\)\s+(.*)", text)
949 if m: 949 ↛ 950line 949 didn't jump to line 950 because the condition on line 949 was never true
950 tagstext = m.group(1)
951 text = m.group(2)
952 else:
953 # No prefix. In this case, we inherit prefix
954 # from previous entry. This particularly
955 # applies for nested Audio files.
956 tagstext = ""
957 if tagstext:
958 earlier_base_data = {}
959 parse_pronunciation_tags_with_pos(
960 wxr, tagstext, earlier_base_data
961 )
963 # Find romanizations from the pronunciation section (routinely
964 # produced for Korean by {{ko-IPA}})
965 for m in re.finditer(pron_romanization_re, text): 965 ↛ 966line 965 didn't jump to line 966 because the loop on line 965 never started
966 prefix = m.group(1)
967 w = m.group(2).strip()
968 tag = pron_romanizations[prefix]
969 form = {"form": w, "tags": tag.split()}
970 data_append(data, "forms", form)
972 # Find IPA pronunciations
973 for m in re.finditer(
974 r"(?m)/[^][\n/,]+?/" r"|" r"\[[^]\n0-9,/][^],/]*?\]", text
975 ):
976 v = m.group(0)
977 # The regexp above can match file links. Skip them.
978 if v.startswith("[[File:"): 978 ↛ 979line 978 didn't jump to line 979 because the condition on line 978 was never true
979 continue
980 if v == "/wiki.local/": 980 ↛ 981line 980 didn't jump to line 981 because the condition on line 980 was never true
981 continue
982 if field == "ipa" and "__AUDIO_IGNORE_THIS__" in text: 982 ↛ 983line 982 didn't jump to line 983 because the condition on line 982 was never true
983 m = re.search(r"__AUDIO_IGNORE_THIS__(\d+)__", text)
984 assert m
985 idx = int(m.group(1))
986 if idx >= len(audios):
987 continue
988 if not audios[idx].get("audio-ipa"):
989 audios[idx]["audio-ipa"] = v
990 if prefix:
991 audios[idx]["form"] = prefix
992 else:
993 if earlier_base_data:
994 pron = deepcopy(earlier_base_data)
995 pron[field] = v
996 else:
997 pron = {field: v} # type: ignore[misc]
998 if prefix:
999 pron["form"] = prefix
1000 if sound_pos := set_sound_pos(
1001 pron,
1002 None if "pos" in pron else inherited_pos,
1003 ):
1004 line_sound_pos_candidates.add(sound_pos)
1005 if may_be_duplicates is True: 1005 ↛ 1006line 1005 didn't jump to line 1006 because the condition on line 1005 was never true
1006 ok = True
1007 for comp_sound in data.get("sounds", []):
1008 # Python has dict comparison since 3.8
1009 if pron == comp_sound:
1010 ok = False
1011 break
1012 if ok:
1013 data_append(data, "sounds", pron)
1014 else:
1015 data_append(data, "sounds", pron)
1016 current_group_sounds.append(pron)
1017 line_has_sound = True
1018 # have_pronunciations = True
1019 if current_group_sounds and re.search(r"[,;]", text):
1020 current_group_sounds = []
1021 # XXX what about {{hyphenation|...}}, {{hyph|...}}
1022 # and those used to be stored under "hyphenation"
1024 # Add data that was collected in template_fn
1025 # POS inheritance for audio has one extra source:
1026 # common_sound_pos(line_sound_pos_candidates), from pronunciations
1027 # extracted earlier on the same physical line, e.g.
1028 # "* {{IPA|en|/foo/|a=verb}} {{audio|en|foo.wav}}".
1029 # Explicit line_pos still wins, then same-line sound agreement, then
1030 # parent-list structure, then active_pos.
1031 audio_inherited_pos = (
1032 line_pos
1033 or common_sound_pos(line_sound_pos_candidates)
1034 or parent_pronunciation_pos(list_depth)
1035 or active_pos
1036 )
1037 for audio in audios:
1038 if "audio" in audio: 1038 ↛ 1095line 1038 didn't jump to line 1095 because the condition on line 1038 was always true
1039 # Compute audio file URLs
1040 fn = audio["audio"]
1041 # Strip certain characters, e.g., left-to-right mark
1042 fn = re.sub(r"[\u200f\u200e]", "", fn)
1043 fn = fn.strip()
1044 fn = urllib.parse.unquote(fn)
1045 # First character is usually uppercased
1046 if re.match(r"^[a-z][a-z]+", fn):
1047 fn = fn[0].upper() + fn[1:]
1048 if fn in wxr.config.redirects: 1048 ↛ 1049line 1048 didn't jump to line 1049 because the condition on line 1048 was never true
1049 fn = wxr.config.redirects[fn]
1050 # File extension is lowercased
1051 # XXX some words seem to need this, some don't seem to
1052 # have this??? what is the exact rule?
1053 # fn = re.sub(r"\.[^.]*$", lambda m: m.group(0).lower(), fn)
1054 # Spaces are converted to underscores
1055 fn = re.sub(r"\s+", "_", fn)
1056 # Compute hash digest part
1057 h = hashlib.md5()
1058 hname = fn.encode("utf-8")
1059 h.update(hname)
1060 digest = h.hexdigest()
1061 # Quote filename for URL
1062 qfn = urllib.parse.quote(fn)
1063 # For safety when writing files
1064 qfn = qfn.replace("/", "__slash__")
1065 if re.search(r"(?i)\.(ogg|oga)$", fn):
1066 ogg = (
1067 "https://upload.wikimedia.org/wikipedia/"
1068 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
1069 )
1070 else:
1071 ogg = (
1072 "https://upload.wikimedia.org/wikipedia/"
1073 "commons/transcoded/"
1074 "{}/{}/{}/{}.ogg".format(
1075 digest[:1], digest[:2], qfn, qfn
1076 )
1077 )
1078 if re.search(r"(?i)\.(mp3)$", fn): 1078 ↛ 1079line 1078 didn't jump to line 1079 because the condition on line 1078 was never true
1079 mp3 = (
1080 "https://upload.wikimedia.org/wikipedia/"
1081 "commons/{}/{}/{}".format(digest[:1], digest[:2], qfn)
1082 )
1083 else:
1084 mp3 = (
1085 "https://upload.wikimedia.org/wikipedia/"
1086 "commons/transcoded/"
1087 "{}/{}/{}/{}.mp3".format(
1088 digest[:1], digest[:2], qfn, qfn
1089 )
1090 )
1091 audio["ogg_url"] = ogg
1092 audio["mp3_url"] = mp3
1093 if "pos" not in audio: 1093 ↛ 1095line 1093 didn't jump to line 1095 because the condition on line 1093 was always true
1094 set_sound_pos(audio, audio_inherited_pos)
1095 if audio not in data.get("sounds", ()):
1096 data_append(data, "sounds", audio)
1097 line_has_sound = True
1099 # if audios:
1100 # have_pronunciations = True
1101 audios = []
1103 data_extend(data, "hyphenations", hyphenations)
1104 hyphenations = []
1106 if line_pos and not line_has_sound:
1107 active_pos = line_pos
1108 pronunciation_pos_stack.append((list_depth, line_pos))
1109 elif line_pronunciation_pos := common_sound_pos(
1110 line_sound_pos_candidates
1111 ):
1112 pronunciation_pos_stack.append((list_depth, line_pronunciation_pos))
1114 ## I have commented out the otherwise unused have_pronunciation
1115 ## toggles; uncomment them to use this debug print
1116 # if not have_pronunciations and not have_panel_templates:
1117 # wxr.wtp.debug("no pronunciations found from pronunciation section",
1118 # sortid="pronunciations/533")
def extract_th_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    """Collect sound entries from an expanded ``th-pron`` template.

    The template is converted back to wikitext, re-parsed with full
    expansion, and every HTML table in the result is walked row by row.
    Header cells either choose the field the row's data cells are stored
    under ("ipa", "homophone", "audio"; anything else stays "other") or
    contribute tags that are attached to the sounds of the row, and of
    following rows for as long as the header's ``rowspan`` lasts.  The
    collected sounds are added to ``word_entry`` under "sounds".

    Template documentation:
    https://en.wiktionary.org/wiki/Template:th-pron
    """

    @dataclass
    class TableHeader:
        # Tags read from the text of the header cell.
        raw_tags: list[str]
        # Rows (counting the one it appears in) the header still covers.
        rowspan: int

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    collected = []
    for table in expanded_node.find_html("table"):
        active_headers = []
        for row in table.find_html("tr"):
            field = "other"

            # Carry over only the headers whose rowspan reaches this row.
            carried = []
            for hdr in active_headers:
                if hdr.rowspan <= 1:
                    continue
                hdr.rowspan -= 1
                carried.append(hdr)
            active_headers = carried

            for th_tag in row.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(standard) IPA"):
                    field = "ipa"
                    continue
                if header_str.startswith("Homophones"):
                    field = "homophone"
                    continue
                if header_str == "Audio":
                    field = "audio"
                    continue
                if header_str == "":
                    continue
                # Any other non-empty header is a tag header; a rowspan
                # attribute that is not a plain number counts as 1.
                rowspan_str = th_tag.attrs.get("rowspan", "1")
                span_rows = (
                    int(rowspan_str)
                    if re.fullmatch(r"\d+", rowspan_str)
                    else 1
                )
                tag_header = TableHeader([], span_rows)
                for header_line in header_str.splitlines():
                    for piece in header_line.strip("{}\n ").split(";"):
                        piece = piece.strip()
                        if piece != "":
                            tag_header.raw_tags.append(piece)
                active_headers.append(tag_header)

            for td_tag in row.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            collected.append(create_audio_url_dict(filename))
                    continue

                if field == "homophone":
                    for th_span in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        homophone = clean_node(wxr, None, th_span)
                        if homophone != "":
                            collected.append({"homophone": homophone})
                    continue

                # Tags found in "[...]" <small> elements of this cell.
                cell_tags = []
                for html_node in td_tag.find_child_recursively(NodeKind.HTML):
                    if html_node.tag == "small":
                        small_text = clean_node(wxr, None, html_node)
                        if small_text.startswith("[") and small_text.endswith(
                            "]"
                        ):
                            for piece in small_text.strip("[]").split(","):
                                piece = piece.strip()
                                if piece != "":
                                    cell_tags.append(piece)
                        elif len(collected) > 0:
                            # Unbracketed small text is stored as the
                            # romanization of the latest collected sound.
                            collected[-1]["roman"] = small_text
                    elif html_node.tag == "span":
                        span_text = clean_node(wxr, None, html_node)
                        span_lang = html_node.attrs.get("lang", "")
                        span_class = html_node.attrs.get("class", "")
                        if span_text == "" or (
                            span_lang != "th"
                            and span_class not in ["IPA", "tr"]
                        ):
                            continue
                        sound = {}
                        for cell_tag in cell_tags:
                            tag_key = (
                                "tags" if cell_tag in valid_tags else "raw_tags"
                            )
                            data_append(sound, tag_key, cell_tag)
                        for hdr in active_headers:
                            for header_tag in hdr.raw_tags:
                                lowered = header_tag.lower()
                                if lowered in valid_tags:
                                    data_append(sound, "tags", lowered)
                                else:
                                    data_append(sound, "raw_tags", header_tag)
                        # NOTE: once switched to "roman", the field stays
                        # "roman" for the remaining cells of this row.
                        if "romanization" in sound.get("tags", []):
                            field = "roman"
                        sound[field] = span_text
                        collected.append(sound)

    # NOTE(review): presumably run for its side effects on word_entry
    # (the return value is discarded) — confirm against clean_node.
    clean_node(wxr, word_entry, expanded_node)
    data_extend(word_entry, "sounds", collected)
def extract_zh_pron_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
):
    """Collect sound entries from an expanded ``zh-pron`` template.

    The template is converted back to wikitext and re-parsed with full
    expansion.  Every list found in the result is processed item by item,
    except lists that were already consumed as a nested list of an earlier
    item (those are recorded in ``visited_lists`` by
    ``extract_zh_pron_list_item``).  The collected sounds are added to
    ``word_entry`` under "sounds".

    Template documentation:
    https://en.wiktionary.org/wiki/Template:zh-pron
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    visited_lists = set()
    collected = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node in visited_lists:
            continue
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            collected += extract_zh_pron_list_item(
                wxr, item, [], visited_lists
            )
    # NOTE(review): presumably run for its side effects on word_entry
    # (the return value is discarded) — confirm against clean_node.
    clean_node(wxr, word_entry, expanded_node)
    data_extend(word_entry, "sounds", collected)
def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[SoundData]:
    """Extract the sounds described by one ``zh-pron`` list item.

    ``raw_tags`` are the tags inherited from the enclosing list items; the
    list is copied, never modified.  Link texts and the first ``<small>``
    element of the item add further tags that apply to everything that
    follows, including nested lists.  Nested lists are processed
    recursively and added to ``seen_lists`` so the caller can skip them.

    Returns the sounds found in this item and in its nested lists.
    """
    tags_so_far = raw_tags[:]
    found = []
    seen_small = False
    for child in list_item.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            link_target = clean_node(wxr, None, child.largs)
            link_text = clean_node(wxr, None, child)
            if link_target.startswith("File:"):
                # A file link is an audio recording.
                audio = create_audio_url_dict(
                    link_target.removeprefix("File:")
                )
                audio["raw_tags"] = tags_so_far[:]
                translate_zh_pron_raw_tags(audio)
                found.append(audio)
            elif link_text != "":
                # Any other link contributes its text as a tag.
                tags_so_far.append(link_text)
        elif isinstance(child, HTMLNode):
            if child.tag == "small":
                if not seen_small:
                    # First <small>: tags for the whole item; <sup>
                    # children are left out of the tag text.
                    without_sup = [
                        part
                        for part in child.children
                        if not (isinstance(part, HTMLNode) and part.tag == "sup")
                    ]
                    tags_so_far.extend(
                        split_zh_pron_raw_tag(
                            clean_node(wxr, None, without_sup)
                        )
                    )
                elif found:
                    # Later <small>: qualifies only the latest sound.
                    latest = found[-1]
                    data_extend(
                        latest,
                        "raw_tags",
                        split_zh_pron_raw_tag(clean_node(wxr, None, child)),
                    )
                    translate_zh_pron_raw_tags(latest)
                seen_small = True
            elif child.tag == "span":
                found.extend(extract_zh_pron_span(wxr, child, tags_so_far))
            elif (
                child.tag == "table"
                and tags_so_far
                and tags_so_far[-1] == "Homophones"
            ):
                found.extend(
                    extract_zh_pron_homophone_table(wxr, child, tags_so_far)
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            seen_lists.add(child)
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                found.extend(
                    extract_zh_pron_list_item(
                        wxr, nested_item, tags_so_far, seen_lists
                    )
                )

    return found
def extract_zh_pron_homophone_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    """Extract homophone entries from a ``zh-pron`` homophone table.

    Every ``<span>`` directly under a table cell is considered.  A span is
    kept when its text is neither empty nor "/", it has a non-empty
    ``lang`` attribute, and its ``class`` is "Hant", "Hans" or "Hani".
    Each kept span yields a sound with a copy of ``raw_tags``; "Hant" and
    "Hans" additionally add a script tag.

    Returns the list of homophone sounds.
    """
    script_tags = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    homophones = []
    for cell in table.find_html_recursively("td"):
        for span in cell.find_html("span"):
            css_class = span.attrs.get("class", "")
            lang = span.attrs.get("lang", "")
            text = clean_node(wxr, None, span)
            if (
                text in ["", "/"]
                or lang == ""
                or css_class not in ["Hant", "Hans", "Hani"]
            ):
                continue
            entry = {"homophone": text, "raw_tags": raw_tags[:]}
            script_tag = script_tags.get(css_class)
            if script_tag is not None:
                data_append(entry, "tags", script_tag)
            translate_zh_pron_raw_tags(entry)
            homophones.append(entry)

    return homophones
def translate_zh_pron_raw_tags(sound: SoundData):
    """Move recognised entries of ``sound["raw_tags"]`` into ``sound["tags"]``.

    Each raw tag is handled as follows:

    * found in ``ZH_PRON_TAGS``: the mapped value (a single tag string or a
      list of tags) is added to "tags";
    * otherwise, found in ``valid_tags``: the raw tag itself is added to
      "tags" unless it is already there;
    * otherwise: it stays a raw tag (duplicates are dropped).

    Afterwards "raw_tags" holds only the unrecognised tags, or is removed
    from ``sound`` when none remain.  ``sound`` is modified in place.
    """
    from .zh_pron_tags import ZH_PRON_TAGS

    raw_tags = []
    for raw_tag in sound.get("raw_tags", []):
        if raw_tag in ZH_PRON_TAGS:
            tr_tag = ZH_PRON_TAGS[raw_tag]
            if isinstance(tr_tag, str):
                data_append(sound, "tags", tr_tag)
            elif isinstance(tr_tag, list):
                # Check each tag individually.  Testing the whole list for
                # membership among the existing tags never matched, so
                # list translations were always appended, duplicates
                # included.
                for tag in tr_tag:
                    if tag not in sound.get("tags", []):
                        data_append(sound, "tags", tag)
        elif raw_tag in valid_tags:
            if raw_tag not in sound.get("tags", []):
                data_append(sound, "tags", raw_tag)
        elif raw_tag not in raw_tags:
            raw_tags.append(raw_tag)

    if len(raw_tags) > 0:
        sound["raw_tags"] = raw_tags
    elif "raw_tags" in sound:
        del sound["raw_tags"]
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a ``zh-pron`` tag description into individual raw tags.

    Text without parentheses is split at ",", ":", ";" and " and "; each
    piece is stripped, a leading "incl. " is removed, and empty pieces are
    dropped.

    Text with parentheses is handled recursively: the content of every
    innermost "(...)" group is split on its own, the groups are cut out of
    the text, and the tags of the remaining text are placed before the
    tags from the groups.  If a "(" is present but no non-empty balanced
    group can be found, the whole text is returned as a single tag.
    """
    if "(" not in raw_tag_text:
        pieces = (
            chunk.strip().removeprefix("incl. ").strip()
            for chunk in re.split(r",|:|;| and ", raw_tag_text)
        )
        return [piece for piece in pieces if piece != ""]

    group_tags = []
    outside_chunks = []
    cursor = 0
    for group in re.finditer(r"\([^()]+\)", raw_tag_text):
        # group(0) includes the surrounding parentheses; drop them.
        group_tags.extend(split_zh_pron_raw_tag(group.group(0)[1:-1]))
        outside_chunks.append(raw_tag_text[cursor : group.start()])
        cursor = group.end()
    outside_chunks.append(raw_tag_text[cursor:])
    remainder = "".join(outside_chunks)

    if remainder == raw_tag_text:
        # Nothing was cut out (e.g. unbalanced or empty parentheses), so
        # recursing would never terminate; keep the text as one tag.
        group_tags.append(remainder)
        return group_tags
    return split_zh_pron_raw_tag(remainder) + group_tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[SoundData]:
    """Extract the pronunciations contained in one ``zh-pron`` ``<span>``.

    The children of the span are sorted into:

    * a ``<small>`` element, whose text gives extra tags for the last
      regular pronunciation;
    * a nested ``<span>`` whose ``lang`` contains "-Latn", giving the
      romanization;
    * a "[Phonetic:" text marker, after which everything up to the end of
      the span is one phonetic pronunciation;
    * everything else, which is the pronunciation text itself and is split
      with ``split_zh_pron``.

    Pronunciations are stored under "ipa" when the span's ``class``
    contains "IPA" and under "zh_pron" otherwise.  Every sound gets a copy
    of ``raw_tags`` and is finally passed through
    ``translate_zh_pron_raw_tags``.

    Returns the list of sounds.
    """
    extra_tags = []
    text_nodes = []
    roman = ""
    phonetic = ""
    for position, child in enumerate(span_tag.children):
        is_html = isinstance(child, HTMLNode)
        if is_html and child.tag == "small":
            extra_tags = split_zh_pron_raw_tag(clean_node(wxr, None, child))
        elif (
            is_html
            and child.tag == "span"
            and "-Latn" in child.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, child).strip("() ")
        elif isinstance(child, str) and child.strip() == "[Phonetic:":
            # The rest of the span is the phonetic form; stop scanning.
            rest = span_tag.children[position + 1 :]
            phonetic = clean_node(wxr, None, rest).strip("] ")
            break
        else:
            text_nodes.append(child)

    found = []
    pron_text = clean_node(wxr, None, text_nodes)
    pron_key = "ipa" if "IPA" in span_tag.attrs.get("class", "") else "zh_pron"
    for candidate in split_zh_pron(pron_text):
        candidate = candidate.strip("[]: ")
        if not candidate:
            continue
        entry = {pron_key: candidate, "raw_tags": raw_tags[:]}
        if roman != "":
            entry["roman"] = roman
        found.append(entry)

    # Tags from <small> apply only to the last regular pronunciation.
    if found:
        data_extend(found[-1], "raw_tags", extra_tags)

    if phonetic != "":
        phonetic_entry = {
            "zh_pron": phonetic,
            "raw_tags": raw_tags[:] + ["Phonetic"],
        }
        if roman != "":
            phonetic_entry["roman"] = roman
        found.append(phonetic_entry)

    for entry in found:
        translate_zh_pron_raw_tags(entry)
    return found
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string at separators outside parentheses.

    Separators are ",", ";" and "→", plus "/" unless the whole string
    starts with "/" (in which case slashes are treated as part of the
    pronunciation, e.g. "/a/").  Separators inside parentheses are kept as
    ordinary characters.

    Every returned segment is stripped of surrounding whitespace and empty
    segments are dropped, so leading, trailing or repeated separators do
    not leak into the result.

    Args:
        zh_pron: Pronunciation text, possibly listing several variants.

    Returns:
        The non-empty, stripped segments in their original order.
    """
    separators = (",", ";", "→")
    # Decided once for the whole string, not per character.
    slash_is_separator = not zh_pron.startswith("/")
    depth = 0
    pron_list = []
    current = []
    for c in zh_pron:
        if depth == 0 and (
            c in separators or (c == "/" and slash_is_separator)
        ):
            # Always consume the separator, even when the segment before
            # it is empty; otherwise it would be glued onto the next
            # segment (e.g. "a,,b" -> ",b").
            segment = "".join(current).strip()
            if segment:
                pron_list.append(segment)
            current = []
            continue
        if c == "(":
            depth += 1
        elif c == ")" and depth > 0:
            # Ignore an unmatched ")" for nesting purposes so that a
            # negative depth cannot disable all later splitting.
            depth -= 1
        current.append(c)

    # The last segment is stripped like all the others.
    segment = "".join(current).strip()
    if segment:
        pron_list.append(segment)
    return pron_list