Coverage for src/wiktextract/extractor/pt/pronunciation.py: 94%
46 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 06:55 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 06:55 +0000
1import re
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 LevelNode,
6 NodeKind,
7 WikiNode,
8)
10from ...page import clean_node
11from ...wxr_context import WiktextractContext
12from .models import Sound, WordEntry
13from .tags import translate_raw_tags
def extract_pronunciation_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Gather sounds from a pronunciation section and attach them to entries.

    The section title is used as a raw tag for its list items unless it is
    empty or the plain "Pronúncia" heading. Child level nodes are handled
    recursively, each starting from its own title. Sounds found directly in
    this section are appended to every entry in ``page_data`` whose
    ``lang_code`` equals that of the last entry.
    """
    heading = clean_node(wxr, None, level_node.largs)
    section_tags = [heading] if heading not in ("", "Pronúncia") else []

    found = []
    for item_list in level_node.find_child(NodeKind.LIST):
        for item in item_list.find_child(NodeKind.LIST_ITEM):
            found += extract_pronunciation_list_item(
                wxr, item, page_data[-1].lang_code, section_tags
            )

    # Child sections attach their own sounds before this section's sounds
    # are appended below.
    for subsection in level_node.find_child(LEVEL_KIND_FLAGS):
        extract_pronunciation_section(wxr, page_data, subsection)

    for entry in page_data:
        if entry.lang_code != page_data[-1].lang_code:
            continue
        entry.sounds.extend(found)
def extract_pronunciation_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    lang_code: str,
    parent_raw_tags: list[str],
) -> list[Sound]:
    """Build ``Sound`` objects from one pronunciation list item.

    Two layouts are handled:

    * a single text child that is only `` /ipa/ `` or `` [ipa] ``, which
      yields one sound with just ``ipa`` set;
    * ``label: value`` items, where the nodes before the text containing
      the colon become a raw tag and the remainder (nested lists excluded)
      becomes the sound value. The value is stored in ``zh_pron`` when
      ``lang_code`` is "zh" and in ``ipa`` otherwise.

    Nested lists are processed recursively and receive the raw tags
    accumulated so far. ``parent_raw_tags`` itself is never mutated.
    """
    tags = list(parent_raw_tags)
    results = []
    children = list_item.children

    # Match minimal sections ` /ipa/ ` or ` [ipa] `
    if (
        len(children) == 1
        and isinstance(children[0], str)
        and re.match(r"\s*(/[^/]+/|\[[^][]+\])\s*$", children[0])
    ):
        return [Sound(ipa=clean_node(wxr, None, children[0]).strip())]

    for position, child in enumerate(children):
        if isinstance(child, WikiNode):
            if child.kind == NodeKind.LIST:
                for nested_item in child.find_child(NodeKind.LIST_ITEM):
                    results += extract_pronunciation_list_item(
                        wxr, nested_item, lang_code, tags
                    )
            continue
        if not (isinstance(child, str) and ":" in child):
            continue

        # Everything before the colon-bearing text is the label.
        label = clean_node(wxr, None, children[:position])
        if label != "":
            tags.append(label)

        # The value is the text after the first colon plus the following
        # nodes, leaving nested lists to the recursive branch above.
        trailing = [
            n
            for n in children[position + 1 :]
            if not isinstance(n, WikiNode) or n.kind != NodeKind.LIST
        ]
        value = clean_node(wxr, None, [child.partition(":")[2]] + trailing)
        if value == "":
            continue

        entry = Sound(raw_tags=tags)
        setattr(entry, "zh_pron" if lang_code == "zh" else "ipa", value)
        translate_raw_tags(entry)
        results.append(entry)

    return results