Coverage for src/wiktextract/extractor/cs/sound.py: 83%
100 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..share import set_sound_file_url_fields
6from .models import Form, Hyphenation, Sound, WordEntry
7from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Walk the list items of a sound section and extract their sounds.

    Each list item is scanned left to right. Raw tags found in "Příznak2"
    templates or italic nodes are accumulated and handed to the next "IPA",
    "IPA2" or "Audio" template in the same item; the accumulated tags are
    emptied after each of those templates has been processed.
    """
    # Templates that receive the pending raw tags and then reset them.
    sound_extractors = {
        "IPA": extract_ipa_template,
        "IPA2": extract_ipa2_template,
        "Audio": extract_audio_template,
    }
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            pending_tags = []
            for child in list_item.children:
                if isinstance(child, TemplateNode):
                    name = child.template_name
                    if name in sound_extractors:
                        extractor = sound_extractors[name]
                        extractor(wxr, base_data, child, pending_tags)
                        pending_tags.clear()
                    elif name == "Příznak2":
                        pending_tags.extend(
                            extract_příznak2_template(wxr, child)
                        )
                    # Other templates are ignored and keep the pending tags.
                    continue
                if (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.ITALIC
                ):
                    label = clean_node(wxr, None, child)
                    if label != "":
                        pending_tags.append(label)
def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    raw_tags: list[str],
):
    """Append one `Sound` per IPA string produced by an "IPA" template.

    The template is expanded, and the text of every `<span class="IPA">`
    in the result is split on commas; each non-empty piece becomes a sound
    carrying `raw_tags`.
    """
    # https://cs.wiktionary.org/wiki/Šablona:IPA
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    for ipa_span in expanded_node.find_html(
        "span", attr_name="class", attr_value="IPA"
    ):
        span_text = clean_node(wxr, None, ipa_span)
        for piece in span_text.split(","):
            ipa = piece.strip()
            if not ipa:
                continue
            sound = Sound(ipa=ipa, raw_tags=raw_tags)
            translate_raw_tags(sound)
            base_data.sounds.append(sound)
    # Result is discarded; the call is made with base_data as the target.
    # NOTE(review): presumably this lets clean_node record page-level data
    # (e.g. categories) on base_data - confirm against clean_node.
    clean_node(wxr, base_data, expanded_node)
def extract_ipa2_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    raw_tags: list[str],
):
    """Append a `Sound` built from the first argument of an "IPA2" template.

    The cleaned argument is wrapped in square brackets before being stored
    as the IPA value. Nothing is added when the argument is empty.
    """
    # https://cs.wiktionary.org/wiki/Šablona:IPA2
    first_arg = t_node.template_parameters.get(1, "")
    ipa = clean_node(wxr, None, first_arg)
    if ipa == "":
        return
    sound = Sound(ipa=f"[{ipa}]", raw_tags=raw_tags)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_audio_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    raw_tags: list[str],
):
    """Append a `Sound` for the file named by an "Audio" template.

    The first template argument is taken as the file name and passed to
    `set_sound_file_url_fields` together with the new sound. Nothing is
    added when the argument is empty.
    """
    # https://cs.wiktionary.org/wiki/Šablona:Audio
    first_arg = t_node.template_parameters.get(1, "")
    filename = clean_node(wxr, None, first_arg)
    if filename == "":
        return
    sound = Sound(raw_tags=raw_tags)
    set_sound_file_url_fields(wxr, filename, sound)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
def extract_příznak2_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[str]:
    """Return the raw tags contained in a "Příznak2" template.

    The cleaned template text has surrounding parentheses and spaces
    removed, is split on commas, and empty pieces are dropped.
    """
    text = clean_node(wxr, None, t_node).strip("() ")
    stripped_parts = (part.strip() for part in text.split(","))
    return [part for part in stripped_parts if part != ""]
def extract_hyphenation_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Add one `Hyphenation` per list item of a hyphenation section.

    The cleaned text of each list item is split on "-"; the stripped,
    non-empty pieces become the hyphenation parts. Items that yield no
    parts are skipped.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            item_text = clean_node(wxr, None, list_item.children)
            stripped = (piece.strip() for piece in item_text.split("-"))
            parts = [piece for piece in stripped if piece]
            if parts:
                base_data.hyphenations.append(Hyphenation(parts=parts))
def extract_homophone_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Add a homophone `Sound` for every link in the section's list items.

    Links whose cleaned text is empty are skipped.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for link in list_item.find_child(NodeKind.LINK):
                word = clean_node(wxr, None, link)
                if word == "":
                    continue
                base_data.sounds.append(Sound(homophone=word))
def extract_transcript_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Extract transcriptions from the list items of a transcript section.

    "Hiragana", "Rómadži" and "Kana" templates are delegated to
    `extract_ja_transcript_template`. The first italic node with a
    non-empty label (a trailing ":" is removed) is used as a raw tag; the
    cleaned text of everything after it in the list item is stored in
    `zh_pron` for "Pinyin"/"Bopomofo" labels and in `other` otherwise,
    and the rest of that list item is not scanned further.
    """
    ja_templates = ("Hiragana", "Rómadži", "Kana")
    zh_labels = ("Pinyin", "Bopomofo")
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for index, node in enumerate(list_item.children):
                is_ja_template = (
                    isinstance(node, TemplateNode)
                    and node.template_name in ja_templates
                )
                if is_ja_template:
                    extract_ja_transcript_template(wxr, word_entry, node)
                    continue
                is_italic = (
                    isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC
                )
                if not is_italic:
                    continue
                label = clean_node(wxr, None, node).removesuffix(":")
                if label == "":
                    continue
                sound = Sound(raw_tags=[label])
                # Everything after the italic label is the transcription.
                transcription = clean_node(
                    wxr, None, list_item.children[index + 1 :]
                )
                if label in zh_labels:
                    sound.zh_pron = transcription
                else:
                    sound.other = transcription
                translate_raw_tags(sound)
                word_entry.sounds.append(sound)
                # The remainder of the item was consumed as the value.
                break
def extract_ja_transcript_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Add a `Form` for each classed span of an expanded transcript template.

    Spans with no class, or whose class ends in "-title", are skipped, as
    are spans whose cleaned text is empty. The span's class is used as the
    form's raw tag.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_template = wxr.wtp.parse(wikitext, expand_all=True)
    for span in expanded_template.find_html("span"):
        css_class = span.attrs.get("class", "")
        if css_class == "" or css_class.endswith("-title"):
            continue
        text = clean_node(wxr, None, span)
        if text == "":
            continue
        form = Form(form=text, raw_tags=[css_class])
        translate_raw_tags(form)
        word_entry.forms.append(form)
    # Result is discarded; the call is made with word_entry as the target.
    # NOTE(review): presumably this lets clean_node record page-level data
    # (e.g. categories) on word_entry - confirm against clean_node.
    clean_node(wxr, word_entry, expanded_template)