Coverage for src / wiktextract / extractor / ko / sound.py: 78%
140 statements
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-29 01:50 +0000
« prev ^ index » next coverage.py v7.13.1, created at 2025-12-29 01:50 +0000
1from wikitextprocessor import LevelNode, NodeKind, TemplateNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..share import set_sound_file_url_fields
6from .models import Sound, WordEntry
7from .tags import translate_raw_tags
# Names of the pronunciation templates handled by the extractors in this
# module.
SOUND_TEMPLATES = frozenset(
    {"발음 듣기", "IPA", "ko-IPA", "ja-pron", "audio"}
)
def extract_sound_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract sound data from every template found under ``level_node``.

    Each template node returned by the recursive search is handed to
    ``extract_sound_template`` together with ``word_entry``.
    """
    templates = level_node.find_child_recursively(NodeKind.TEMPLATE)
    for template in templates:
        extract_sound_template(wxr, word_entry, template)
def extract_sound_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Dispatch ``node`` to the extractor matching its template name.

    Templates whose name has no registered extractor are ignored.
    """
    extractors = {
        "발음 듣기": extract_listen_pronunciation_template,
        "IPA": extract_ipa_template,
        "ko-IPA": extract_ko_ipa_template,
        "ja-pron": extract_ja_pron_template,
        "audio": extract_audio_template,
    }
    extractor = extractors.get(node.template_name)
    if extractor is not None:
        extractor(wxr, word_entry, node)
def extract_listen_pronunciation_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract audio files and their labels from a "발음 듣기" template.

    Positional parameters 1-8 are read as pairs: an odd-numbered parameter
    is an audio file name and the even-numbered parameter that follows it
    is a raw tag for that file. Reading stops at the first missing
    parameter. Each non-empty file name adds one ``Sound`` to
    ``word_entry.sounds``.

    https://ko.wiktionary.org/wiki/틀:발음_듣기
    """
    # Sound created from the most recent odd-numbered parameter, if any.
    current_sound = None
    for key in range(1, 9):
        if key not in node.template_parameters:
            break
        value = clean_node(wxr, None, node.template_parameters[key])
        if key % 2 == 1:
            # A new file/label pair starts here.  Drop the previous pair's
            # sound so that a label whose own file name is empty is never
            # attached to an unrelated sound already in `word_entry.sounds`.
            current_sound = None
            if value != "":
                current_sound = Sound()
                set_sound_file_url_fields(wxr, value, current_sound)
                word_entry.sounds.append(current_sound)
        elif value != "" and current_sound is not None:
            current_sound.raw_tags.append(value)
            translate_raw_tags(current_sound)
def extract_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract IPA strings and their labels from an "IPA" template.

    Positional parameters 1-4 are read as pairs: an odd-numbered parameter
    is an IPA transcription and the even-numbered parameter that follows it
    is a comma-separated list of raw tags for that transcription. Reading
    stops at the first missing parameter. Each non-empty transcription adds
    one ``Sound`` to ``word_entry.sounds``.

    https://ko.wiktionary.org/wiki/틀:IPA
    """
    # Sound created from the most recent odd-numbered parameter, if any.
    current_sound = None
    for key in range(1, 5):
        if key not in node.template_parameters:
            break
        value = clean_node(wxr, None, node.template_parameters[key])
        if key % 2 == 1:
            # A new IPA/label pair starts here.  Drop the previous pair's
            # sound so that labels whose own IPA value is empty are never
            # attached to an unrelated sound already in `word_entry.sounds`.
            current_sound = None
            if value != "":
                current_sound = Sound(ipa=value)
                word_entry.sounds.append(current_sound)
        elif value != "" and current_sound is not None:
            for raw_tag in value.split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    current_sound.raw_tags.append(raw_tag)
            translate_raw_tags(current_sound)
def extract_ko_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Collect pronunciation data from an expanded "ko-IPA" template.

    The template is expanded and re-parsed, then three sources are read:

    * ``<li>`` elements inside ``<ul>``: items whose class contains
      "ko-pron__ph" yield hangeul spellings (split on "/") tagged
      "phonetic"; other items yield IPA strings (split on "~") with raw
      tags taken from the item's link children.
    * ``<table>`` rows: every non-empty ``<td>`` yields a romanization,
      with the row's last ``<th>`` text as raw tag when it is not empty.
    * the "a" (or "audio") template parameter: an audio file name.

    All results are appended to ``word_entry.sounds`` at the end.

    https://ko.wiktionary.org/wiki/틀:ko-IPA
    """
    collected = []
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    # Called with `word_entry` (not None), unlike the text-only calls below.
    clean_node(wxr, word_entry, expanded)

    for ul in expanded.find_html("ul"):
        for li in ul.find_html("li"):
            if "ko-pron__ph" in li.attrs.get("class", ""):
                for ko_span in li.find_html(
                    "span", attr_name="lang", attr_value="ko"
                ):
                    spellings = clean_node(wxr, None, ko_span).strip("[]")
                    collected.extend(
                        Sound(hangeul=spelling, tags=["phonetic"])
                        for spelling in spellings.split("/")
                        if spelling != ""
                    )
                continue

            # Any other list item: link texts are labels, IPA spans hold
            # the transcriptions.
            link_texts = (
                clean_node(wxr, None, link)
                for link in li.find_child(NodeKind.LINK)
            )
            labels = [text for text in link_texts if text not in ["", "IPA"]]
            for ipa_span in li.find_html(
                "span", attr_name="class", attr_value="IPA"
            ):
                transcriptions = clean_node(wxr, None, ipa_span)
                for piece in transcriptions.split("~"):
                    transcription = piece.strip()
                    if transcription == "":
                        continue
                    ipa_sound = Sound(ipa=transcription, raw_tags=labels)
                    translate_raw_tags(ipa_sound)
                    collected.append(ipa_sound)

    for table in expanded.find_html("table"):
        for row in table.find_html("tr"):
            header = ""
            for header_cell in row.find_html("th"):
                # When a row has several header cells the last one wins.
                header = clean_node(wxr, None, header_cell)
            for cell in row.find_html("td"):
                romanization = clean_node(wxr, None, cell)
                if romanization == "":
                    continue
                roman_sound = Sound(roman=romanization)
                if header != "":
                    roman_sound.raw_tags.append(header)
                    translate_raw_tags(roman_sound)
                collected.append(roman_sound)

    params = t_node.template_parameters
    audio_arg = params.get("a", params.get("audio", ""))
    audio_name = clean_node(wxr, None, audio_arg)
    if audio_name != "":
        audio_sound = Sound()
        set_sound_file_url_fields(wxr, audio_name, audio_sound)
        collected.append(audio_sound)

    word_entry.sounds.extend(collected)
def extract_ja_pron_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract pronunciation data from an expanded "ja-pron" template.

    The template is expanded and re-parsed. Each ``<li>`` inside a ``<ul>``
    becomes one ``Sound`` whose fields are filled from the item's
    ``<span>`` elements according to their class:

    * "usage-label-accent": text (without parentheses) added as raw tag
    * "Jpan": stored in ``other``
    * "Latn": stored in ``roman``
    * "IPA": stored in ``ipa``

    Link texts found in ``JA_PRON_ACCENTS`` add the mapped accent tag. The
    sound is kept only when it has an IPA or a romanization. The "a" (or
    "audio") template parameter, when not empty, adds a separate audio
    ``Sound``.

    https://ko.wiktionary.org/wiki/틀:ja-pron
    """
    JA_PRON_ACCENTS = {
        "중고형": "Nakadaka",
        "평판형": "Heiban",
        "두고형": "Atamadaka",
        "미고형": "Odaka",
    }
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for ul_tag in expanded_node.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            sound = Sound()
            for span_tag in li_tag.find_html("span"):
                span_class = span_tag.attrs.get("class", "").split()
                if "usage-label-accent" in span_class:
                    sound.raw_tags.append(
                        clean_node(wxr, None, span_tag).strip("()")
                    )
                elif "Jpan" in span_class:
                    sound.other = clean_node(wxr, None, span_tag)
                elif "Latn" in span_class:
                    sound.roman = clean_node(wxr, None, span_tag)
                elif "IPA" in span_class:
                    sound.ipa = clean_node(wxr, None, span_tag)
            for link_node in li_tag.find_child(NodeKind.LINK):
                link_text = clean_node(wxr, None, link_node)
                if link_text in JA_PRON_ACCENTS:
                    sound.tags.append(JA_PRON_ACCENTS[link_text])
            if sound.ipa != "" or sound.roman != "":
                translate_raw_tags(sound)
                word_entry.sounds.append(sound)

    # Convert the argument to text with `clean_node` before stripping, the
    # same way `extract_ko_ipa_template` reads this parameter: calling
    # `.strip()` on the raw argument fails when it is not a plain string.
    audio_file = clean_node(
        wxr,
        None,
        node.template_parameters.get(
            "a", node.template_parameters.get("audio", "")
        ),
    ).strip()
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        word_entry.sounds.append(sound)
    # Called with `word_entry` (not None), unlike the text-only calls above.
    clean_node(wxr, word_entry, expanded_node)
def extract_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an audio file and its labels from an "audio" template.

    Positional parameter 2 is the audio file name and parameter 3 an
    optional caption stored as a raw tag. Further raw tags come from the
    comma-separated text of ``<span class="ib-content">`` elements in the
    expanded template. A ``Sound`` is appended to ``base_data.sounds`` only
    when the file name is not empty.
    """
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if filename != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, filename, sound)
        caption = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
        if caption != "":
            sound.raw_tags.append(caption)
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        for span_node in expanded_node.find_html_recursively(
            "span", attr_name="class", attr_value="ib-content"
        ):
            for raw_tag in clean_node(wxr, None, span_node).split(","):
                # Strip each piece, as `extract_ipa_template` does, so
                # that " tag" and whitespace-only pieces are not stored.
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
    # Called with `base_data` (not None), unlike the text-only calls above,
    # and run whether or not a file name was found.
    clean_node(wxr, base_data, t_node)