Coverage for src/wiktextract/extractor/ko/sound.py: 88%
122 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1from wikitextprocessor import LevelNode, NodeKind, TemplateNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..share import set_sound_file_url_fields
6from .models import Sound, WordEntry
7from .tags import translate_raw_tags
# Names of the ko.wiktionary templates this module extracts sound data from.
SOUND_TEMPLATES = frozenset({"발음 듣기", "IPA", "ko-IPA", "ja-pron"})
def extract_sound_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract sound data from every template found under a section node.

    Each template node found recursively below ``level_node`` is passed to
    ``extract_sound_template``, which adds any results to ``word_entry``.
    """
    templates = level_node.find_child_recursively(NodeKind.TEMPLATE)
    for template in templates:
        extract_sound_template(wxr, word_entry, template)
def extract_sound_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Hand a template node to the extractor registered for its name.

    Templates whose name has no registered extractor are ignored.
    """
    handlers = {
        "발음 듣기": extract_listen_pronunciation_template,
        "IPA": extract_ipa_template,
        "ko-IPA": extract_ko_ipa_template,
        "ja-pron": extract_ja_pron_template,
    }
    handler = handlers.get(node.template_name)
    if handler is not None:
        handler(wxr, word_entry, node)
def extract_listen_pronunciation_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract audio files and their labels from a "발음 듣기" template.

    Positional parameters 1 through 8 are read in order, stopping at the
    first one that is absent.  An odd-numbered parameter is treated as a
    sound file and becomes a new ``Sound``; an even-numbered parameter is
    appended as a raw tag to the last sound currently in
    ``word_entry.sounds``.  Parameters that clean to an empty string are
    skipped.
    """
    # https://ko.wiktionary.org/wiki/틀:발음_듣기
    params = node.template_parameters
    for index in range(1, 9):
        if index not in params:
            break
        text = clean_node(wxr, None, params[index])
        if text == "":
            continue
        if index % 2 == 1:
            new_sound = Sound()
            set_sound_file_url_fields(wxr, text, new_sound)
            word_entry.sounds.append(new_sound)
            continue
        if len(word_entry.sounds) == 0:
            # No sound to attach the label to.
            continue
        last_sound = word_entry.sounds[-1]
        last_sound.raw_tags.append(text)
        translate_raw_tags(last_sound)
def extract_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract IPA transcriptions and their labels from an "IPA" template.

    Positional parameters 1 through 4 are read in order, stopping at the
    first one that is absent.  An odd-numbered parameter becomes a new
    ``Sound`` with that IPA string; an even-numbered parameter is split on
    commas and each non-empty piece is added as a raw tag to the last sound
    currently in ``word_entry.sounds``.  Parameters that clean to an empty
    string are skipped.
    """
    # https://ko.wiktionary.org/wiki/틀:IPA
    params = node.template_parameters
    for index in range(1, 5):
        if index not in params:
            break
        text = clean_node(wxr, None, params[index])
        if text == "":
            continue
        if index % 2 == 1:
            word_entry.sounds.append(Sound(ipa=text))
        elif len(word_entry.sounds) > 0:
            last_sound = word_entry.sounds[-1]
            pieces = (piece.strip() for piece in text.split(","))
            last_sound.raw_tags.extend(
                piece for piece in pieces if piece != ""
            )
            translate_raw_tags(last_sound)
def extract_ko_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract pronunciations from a "ko-IPA" template.

    The template is expanded and re-parsed, then three kinds of sound are
    collected in this order and appended to ``word_entry.sounds``:

    1. phonetic hangeul and IPA entries from the ``<ul>`` lists,
    2. romanizations from the ``<table>`` rows,
    3. an audio file from the ``a`` (or ``audio``) template parameter.
    """
    # https://ko.wiktionary.org/wiki/틀:ko-IPA
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Result is discarded; called with word_entry for its side effects
    # (presumably category collection; confirm against clean_node).
    clean_node(wxr, word_entry, expanded_node)

    sounds = _ko_ipa_list_sounds(wxr, expanded_node)
    sounds.extend(_ko_ipa_table_sounds(wxr, expanded_node))

    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    )
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        sounds.append(sound)
    word_entry.sounds.extend(sounds)


def _ko_ipa_list_sounds(wxr: WiktextractContext, expanded_node) -> list:
    """Collect hangeul and IPA sounds from the <ul> lists of the expansion."""
    sounds = []
    for ul_node in expanded_node.find_html("ul"):
        for li_node in ul_node.find_html("li"):
            if "ko-pron__ph" in li_node.attrs.get("class", ""):
                # Phonetic hangeul item: "[a/b]" gives one sound per variant.
                for span_node in li_node.find_html(
                    "span", attr_name="lang", attr_value="ko"
                ):
                    hangeul_str = clean_node(wxr, None, span_node).strip("[]")
                    for hangeul in hangeul_str.split("/"):
                        if hangeul != "":
                            sounds.append(
                                Sound(hangeul=hangeul, tags=["phonetic"])
                            )
                continue

            # Any other item: its link texts (except the "IPA" link itself)
            # label the IPA spans in the same item.
            raw_tags = []
            for link_node in li_node.find_child(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if raw_tag not in ["", "IPA"]:
                    raw_tags.append(raw_tag)
            for span_node in li_node.find_html(
                "span", attr_name="class", attr_value="IPA"
            ):
                # Alternative transcriptions are separated by "~".
                for ipa in clean_node(wxr, None, span_node).split("~"):
                    ipa = ipa.strip()
                    if ipa != "":
                        sound = Sound(ipa=ipa, raw_tags=raw_tags)
                        translate_raw_tags(sound)
                        sounds.append(sound)
    return sounds


def _ko_ipa_table_sounds(wxr: WiktextractContext, expanded_node) -> list:
    """Collect romanization sounds from the <table> rows of the expansion."""
    sounds = []
    for table in expanded_node.find_html("table"):
        for tr in table.find_html("tr"):
            # The row header, when present, names the romanization system;
            # the last <th> wins if a row has several.
            raw_tag = ""
            for th in tr.find_html("th"):
                raw_tag = clean_node(wxr, None, th)
            for td in tr.find_html("td"):
                roman = clean_node(wxr, None, td)
                if roman == "":
                    continue
                sound = Sound(roman=roman)
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
                    translate_raw_tags(sound)
                sounds.append(sound)
    return sounds
def extract_ja_pron_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract pronunciations from a "ja-pron" template.

    The template is expanded and re-parsed.  Each ``<li>`` of each ``<ul>``
    yields at most one ``Sound``, filled from the item's ``<span>`` classes
    and accent links; it is kept only if it has an IPA or a romanization.
    An audio file named by the ``a`` (or ``audio``) parameter is added as a
    separate ``Sound``.
    """
    # https://ko.wiktionary.org/wiki/틀:ja-pron
    # Korean pitch-accent link texts mapped to the tags stored on the sound.
    JA_PRON_ACCENTS = {
        "중고형": "Nakadaka",
        "평판형": "Heiban",
        "두고형": "Atamadaka",
        "미고형": "Odaka",
    }
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for ul_tag in expanded_node.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            sound = Sound()
            for span_tag in li_tag.find_html("span"):
                span_class = span_tag.attrs.get("class", "").split()
                if "usage-label-accent" in span_class:
                    # Label text is wrapped in parentheses.
                    sound.raw_tags.append(
                        clean_node(wxr, None, span_tag).strip("()")
                    )
                elif "Jpan" in span_class:
                    sound.other = clean_node(wxr, None, span_tag)
                elif "Latn" in span_class:
                    sound.roman = clean_node(wxr, None, span_tag)
                elif "IPA" in span_class:
                    sound.ipa = clean_node(wxr, None, span_tag)
            for link_node in li_tag.find_child(NodeKind.LINK):
                link_text = clean_node(wxr, None, link_node)
                if link_text in JA_PRON_ACCENTS:
                    sound.tags.append(JA_PRON_ACCENTS[link_text])
            if sound.ipa != "" or sound.roman != "":
                translate_raw_tags(sound)
                word_entry.sounds.append(sound)

    # A template argument may be parsed nodes rather than a plain string, so
    # it goes through clean_node (as extract_ko_ipa_template does) instead of
    # calling .strip() on the raw argument.
    audio_file = clean_node(
        wxr,
        None,
        node.template_parameters.get(
            "a", node.template_parameters.get("audio", "")
        ),
    ).strip()
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        word_entry.sounds.append(sound)
    # Result is discarded; called with word_entry for its side effects
    # (presumably category collection; confirm against clean_node).
    clean_node(wxr, word_entry, expanded_node)