Coverage for src / wiktextract / extractor / ko / sound.py: 78%

140 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2025-12-29 01:50 +0000

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..share import set_sound_file_url_fields 

6from .models import Sound, WordEntry 

7from .tags import translate_raw_tags 

8 

# Names of the templates that extract_sound_template has an extractor for.
SOUND_TEMPLATES = frozenset(
    {"발음 듣기", "IPA", "ko-IPA", "ja-pron", "audio"}
)

10 

11 

def extract_sound_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Run the sound-template extractor on every template in a section.

    Template nodes are gathered with ``find_child_recursively`` and each
    one is handed to ``extract_sound_template`` together with
    ``word_entry``.
    """
    templates = level_node.find_child_recursively(NodeKind.TEMPLATE)
    for template in templates:
        extract_sound_template(wxr, word_entry, template)

17 

18 

def extract_sound_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Dispatch a template node to the extractor registered for its name.

    Templates whose name has no extractor are silently ignored.
    """
    handlers = {
        "발음 듣기": extract_listen_pronunciation_template,
        "IPA": extract_ipa_template,
        "ko-IPA": extract_ko_ipa_template,
        "ja-pron": extract_ja_pron_template,
        "audio": extract_audio_template,
    }
    handler = handlers.get(node.template_name)
    if handler is not None:
        handler(wxr, word_entry, node)

32 

33 

def extract_listen_pronunciation_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Read alternating file/tag arguments from a "발음 듣기" template.

    Positional arguments 1 through 8 are scanned in order and the scan
    stops at the first argument that is absent. Arguments that clean to
    an empty string are skipped. An odd argument creates a new ``Sound``
    from a file name; an even argument is added as a raw tag to the sound
    currently last in ``word_entry.sounds``.
    """
    # https://ko.wiktionary.org/wiki/틀:발음_듣기
    params = node.template_parameters
    for index in range(1, 9):
        if index not in params:
            break
        text = clean_node(wxr, None, params[index])
        if text == "":
            continue
        if index % 2 == 1:
            # Odd position: file name.
            new_sound = Sound()
            set_sound_file_url_fields(wxr, text, new_sound)
            word_entry.sounds.append(new_sound)
            continue
        if len(word_entry.sounds) == 0:
            # Even position but nothing to attach the tag to.
            continue
        last_sound = word_entry.sounds[-1]
        last_sound.raw_tags.append(text)
        translate_raw_tags(last_sound)

51 

52 

def extract_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Read alternating IPA/tag arguments from an "IPA" template.

    Positional arguments 1 through 4 are scanned in order and the scan
    stops at the first argument that is absent. Arguments that clean to
    an empty string are skipped. An odd argument becomes the ``ipa`` of a
    new ``Sound``; an even argument is split on commas and each non-empty
    piece is added as a raw tag to the sound currently last in
    ``word_entry.sounds``.
    """
    # https://ko.wiktionary.org/wiki/틀:IPA
    params = node.template_parameters
    for index in range(1, 5):
        if index not in params:
            break
        text = clean_node(wxr, None, params[index])
        if text == "":
            continue
        if index % 2 == 1:
            # Odd position: an IPA transcription.
            word_entry.sounds.append(Sound(ipa=text))
            continue
        if len(word_entry.sounds) == 0:
            # Even position but nothing to attach the tags to.
            continue
        last_sound = word_entry.sounds[-1]
        for piece in text.split(","):
            tag = piece.strip()
            if tag != "":
                last_sound.raw_tags.append(tag)
        translate_raw_tags(last_sound)

72 

73 

def extract_ko_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Collect sounds from the expanded HTML of a "ko-IPA" template.

    The template is converted back to wikitext, expanded and re-parsed.
    ``<li>`` items yield phonetic-hangeul sounds or IPA sounds, table rows
    yield romanization sounds, and the ``a``/``audio`` argument yields an
    audio-file sound. All collected sounds are added to
    ``word_entry.sounds`` in one step at the end.
    """
    # https://ko.wiktionary.org/wiki/틀:ko-IPA
    collected = []
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    # The return value is discarded; this is the only clean_node call here
    # that receives word_entry -- presumably to record page-level data
    # such as categories on the entry (confirm against clean_node).
    clean_node(wxr, word_entry, root)

    for ul_node in root.find_html("ul"):
        for li_node in ul_node.find_html("li"):
            li_class = li_node.attrs.get("class", "")
            if "ko-pron__ph" not in li_class:
                # IPA item: link texts (other than "IPA") become raw tags
                # for every transcription found in the item.
                raw_tags = []
                for link in li_node.find_child(NodeKind.LINK):
                    link_text = clean_node(wxr, None, link)
                    if link_text not in ["", "IPA"]:
                        raw_tags.append(link_text)
                for ipa_span in li_node.find_html(
                    "span", attr_name="class", attr_value="IPA"
                ):
                    ipa_text = clean_node(wxr, None, ipa_span)
                    for piece in ipa_text.split("~"):
                        piece = piece.strip()
                        if piece == "":
                            continue
                        ipa_sound = Sound(ipa=piece, raw_tags=raw_tags)
                        translate_raw_tags(ipa_sound)
                        collected.append(ipa_sound)
                continue

            # Phonetic hangeul item: "/"-separated spellings with the
            # surrounding square brackets removed.
            for ko_span in li_node.find_html(
                "span", attr_name="lang", attr_value="ko"
            ):
                spellings = clean_node(wxr, None, ko_span).strip("[]")
                for spelling in spellings.split("/"):
                    if spelling != "":
                        collected.append(
                            Sound(hangeul=spelling, tags=["phonetic"])
                        )

    for table_node in root.find_html("table"):
        for row in table_node.find_html("tr"):
            # The text of the last <th> in the row labels every <td> in it.
            header = ""
            for th_node in row.find_html("th"):
                header = clean_node(wxr, None, th_node)
            for td_node in row.find_html("td"):
                roman_text = clean_node(wxr, None, td_node)
                if roman_text == "":
                    continue
                roman_sound = Sound(roman=roman_text)
                if header != "":
                    roman_sound.raw_tags.append(header)
                    translate_raw_tags(roman_sound)
                collected.append(roman_sound)

    # "a" takes precedence over "audio" when both are present.
    params = t_node.template_parameters
    audio_arg = params.get("a", params.get("audio", ""))
    audio_name = clean_node(wxr, None, audio_arg)
    if audio_name != "":
        audio_sound = Sound()
        set_sound_file_url_fields(wxr, audio_name, audio_sound)
        collected.append(audio_sound)
    word_entry.sounds.extend(collected)

138 

139 

def extract_ja_pron_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Collect sounds from the expanded HTML of a "ja-pron" template.

    The template is converted back to wikitext, expanded and re-parsed.
    Each ``<li>`` item builds one ``Sound`` from its ``<span>`` children,
    selected by CSS class, and from accent-type links. The item is kept
    only when it produced an IPA or a romanization. The ``a``/``audio``
    argument, when non-empty, adds a separate audio-file sound.
    """
    # https://ko.wiktionary.org/wiki/틀:ja-pron
    # Korean accent-type link text -> tag added to the sound.
    JA_PRON_ACCENTS = {
        "중고형": "Nakadaka",
        "평판형": "Heiban",
        "두고형": "Atamadaka",
        "미고형": "Odaka",
    }
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for ul_tag in expanded_node.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            sound = Sound()
            for span_tag in li_tag.find_html("span"):
                span_class = span_tag.attrs.get("class", "").split()
                if "usage-label-accent" in span_class:
                    # The label text is wrapped in parentheses.
                    sound.raw_tags.append(
                        clean_node(wxr, None, span_tag).strip("()")
                    )
                elif "Jpan" in span_class:
                    sound.other = clean_node(wxr, None, span_tag)
                elif "Latn" in span_class:
                    sound.roman = clean_node(wxr, None, span_tag)
                elif "IPA" in span_class:
                    sound.ipa = clean_node(wxr, None, span_tag)
            for link_node in li_tag.find_child(NodeKind.LINK):
                link_text = clean_node(wxr, None, link_node)
                if link_text in JA_PRON_ACCENTS:
                    sound.tags.append(JA_PRON_ACCENTS[link_text])
            if sound.ipa != "" or sound.roman != "":
                translate_raw_tags(sound)
                word_entry.sounds.append(sound)
    # The raw argument is not guaranteed to be a plain string, so calling
    # ``.strip()`` on it directly can raise AttributeError. Convert it with
    # clean_node first, as extract_ko_ipa_template does for the same
    # argument. "a" takes precedence over "audio".
    audio_file = clean_node(
        wxr,
        None,
        node.template_parameters.get(
            "a", node.template_parameters.get("audio", "")
        ),
    ).strip()
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        word_entry.sounds.append(sound)
    # The return value is discarded; presumably this records page-level
    # data such as categories on the entry (confirm against clean_node).
    clean_node(wxr, word_entry, expanded_node)

183 

184 

def extract_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
) -> None:
    """Build one ``Sound`` from an "audio" template and add it to the entry.

    Positional argument 2 is the file name and positional argument 3 is a
    caption stored as a raw tag. The template is also expanded, and the
    comma-separated text of every ``<span class="ib-content">`` in the
    result contributes further raw tags. The sound is appended to
    ``base_data.sounds`` even when no file name was given.
    """
    sound = Sound()
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if filename != "":
        set_sound_file_url_fields(wxr, filename, sound)
    caption = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
    if caption != "":
        sound.raw_tags.append(caption)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_node in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        for raw_tag in clean_node(wxr, None, span_node).split(","):
            # Splitting "a, b" leaves " b"; strip each piece so the tag has
            # no stray whitespace and whitespace-only pieces are dropped,
            # matching how extract_ipa_template handles comma lists.
            raw_tag = raw_tag.strip()
            if raw_tag != "":
                sound.raw_tags.append(raw_tag)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
    # The return value is discarded; presumably this records page-level
    # data such as categories on the entry (confirm against clean_node).
    clean_node(wxr, base_data, t_node)
206 clean_node(wxr, base_data, t_node)