Coverage for src / wiktextract / extractor / ko / sound.py: 88%

122 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..share import set_sound_file_url_fields 

6from .models import Sound, WordEntry 

7from .tags import translate_raw_tags 

8 

# Names of the pronunciation templates that extract_sound_template dispatches
# on; immutable so it can be shared safely for membership checks.
SOUND_TEMPLATES = frozenset({"발음 듣기", "IPA", "ko-IPA", "ja-pron"})

10 

11 

def extract_sound_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Process every template found anywhere under a pronunciation section.

    Each template node nested under ``level_node`` is passed, in traversal
    order, to ``extract_sound_template`` together with ``word_entry``.
    """
    templates = level_node.find_child_recursively(NodeKind.TEMPLATE)
    for template in templates:
        extract_sound_template(wxr, word_entry, template)

17 

18 

def extract_sound_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Route a template node to the extractor matching its template name.

    Templates whose name is not one of the four handled below are ignored.
    """
    handlers = {
        "발음 듣기": extract_listen_pronunciation_template,
        "IPA": extract_ipa_template,
        "ko-IPA": extract_ko_ipa_template,
        "ja-pron": extract_ja_pron_template,
    }
    handler = handlers.get(node.template_name)
    if handler is not None:
        handler(wxr, word_entry, node)

30 

31 

def extract_listen_pronunciation_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract audio sounds and their labels from a "발음 듣기" template.

    Positional arguments 1-8 are read as pairs: the cleaned value at an odd
    position is passed to ``set_sound_file_url_fields`` to fill a new
    ``Sound``, and the cleaned value at the following even position is added
    to that sound as a raw tag.  Reading stops at the first missing
    position.

    A label is only attached to the sound created from its own pair.  If the
    file slot of a pair is empty, the label is dropped instead of being
    attached to whatever sound happens to be last in ``word_entry.sounds``
    (which could belong to an earlier pair or to a different template).
    """
    # https://ko.wiktionary.org/wiki/틀:발음_듣기
    current_sound = None
    for key in range(1, 9):
        if key not in node.template_parameters:
            break
        value = clean_node(wxr, None, node.template_parameters[key])
        if key % 2 == 1:
            # A new pair starts here; forget the previous pair's sound so a
            # label can never leak onto it.
            current_sound = None
            if value != "":
                current_sound = Sound()
                set_sound_file_url_fields(wxr, value, current_sound)
                word_entry.sounds.append(current_sound)
        elif value != "" and current_sound is not None:
            current_sound.raw_tags.append(value)
            translate_raw_tags(current_sound)

49 

50 

def extract_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract IPA transcriptions and their labels from an "IPA" template.

    Positional arguments 1-4 are read as pairs: the cleaned value at an odd
    position becomes the ``ipa`` of a new ``Sound``; the cleaned value at the
    following even position is split on commas and each non-empty piece is
    added to that sound as a raw tag.  Reading stops at the first missing
    position.

    Labels are only attached to the sound created from their own pair.  If
    the IPA slot of a pair is empty, its labels are dropped instead of being
    attached to whatever sound happens to be last in ``word_entry.sounds``.
    """
    # https://ko.wiktionary.org/wiki/틀:IPA
    current_sound = None
    for key in range(1, 5):
        if key not in node.template_parameters:
            break
        value = clean_node(wxr, None, node.template_parameters[key])
        if key % 2 == 1:
            # A new pair starts here; forget the previous pair's sound so
            # labels can never leak onto it.
            current_sound = None
            if value != "":
                current_sound = Sound(ipa=value)
                word_entry.sounds.append(current_sound)
        elif value != "" and current_sound is not None:
            for raw_tag in value.split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    current_sound.raw_tags.append(raw_tag)
            # Translate once, after all labels of this pair were collected.
            translate_raw_tags(current_sound)

70 

71 

def extract_ko_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract sounds from the expanded output of a "ko-IPA" template.

    The template is expanded and re-parsed, then three sources are read:

    * ``<li>`` items inside ``<ul>`` lists: items whose class contains
      "ko-pron__ph" yield one ``Sound`` per "/"-separated spelling found in
      ``<span lang="ko">`` (tagged "phonetic"); all other items yield one
      ``Sound`` per "~"-separated value found in ``<span class="IPA">``,
      carrying the item's link texts (except "IPA") as raw tags.
    * ``<table>`` rows: each non-empty ``<td>`` yields a ``Sound`` with
      ``roman`` set and the row's last ``<th>`` text as raw tag.
    * the "a" (or, when "a" is absent, "audio") template argument: a
      non-empty value yields a ``Sound`` filled by
      ``set_sound_file_url_fields``.

    All collected sounds are appended to ``word_entry.sounds`` at the end.
    """
    # https://ko.wiktionary.org/wiki/틀:ko-IPA
    collected = []
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    # This call receives the word entry itself; every other clean_node call
    # in this function passes None instead.
    clean_node(wxr, word_entry, root)

    for ul_tag in root.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            if "ko-pron__ph" not in li_tag.attrs.get("class", ""):
                # IPA item: link texts act as labels for every IPA value.
                labels = []
                for link in li_tag.find_child(NodeKind.LINK):
                    label = clean_node(wxr, None, link)
                    if label not in ["", "IPA"]:
                        labels.append(label)
                for ipa_span in li_tag.find_html(
                    "span", attr_name="class", attr_value="IPA"
                ):
                    ipa_text = clean_node(wxr, None, ipa_span)
                    for piece in ipa_text.split("~"):
                        piece = piece.strip()
                        if piece == "":
                            continue
                        ipa_sound = Sound(ipa=piece, raw_tags=labels)
                        translate_raw_tags(ipa_sound)
                        collected.append(ipa_sound)
                continue

            # Phonetic hangeul item.
            for ko_span in li_tag.find_html(
                "span", attr_name="lang", attr_value="ko"
            ):
                spelling = clean_node(wxr, None, ko_span).strip("[]")
                for part in spelling.split("/"):
                    if part == "":
                        continue
                    collected.append(Sound(hangeul=part, tags=["phonetic"]))

    for table_tag in root.find_html("table"):
        for row in table_tag.find_html("tr"):
            header = ""
            for th_tag in row.find_html("th"):
                header = clean_node(wxr, None, th_tag)
            for td_tag in row.find_html("td"):
                romanization = clean_node(wxr, None, td_tag)
                if romanization == "":
                    continue
                roman_sound = Sound(roman=romanization)
                if header != "":
                    roman_sound.raw_tags.append(header)
                    translate_raw_tags(roman_sound)
                collected.append(roman_sound)

    params = t_node.template_parameters
    audio_arg = params.get("a", params.get("audio", ""))
    audio_name = clean_node(wxr, None, audio_arg)
    if audio_name != "":
        audio_sound = Sound()
        set_sound_file_url_fields(wxr, audio_name, audio_sound)
        collected.append(audio_sound)
    word_entry.sounds.extend(collected)

136 

137 

def extract_ja_pron_template(
    wxr: WiktextractContext, word_entry: WordEntry, node: TemplateNode
) -> None:
    """Extract sounds from the expanded output of a "ja-pron" template.

    The template is expanded and re-parsed.  For every ``<li>`` inside a
    ``<ul>`` one ``Sound`` is built from the item's ``<span>`` elements,
    selected by CSS class: "usage-label-accent" adds a raw tag (surrounding
    parentheses stripped), "Jpan" sets ``other``, "Latn" sets ``roman`` and
    "IPA" sets ``ipa``.  Link texts that match a known pitch-accent pattern
    name add the corresponding tag.  The sound is kept only if it has an
    IPA or a romanization.

    A non-empty "a" (or, when "a" is absent, "audio") template argument
    yields one more ``Sound`` filled by ``set_sound_file_url_fields``.
    Finally the expanded tree is passed to ``clean_node`` together with
    ``word_entry``.
    """
    # https://ko.wiktionary.org/wiki/틀:ja-pron
    # Link text in the expanded template -> tag added to the sound.
    JA_PRON_ACCENTS = {
        "중고형": "Nakadaka",
        "평판형": "Heiban",
        "두고형": "Atamadaka",
        "미고형": "Odaka",
    }
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for ul_tag in expanded_node.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            sound = Sound()
            for span_tag in li_tag.find_html("span"):
                span_class = span_tag.attrs.get("class", "").split()
                if "usage-label-accent" in span_class:
                    sound.raw_tags.append(
                        clean_node(wxr, None, span_tag).strip("()")
                    )
                elif "Jpan" in span_class:
                    sound.other = clean_node(wxr, None, span_tag)
                elif "Latn" in span_class:
                    sound.roman = clean_node(wxr, None, span_tag)
                elif "IPA" in span_class:
                    sound.ipa = clean_node(wxr, None, span_tag)
            for link_node in li_tag.find_child(NodeKind.LINK):
                link_text = clean_node(wxr, None, link_node)
                if link_text in JA_PRON_ACCENTS:
                    sound.tags.append(JA_PRON_ACCENTS[link_text])
            if sound.ipa != "" or sound.roman != "":
                translate_raw_tags(sound)
                word_entry.sounds.append(sound)
    # Template argument values are not guaranteed to be plain strings, so
    # convert through clean_node (as extract_ko_ipa_template does) instead of
    # calling str methods on the raw value.
    audio_file = clean_node(
        wxr,
        None,
        node.template_parameters.get(
            "a", node.template_parameters.get("audio", "")
        ),
    ).strip()
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        word_entry.sounds.append(sound)
    clean_node(wxr, word_entry, expanded_node)