Coverage for src / wiktextract / extractor / ms / sound.py: 57%

182 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-02 00:27 +0000

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..share import set_sound_file_url_fields 

6from .models import Hyphenation, Sound, WordEntry 

7from .tags import translate_raw_tags 

8 

9 

def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract sounds from a pronunciation section and attach them to entries.

    Sounds are gathered from every list item directly under ``level_node``
    and from every template that is a direct child of ``level_node``.
    They are then merged into one of three targets:

    * ``base_data`` when ``page_data`` is empty or its last entry has a
      different ``lang_code`` than ``base_data``;
    * every entry in ``page_data`` sharing the last entry's ``lang_code``
      when ``level_node`` is a level-3 node;
    * only the last entry of ``page_data`` otherwise.

    Args:
        wxr: Extraction context passed through to the helper extractors.
        page_data: Word entries collected so far for the page; mutated.
        base_data: Entry holding data shared at language level; mutated.
        level_node: The section node whose children are scanned.
    """
    sounds = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(extract_sound_list_item(wxr, list_item))
    for node in level_node.find_child(NodeKind.TEMPLATE):
        sounds.extend(extract_sound_templates(wxr, node, []))

    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        _add_sounds_to_entry(base_data, sounds)
    elif level_node.kind == NodeKind.LEVEL3:
        lang_code = page_data[-1].lang_code
        for data in page_data:
            if data.lang_code == lang_code:
                _add_sounds_to_entry(data, sounds)
    else:
        _add_sounds_to_entry(page_data[-1], sounds)


def _add_sounds_to_entry(entry: WordEntry, sounds: list[Sound]) -> None:
    """Merge ``sounds`` into ``entry``.

    A sound carrying hyphenations contributes only those hyphenations;
    any other sound is appended to ``entry.sounds``. In both cases the
    sound's categories are added to ``entry.categories`` without
    duplicating values that are already present.
    """
    for sound in sounds:
        if len(sound.hyphenations) > 0:
            entry.hyphenations.extend(sound.hyphenations)
        else:
            entry.sounds.append(sound)
        for cat in sound.categories:
            # Membership must be checked against the categories list itself,
            # not the entry object, otherwise duplicates slip through.
            if cat not in entry.categories:
                entry.categories.append(cat)

52 

53 

def extract_sound_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
    """Collect the sounds described by one list item and its nested lists.

    Children are visited in order. ``a``/``accent`` templates and plain
    strings ending in ``:`` become raw tags that are handed to the sound
    templates that follow them; every other template is dispatched through
    ``extract_sound_templates``; nested lists are processed recursively.
    Categories gathered while cleaning the accent templates are appended
    to every returned sound.
    """
    labels = []
    category_data = {}
    found = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            if child.template_name not in ["a", "accent"]:
                found.extend(extract_sound_templates(wxr, child, labels))
                continue
            label = clean_node(wxr, category_data, child).strip("() ")
            if label:
                labels.append(label)
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                found.extend(extract_sound_list_item(wxr, nested_item))
            continue
        if isinstance(child, str) and child.strip().endswith(":"):
            label = child.strip(": ")
            if label:
                labels.append(label)

    extra_categories = category_data.get("categories", [])
    for item in found:
        item.categories.extend(extra_categories)
    return found

78 

79 

def extract_sound_templates(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Dispatch a pronunciation template to its dedicated extractor.

    The checks run in a fixed order, so the exact-case ``audio-AFA`` /
    ``audio-IPA`` names and the Korean templates are matched before the
    generic, case-insensitive ``afa``/``ipa`` (and ``*-afa``/``*-ipa``)
    rule. Templates that match nothing yield an empty list.
    """
    name = t_node.template_name
    lowered = name.lower()

    if name == "dewan":
        extracted = extract_dewan_template(wxr, t_node)
    elif name in ("audio-AFA", "audio-IPA"):
        extracted = extract_audio_ipa_template(wxr, t_node, raw_tags)
    elif lowered in ("ko-afa", "ko-ipa", "ko-pron"):
        extracted = extract_ko_ipa_template(wxr, t_node)
    elif lowered in ("afa", "ipa") or lowered.endswith(("-afa", "-ipa")):
        extracted = extract_ipa_template(wxr, t_node, raw_tags)
    elif name in ("penyempangan", "hyphenation", "hyph"):
        extracted = extract_hyph_template(wxr, t_node)
    elif name == "audio":
        extracted = extract_audio_template(wxr, t_node)
    elif name in ("rima", "rhymes", "rhyme"):
        extracted = extract_rhyme_template(wxr, t_node)
    else:
        extracted = []
    # Always hand back a fresh list, as callers extend their own lists
    # with the result.
    return list(extracted)

102 

103 

def extract_dewan_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Turn a ``dewan`` template into a sound tagged "Kamus Dewan".

    The cleaned template text, with a leading "Kamus Dewan:" prefix
    removed and whitespace trimmed, is stored in ``Sound.other``. An
    empty result produces no sound.
    """
    category_data = {}
    cleaned = clean_node(wxr, category_data, t_node)
    text = cleaned.removeprefix("Kamus Dewan:").strip()
    if text == "":
        return []
    return [
        Sound(
            other=text,
            raw_tags=["Kamus Dewan"],
            categories=category_data.get("categories", []),
        )
    ]

119 

120 

def extract_ipa_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract IPA transcriptions from an expanded IPA-style template.

    The template is expanded and re-parsed; each ``<span class="IPA">``
    with non-empty text becomes a sound carrying the given ``raw_tags``
    and the categories gathered while cleaning the expanded template.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    category_data = {}
    clean_node(wxr, category_data, expanded)

    results = []
    ipa_spans = expanded.find_html(
        "span", attr_name="class", attr_value="IPA"
    )
    for ipa_span in ipa_spans:
        ipa_text = clean_node(wxr, None, ipa_span)
        if ipa_text == "":
            continue
        item = Sound(
            ipa=ipa_text,
            raw_tags=raw_tags,
            categories=category_data.get("categories", []),
        )
        translate_raw_tags(item)
        results.append(item)
    return results

143 

144 

def extract_hyph_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract hyphenations from an expanded hyphenation template.

    Spans whose ``lang`` attribute equals the template's first positional
    argument are read from the expansion; each non-empty span text is
    split on "‧" and wrapped in a sound holding a single hyphenation.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))

    span_texts = (
        clean_node(wxr, None, span)
        for span in expanded.find_html(
            "span", attr_name="lang", attr_value=lang_code
        )
    )
    return [
        Sound(hyphenations=[Hyphenation(parts=span_text.split("‧"))])
        for span_text in span_texts
        if span_text != ""
    ]

162 

163 

def extract_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Build a sound from an ``audio`` template's file name.

    The file name is the template's second positional argument. When it
    is empty nothing is returned; otherwise a single sound is created
    with the template's categories and its file URL fields filled in by
    ``set_sound_file_url_fields``.
    """
    file_arg = t_node.template_parameters.get(2, "")
    filename = clean_node(wxr, None, file_arg)
    category_data = {}
    clean_node(wxr, category_data, t_node)
    if filename == "":
        return []

    audio = Sound(categories=category_data.get("categories", []))
    set_sound_file_url_fields(wxr, filename, audio)
    return [audio]

176 

177 

def extract_rhyme_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract rhymes from the links of an expanded rhyme template.

    Each link that is a direct child of the expanded template and has
    non-empty text yields a sound whose ``rhymes`` is that text, carrying
    the categories gathered while cleaning the expansion.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    category_data = {}
    clean_node(wxr, category_data, expanded)

    results = []
    for link_node in expanded.find_child(NodeKind.LINK):
        rhyme = Sound(categories=category_data.get("categories", []))
        rhyme_text = clean_node(wxr, None, link_node)
        if rhyme_text == "":
            continue
        rhyme.rhymes = rhyme_text
        results.append(rhyme)
    return results

194 

195 

def extract_audio_ipa_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Build a sound from an ``audio-AFA``/``audio-IPA`` template.

    The second positional argument is the audio file name and the third
    is the IPA text. No sound is produced when the file name is empty.
    ``raw_tags`` is accepted for signature parity with the other
    extractors but is not used here.
    """
    params = t_node.template_parameters
    filename = clean_node(wxr, None, params.get(2, ""))
    category_data = {}
    clean_node(wxr, category_data, t_node)
    if filename == "":
        return []

    ipa_text = clean_node(wxr, None, params.get(3, ""))
    audio = Sound(
        ipa=ipa_text, categories=category_data.get("categories", [])
    )
    set_sound_file_url_fields(wxr, filename, audio)
    return [audio]

211 

212 

def extract_ko_ipa_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract sounds from an expanded Korean pronunciation template.

    Three parts of the expansion are read, in this order:

    1. ``<ul>``/``<li>`` items: an item whose class contains
       ``ko-pron__ph`` yields "phonetic" hangeul sounds (split on "/");
       any other item yields IPA sounds (split on "~") tagged with the
       item's italic labels, ignoring empty labels and "AFA".
    2. ``<table>`` rows: each non-empty ``<td>`` becomes a romanization
       sound, tagged with the row's last ``<th>`` text when present.
    3. The ``a`` (or, failing that, ``audio``) template argument: a
       non-empty value becomes an audio sound.
    """
    results = []
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    for ul_tag in expanded.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            if "ko-pron__ph" not in li_tag.attrs.get("class", ""):
                labels = []
                for i_tag in li_tag.find_html("i"):
                    italic_text = clean_node(wxr, None, i_tag)
                    labels.extend(
                        part
                        for part in italic_text.split("/")
                        if part not in ["", "AFA"]
                    )
                for ipa_span in li_tag.find_html(
                    "span", attr_name="class", attr_value="IPA"
                ):
                    ipa_text = clean_node(wxr, None, ipa_span)
                    for piece in ipa_text.split("~"):
                        piece = piece.strip()
                        if piece == "":
                            continue
                        ipa_sound = Sound(ipa=piece, raw_tags=labels)
                        translate_raw_tags(ipa_sound)
                        results.append(ipa_sound)
                continue

            for ko_span in li_tag.find_html(
                "span", attr_name="lang", attr_value="ko"
            ):
                hangeul_text = clean_node(wxr, None, ko_span).strip("[]")
                results.extend(
                    Sound(hangeul=part, tags=["phonetic"])
                    for part in hangeul_text.split("/")
                    if part != ""
                )

    for table_tag in expanded.find_html("table"):
        for row in table_tag.find_html("tr"):
            # The last header cell of the row wins as the row label.
            row_label = ""
            for header_cell in row.find_html("th"):
                row_label = clean_node(wxr, None, header_cell)
            for data_cell in row.find_html("td"):
                roman_text = clean_node(wxr, None, data_cell)
                if roman_text == "":
                    continue
                roman_sound = Sound(roman=roman_text)
                if row_label != "":
                    roman_sound.raw_tags.append(row_label)
                    translate_raw_tags(roman_sound)
                results.append(roman_sound)

    params = t_node.template_parameters
    audio_arg = params.get("a", params.get("audio", ""))
    audio_file = clean_node(wxr, None, audio_arg)
    if audio_file != "":
        audio_sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, audio_sound)
        results.append(audio_sound)

    if results:
        # The last sound is handed to clean_node as its data argument;
        # presumably this records the template's categories on it --
        # confirm against clean_node.
        clean_node(wxr, results[-1], expanded)
    return results