Coverage for src/wiktextract/extractor/vi/sound.py: 81%

149 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1from dataclasses import dataclass 

2 

3from wikitextprocessor import ( 

4 HTMLNode, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from ..share import set_sound_file_url_fields 

14from .models import Hyphenation, Sound, WordEntry 

15from .tags import translate_raw_tags 

16 

17 

def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Route every child of a sound section to the matching extractor.

    Template children are dispatched on their template name; list children
    have each of their list items passed to ``extract_sound_list_item``.
    Children of any other kind, and templates with other names, are skipped.

    Args:
        wxr: Extraction context forwarded to the extractors.
        base_data: Word entry that receives the extracted data.
        level_node: Section node whose direct children are scanned.
    """
    pron_audio_names = ("âm thanh-IPA", "pron-audio", "audio-for-pron")
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name == "vie-pron":
                extract_vie_pron_template(wxr, base_data, child)
            elif name in pron_audio_names:
                extract_pron_audio_template(wxr, base_data, child)
            elif name == "tyz-IPA":
                extract_tyz_ipa_template(wxr, base_data, child)
            # A template never falls through to the list handling below.
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, item)

36 

37 

def extract_sound_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    """Extract sound data from one list item, recursing into nested lists.

    Template children are dispatched on their template name:

    * "âm thanh" / "audio" (case-insensitive) -> ``extract_audio_template``
    * "IPA", "IPA2", "IPA3", "IPA4", "fra-IPA", "fr-IPA"
      -> ``extract_ipa_template`` looking for the "IPA" class
    * "enPR", "AHD" -> ``extract_ipa_template`` looking for the "enPR" class
    * "rhymes", "rhyme" -> ``extract_rhymes_template``
    * "hyphenation", "hyph" -> ``extract_hyphenation_template``

    Nested list children are processed item by item with this same function.

    Args:
        wxr: Extraction context forwarded to the extractors.
        base_data: Word entry that receives the extracted data.
        list_item: List item node whose direct children are scanned.
    """
    # Function-scope import so this block does not depend on a new
    # file-level import.
    import unicodedata

    for node in list_item.children:
        if isinstance(node, TemplateNode):
            name = node.template_name
            # Vietnamese text may be stored composed (U+00E2) or decomposed
            # ("a" + U+0302). Normalising to NFC lets one literal match both
            # spellings instead of keeping two look-alike literals.
            audio_name = unicodedata.normalize("NFC", name.lower())
            if audio_name in ("\u00e2m thanh", "audio"):  # "âm thanh"
                extract_audio_template(wxr, base_data, node)
            elif name in ("IPA", "IPA2", "IPA3", "IPA4", "fra-IPA", "fr-IPA"):
                extract_ipa_template(wxr, base_data, node, "IPA")
            elif name in ("enPR", "AHD"):
                extract_ipa_template(wxr, base_data, node, "enPR")
            elif name in ("rhymes", "rhyme"):
                extract_rhymes_template(wxr, base_data, node)
            elif name in ("hyphenation", "hyph"):
                extract_hyphenation_template(wxr, base_data, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_sound_list_item(wxr, base_data, child_list_item)

63 

64 

@dataclass
class TableHeader:
    """Text and horizontal extent of one table header cell."""

    # Cleaned text of the header cell.
    text: str
    # Column position at which the header cell starts.
    index: int
    # Number of columns the header cell covers (its colspan).
    span: int

70 

71 

def _parse_colspan(node: WikiNode) -> int:
    """Return the ``colspan`` attribute of *node* as a positive integer.

    A missing, non-numeric or non-positive value yields 1, so a malformed
    attribute cannot abort extraction with ``ValueError``.
    """
    try:
        colspan = int(node.attrs.get("colspan", "1"))
    except (TypeError, ValueError):
        return 1
    return colspan if colspan > 0 else 1


def _extract_vie_pron_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    col_index: int,
    col_headers: list[TableHeader],
) -> int:
    """Extract the IPA spans found in *cell* and return the cell's colspan."""
    colspan = _parse_colspan(cell)
    for span_tag in cell.find_html(
        "span", attr_name="class", attr_value="IPA"
    ):
        extract_vie_pron_span_tag(
            wxr, base_data, span_tag, col_index, colspan, col_headers
        )
    return colspan


def extract_vie_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract IPA pronunciations from an expanded "vie-pron" template.

    The template is expanded and re-parsed, then every table in the result
    is walked row by row. Header cells are recorded with their column
    position and span; IPA spans found in data cells are passed to
    ``extract_vie_pron_span_tag`` together with the recorded headers, so the
    headers above a cell can become its raw tags.

    Args:
        wxr: Extraction context used for parsing and cleaning nodes.
        base_data: Word entry that receives the extracted sounds.
        t_node: The "vie-pron" template node.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers: list[TableHeader] = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    # A header cell in the first column starts a new header
                    # row, which replaces the headers collected so far.
                    if col_index == 0:
                        col_headers.clear()
                    colspan = _parse_colspan(cell)
                    col_headers.append(
                        TableHeader(
                            clean_node(wxr, None, cell), col_index, colspan
                        )
                    )
                    col_index += colspan
                else:
                    col_index += _extract_vie_pron_cell(
                        wxr, base_data, cell, col_index, col_headers
                    )
                    # HTML <td> tags found inside the wikitext cell are
                    # handled as further cells of the same row.
                    for td_tag in cell.find_html("td"):
                        col_index += _extract_vie_pron_cell(
                            wxr, base_data, td_tag, col_index, col_headers
                        )

    # Top-level links are cleaned with base_data as the target; presumably
    # this records category links on the entry -- confirm in clean_node.
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)

126 

127 

def extract_vie_pron_span_tag(
    wxr: WiktextractContext,
    base_data: WordEntry,
    span_tag: HTMLNode,
    index: int,
    colspan: int,
    col_headers: list[TableHeader],
):
    """Create a sound from one IPA span of a "vie-pron" table cell.

    The text of every header whose column range overlaps the cell's column
    range is added as a raw tag. Nothing is added when the span is empty.

    Args:
        wxr: Extraction context used for cleaning nodes.
        base_data: Word entry that receives the sound.
        span_tag: HTML span holding the IPA text.
        index: Column position at which the containing cell starts.
        colspan: Number of columns the containing cell covers.
        col_headers: Headers recorded for the table so far.
    """
    ipa = clean_node(wxr, None, span_tag)
    if ipa == "":
        return

    sound = Sound(ipa=ipa)
    cell_end = index + colspan
    for header in col_headers:
        # Half-open ranges [index, cell_end) and
        # [header.index, header.index + header.span) overlap.
        if index < header.index + header.span and cell_end > header.index:
            sound.raw_tags.append(header.text)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)

147 

148 

def extract_pron_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Build a sound from the named arguments of a pron-audio style template.

    Reads the ``file``, ``place`` and ``pron`` template arguments. Nothing
    is added when ``file`` cleans to an empty string. Otherwise the file
    name is handed to ``set_sound_file_url_fields``, a non-empty ``place``
    becomes a raw tag, and ``pron`` is stored as the IPA text, which may be
    empty.

    Args:
        wxr: Extraction context used for cleaning nodes.
        base_data: Word entry that receives the sound.
        t_node: Template node whose arguments are read.
    """
    params = t_node.template_parameters
    file_name = clean_node(wxr, None, params.get("file", ""))
    if not file_name:
        return

    audio = Sound()
    set_sound_file_url_fields(wxr, file_name, audio)
    region = clean_node(wxr, None, params.get("place", ""))
    if region:
        audio.raw_tags.append(region)
    audio.ipa = clean_node(wxr, None, params.get("pron", ""))
    translate_raw_tags(audio)
    base_data.sounds.append(audio)

165 

166 

def extract_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Build a sound from the positional arguments of an audio template.

    Argument 2 is the audio file name and argument 3 an optional raw tag.
    Nothing is added when the file name cleans to an empty string.

    Args:
        wxr: Extraction context used for cleaning nodes.
        base_data: Word entry that receives the sound.
        t_node: Template node whose arguments are read.
    """
    params = t_node.template_parameters
    file_name = clean_node(wxr, None, params.get(2, ""))
    if not file_name:
        return

    audio = Sound()
    set_sound_file_url_fields(wxr, file_name, audio)
    label = clean_node(wxr, None, params.get(3, ""))
    if label:
        audio.raw_tags.append(label)
    translate_raw_tags(audio)
    base_data.sounds.append(audio)

180 

181 

def extract_tyz_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract IPA pronunciations from an expanded "tyz-IPA" template.

    The template is expanded and re-parsed. Each list item of the result
    yields at most one sound: italic children become raw tags, and a
    ``span`` child whose class list contains "IPA" supplies the IPA text.
    List items without IPA text add nothing.

    Args:
        wxr: Extraction context used for parsing and cleaning nodes.
        base_data: Word entry that receives the sounds.
        t_node: The "tyz-IPA" template node.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Named list_node rather than "list" to avoid shadowing the builtin.
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sound = Sound()
            for node in list_item.children:
                if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
                    raw_tag = clean_node(wxr, None, node)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                elif (
                    isinstance(node, HTMLNode)
                    and node.tag == "span"
                    and "IPA" in node.attrs.get("class", "").split()
                ):
                    sound.ipa = clean_node(wxr, None, node)
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    # Cleaned with base_data as the target; presumably this
                    # records category links -- confirm in clean_node.
                    clean_node(wxr, base_data, node)
            if sound.ipa != "":
                # Translate raw tags as every other extractor in this
                # module does before storing a sound.
                translate_raw_tags(sound)
                base_data.sounds.append(sound)

206 

207 

def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    ipa_class: str,
):
    """Extract pronunciations from an expanded IPA-like template.

    Spans carrying the "qualifier-content" class contribute raw tags; spans
    carrying *ipa_class* each produce one sound that is given the qualifiers
    collected up to that point. Spans with empty text are ignored.

    Args:
        wxr: Extraction context used for parsing and cleaning nodes.
        base_data: Word entry that receives the sounds.
        t_node: Template node to expand.
        ipa_class: HTML class that marks a pronunciation span.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:IPA
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    qualifiers = []
    for span in expanded.find_html("span"):
        classes = span.attrs.get("class", "").split()
        is_qualifier = "qualifier-content" in classes
        if not is_qualifier and ipa_class not in classes:
            continue
        text = clean_node(wxr, None, span)
        if text == "":
            continue
        if is_qualifier:
            qualifiers.append(text)
            continue
        pronunciation = Sound(ipa=text, raw_tags=qualifiers)
        translate_raw_tags(pronunciation)
        base_data.sounds.append(pronunciation)

    # Top-level links are cleaned with base_data as the target.
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)

234 

235 

def extract_rhymes_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract rhymes from an expanded rhymes template.

    Every span with class "IPA" found anywhere in the expansion and having
    non-empty text adds one sound whose ``rhymes`` field is that text.

    Args:
        wxr: Extraction context used for parsing and cleaning nodes.
        base_data: Word entry that receives the sounds.
        t_node: Template node to expand.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:rhymes
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    rhyme_spans = expanded.find_html_recursively(
        "span", attr_name="class", attr_value="IPA"
    )
    for span in rhyme_spans:
        text = clean_node(wxr, None, span)
        if text:
            base_data.sounds.append(Sound(rhymes=text))

    # Top-level links are cleaned with base_data as the target.
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)

252 

253 

def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract hyphenation parts from an expanded hyphenation template.

    The first template argument is taken as the language code. Each span
    whose ``lang`` attribute equals that code has its text split on "‧";
    the stripped, non-empty pieces form one ``Hyphenation``. Spans that
    yield no pieces add nothing.

    Args:
        wxr: Extraction context used for parsing and cleaning nodes.
        base_data: Word entry that receives the hyphenations.
        t_node: Template node to expand.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:hyphenation
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    matching_spans = expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    )
    for span in matching_spans:
        text = clean_node(wxr, None, span)
        hyphenation = Hyphenation()
        stripped = (piece.strip() for piece in text.split("‧"))
        hyphenation.parts.extend(piece for piece in stripped if piece != "")
        if hyphenation.parts:
            base_data.hyphenations.append(hyphenation)