Coverage for src/wiktextract/extractor/zh/pronunciation.py: 84%

133 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import itertools 

2import re 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from ..share import create_audio_url_dict, set_sound_file_url_fields 

14from .models import Sound, WordEntry 

15from .tags import translate_raw_tags 

16 

17 

def extract_pronunciation(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Collect pronunciation data from a section node into ``base_data``.

    Sounds already stored on ``base_data`` are discarded first.  Then
    ``base_data.sounds`` and ``base_data.categories`` are extended in place,
    first with the results for the direct child templates of ``level_node``
    and then with the results for every list item found recursively under it.

    ``page_data`` is accepted but not used here.  The function returns
    nothing; all results are written to ``base_data``.
    """
    # Clearing an empty list is a no-op, so no length check is needed.
    base_data.sounds.clear()

    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        new_sounds, new_cats = process_pron_template(wxr, template_node)
        base_data.sounds.extend(new_sounds)
        base_data.categories.extend(new_cats)
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        new_sounds, new_cats = process_pron_item_list_item(wxr, list_item)
        base_data.sounds.extend(new_sounds)
        base_data.categories.extend(new_cats)

35 

36 

def process_pron_item_list_item(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Run every direct child template of a list item through
    ``process_pron_template``.

    One tag list is created per list item and handed to each template call,
    so whatever a call appends to it is seen by the calls that follow.

    Returns the concatenated sounds and the concatenated categories of all
    template calls, in document order.
    """
    shared_tags = []
    item_sounds = []
    item_categories = []
    for child_template in list_item_node.find_child(NodeKind.TEMPLATE):
        template_sounds, template_cats = process_pron_template(
            wxr, child_template, shared_tags
        )
        item_sounds += template_sounds
        item_categories += template_cats
    return item_sounds, item_categories

50 

51 

def process_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: "list[str] | None" = None,
) -> tuple[list[Sound], list[str]]:
    """Dispatch one pronunciation template to its handler.

    The lower-cased template name selects the handler:

    * ``zh-pron`` -> ``process_zh_pron_template`` (sounds and categories)
    * ``homophones`` / ``homophone`` / ``hmp`` -> ``process_homophones_template``
    * ``a`` / ``accent`` -> no sound; the cleaned template text, with
      surrounding parentheses stripped, is appended to ``raw_tags``
    * ``audio`` / ``音`` -> ``process_audio_template``
    * ``ipa`` -> ``process_ipa_template``
    * ``enpr`` -> ``process_enpr_template``

    Any other template yields empty results.

    ``raw_tags`` is mutated in place by the accent branch, which lets a
    caller share one list across several calls.  When it is omitted, a fresh
    list is created for this call only.

    Returns a ``(sounds, categories)`` tuple.
    """
    # A literal ``[]`` default would be one list shared by every call that
    # omits the argument, so accent tags would leak between unrelated calls.
    if raw_tags is None:
        raw_tags = []

    template_name = template_node.template_name.lower()
    sounds = []
    categories = []
    if template_name == "zh-pron":
        new_sounds, new_cats = process_zh_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    elif template_name in ("homophones", "homophone", "hmp"):
        sounds.extend(process_homophones_template(wxr, template_node))
    elif template_name in ("a", "accent"):
        # https://zh.wiktionary.org/wiki/Template:Accent
        raw_tags.append(clean_node(wxr, None, template_node).strip("()"))
    elif template_name in ("audio", "音"):
        sounds.extend(process_audio_template(wxr, template_node, raw_tags))
    elif template_name == "ipa":
        sounds.extend(process_ipa_template(wxr, template_node, raw_tags))
    elif template_name == "enpr":
        sounds.extend(process_enpr_template(wxr, template_node, raw_tags))
    return sounds, categories

76 

77 

def process_zh_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand a ``zh-pron`` template and extract sounds from its lists.

    The template is converted back to wikitext and re-parsed with
    ``expand_all=True``.  Every list found recursively in the result is
    walked item by item via ``process_zh_pron_list_item``; lists that helper
    has already recorded as nested lists are skipped so they are not
    processed twice.

    Returns the sounds, after ``translate_raw_tags`` has been applied to each
    one, and the ``"categories"`` entry of the dict handed to ``clean_node``
    (an empty list when that key is absent).
    """
    # https://zh.wiktionary.org/wiki/Template:Zh-pron
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)

    visited_lists = set()
    collected = []
    for list_node in expanded_root.find_child_recursively(NodeKind.LIST):
        if list_node in visited_lists:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                process_zh_pron_list_item(wxr, list_item, [], visited_lists)
            )

    category_data = {}
    clean_node(wxr, category_data, expanded_root)
    for sound in collected:
        translate_raw_tags(sound)
    return collected, category_data.get("categories", [])

98 

99 

def process_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Extract sounds from one list item of an expanded ``zh-pron`` template.

    ``raw_tags`` holds the tags inherited from the enclosing list items; it
    is copied, never modified.  Children are visited in order and each kind
    either adds tags to the copy or produces sounds:

    * link whose first argument starts with ``File:`` -> an audio ``Sound``
    * any other link -> its cleaned text becomes a tag
    * ``<small>`` -> its text is split on ``,`` and ``:`` into tags
    * ``<span>`` -> an ``ipa`` sound when its class contains ``IPA``,
      otherwise a ``zh_pron`` sound
    * ``<table>`` while the latest tag is ``同音詞`` -> homophone sounds
    * nested list -> recorded in ``seen_lists`` and processed recursively
      with the tags gathered so far

    Children that are not ``WikiNode`` instances are ignored.
    """
    tags_so_far = list(raw_tags)
    results = []
    for child in list_item_node.children:
        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LINK:
            link_args = child.largs
            if len(link_args) > 0 and link_args[0][0].startswith("File:"):
                file_name = link_args[0][0].removeprefix("File:")
                audio = Sound()
                for key, value in create_audio_url_dict(file_name).items():
                    if key not in Sound.model_fields:
                        wxr.wtp.warning(
                            f"{key=} not defined in Sound",
                            sortid="zh.pronunciation/56",
                        )
                    else:
                        setattr(audio, key, value)
                results.append(audio)
            else:
                link_text = clean_node(wxr, None, child)
                tags_so_far.append(link_text.strip("()"))
        elif isinstance(child, HTMLNode):
            tag_name = child.tag
            if tag_name == "small":
                # Only non-HTML children are cleaned, which removes the
                # "help" <sup> tag.
                plain_children = list(child.invert_find_child(NodeKind.HTML))
                small_text = clean_node(wxr, None, plain_children).strip("()")
                for piece in re.split(r",|:", small_text):
                    piece = piece.strip()
                    if len(piece) > 0:
                        tags_so_far.append(piece)
            elif tag_name == "span":
                zh_pron = clean_node(wxr, None, child)
                if len(zh_pron) > 0:
                    is_ipa = "IPA" in child.attrs.get("class", "")
                    field_name = "ipa" if is_ipa else "zh_pron"
                    results.append(
                        Sound(**{field_name: zh_pron}, raw_tags=tags_so_far)
                    )
            elif (
                tag_name == "table"
                and len(tags_so_far) > 0
                and tags_so_far[-1] == "同音詞"
            ):
                results.extend(
                    process_homophones_table(wxr, child, tags_so_far)
                )
        elif child.kind == NodeKind.LIST:
            # Record the nested list so the caller does not walk it again.
            seen_lists.add(child)
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                results.extend(
                    process_zh_pron_list_item(
                        wxr, nested_item, tags_so_far, seen_lists
                    )
                )
    return results

172 

173 

def process_homophones_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    """Build one homophone ``Sound`` per positional argument, from 2 upward.

    Arguments are read in order starting at position 2 and reading stops at
    the first position that is missing.  Arguments whose cleaned text is
    empty are skipped without ending the scan.
    """
    # https://zh.wiktionary.org/wiki/Template:homophones
    params = template_node.template_parameters
    homophones = []
    position = 2
    while position in params:
        text = clean_node(wxr, None, params.get(position, ""))
        if len(text) > 0:
            homophones.append(Sound(homophone=text))
        position += 1
    return homophones

188 

189 

def process_homophones_table(
    wxr: WiktextractContext,
    table_node: HTMLNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Return a homophone ``Sound`` for each ``<span>`` carrying a ``lang``
    attribute anywhere inside ``table_node``.

    Each sound takes the cleaned span text as its homophone and is built
    with the given ``raw_tags``.
    """
    lang_spans = table_node.find_html_recursively("span", attr_name="lang")
    return [
        Sound(homophone=clean_node(wxr, None, span), raw_tags=raw_tags)
        for span in lang_spans
    ]

202 

203 

def process_audio_template(
    wxr: WiktextractContext, template_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Create the single ``Sound`` described by an audio template.

    Argument 2 is the sound file name, which is passed to
    ``set_sound_file_url_fields`` together with the new sound.  Argument 3,
    when its cleaned text is non-empty, becomes the first raw tag; the given
    ``raw_tags`` are added after it.

    Returns a one-element list.
    """
    # https://zh.wiktionary.org/wiki/Template:Audio
    params = template_node.template_parameters
    file_name = clean_node(wxr, None, params.get(2, ""))
    audio = Sound()
    set_sound_file_url_fields(wxr, file_name, audio)
    label = clean_node(wxr, None, params.get(3, ""))
    if len(label) > 0:
        audio.raw_tags.append(label)
    audio.raw_tags.extend(raw_tags)
    return [audio]

220 

221 

def process_ipa_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Build one IPA ``Sound`` per positional argument, from 2 upward.

    Reading stops at the first missing position.  Every sound is built with
    the given ``raw_tags``; empty arguments are not filtered out.
    """
    # https://zh.wiktionary.org/wiki/Template:IPA
    params = template_node.template_parameters
    ipa_sounds = []
    position = 2
    while position in params:
        ipa_text = clean_node(wxr, None, params.get(position))
        ipa_sounds.append(Sound(ipa=ipa_text, raw_tags=raw_tags))
        position += 1
    return ipa_sounds

240 

241 

def process_enpr_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Build one enPR ``Sound`` for each of positional arguments 1 to 3.

    Reading stops at the first missing position, so at most three sounds are
    returned.  Every sound is built with the given ``raw_tags``.
    """
    # https://zh.wiktionary.org/wiki/Template:enPR
    params = template_node.template_parameters
    enpr_sounds = []
    for position in (1, 2, 3):
        if position not in params:
            break
        enpr_text = clean_node(wxr, None, params.get(position))
        enpr_sounds.append(Sound(enpr=enpr_text, raw_tags=raw_tags))
    return enpr_sounds