Coverage for src/wiktextract/extractor/th/sound.py: 74%

141 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import re 

2from dataclasses import dataclass 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..share import set_sound_file_url_fields 

15from .models import Hyphenation, Sound, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_sound_section(
    wxr: WiktextractContext,
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect pronunciation data from a sound section.

    Direct child templates named ``th-pron`` or ``lo-pron`` are handed to
    their dedicated extractors; every other template at this level is
    ignored.  Afterwards each list item of each direct child list is passed
    to ``extract_sound_list_item``.  Results are written into ``base_data``.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name
        if name == "th-pron":
            extract_th_pron_template(wxr, base_data, template)
            continue
        if name == "lo-pron":
            extract_lo_pron_template(wxr, base_data, template)

    for sound_list in level_node.find_child(NodeKind.LIST):
        for item in sound_list.find_child(NodeKind.LIST_ITEM):
            extract_sound_list_item(wxr, base_data, item)

32 

33 

def extract_sound_list_item(
    wxr: WiktextractContext,
    base_data: WordEntry,
    list_item: WikiNode,
) -> None:
    """Dispatch each template directly inside ``list_item`` to its extractor.

    Only ``IPA``, ``X-SAMPA``, ``enPR`` and ``audio`` templates are handled;
    templates with any other name are skipped.
    """
    handlers = {
        "IPA": extract_ipa_template,
        "X-SAMPA": extract_x_sampa_template,
        "enPR": extract_enpr_template,
        "audio": extract_audio_template,
    }
    for template in list_item.find_child(NodeKind.TEMPLATE):
        handler = handlers.get(template.template_name)
        if handler is not None:
            handler(wxr, base_data, template)

48 

49 

def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Add a ``Sound`` built from the second argument of an ``IPA`` template.

    Nothing is recorded when the cleaned argument is an empty string.
    """
    ipa_arg = t_node.template_parameters.get(2, "")
    sound = Sound(ipa=clean_node(wxr, None, ipa_arg))
    if sound.ipa == "":
        return

    base_data.sounds.append(sound)
    # Second pass over the whole template with base_data as the target;
    # presumably this lets clean_node record categories on the entry --
    # confirm against clean_node's documentation.
    # NOTE(review): the listing this was restored from had no indentation;
    # this call is assumed to run only when an IPA value was found.
    clean_node(wxr, base_data, t_node)

61 

62 

def extract_x_sampa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Add a ``Sound`` tagged ``X-SAMPA`` from the template's first argument.

    The cleaned argument is stored in the ``ipa`` field; an empty value is
    not recorded.
    """
    transcription = clean_node(
        wxr, None, t_node.template_parameters.get(1, "")
    )
    sound = Sound(ipa=transcription, tags=["X-SAMPA"])
    if sound.ipa == "":
        return
    base_data.sounds.append(sound)

74 

75 

def extract_enpr_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Add a ``Sound`` whose ``enpr`` field is the template's first argument.

    An empty cleaned argument produces no entry.
    """
    enpr_arg = t_node.template_parameters.get(1, "")
    sound = Sound(enpr=clean_node(wxr, None, enpr_arg))
    if sound.enpr == "":
        return
    base_data.sounds.append(sound)

86 

87 

def extract_audio_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Add a ``Sound`` for the file named by an ``audio`` template.

    The second positional argument is the file name; the ``a`` argument is a
    comma-separated list of labels stored as raw tags.  The function does
    nothing at all when the file name is empty.
    """
    sound = Sound()
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if filename == "":
        return

    set_sound_file_url_fields(wxr, filename, sound)

    labels = clean_node(wxr, None, t_node.template_parameters.get("a", ""))
    stripped = (label.strip() for label in labels.split(","))
    sound.raw_tags.extend(label for label in stripped if label != "")

    translate_raw_tags(sound)
    base_data.sounds.append(sound)
    # Second pass over the whole template with base_data as the target;
    # presumably this lets clean_node record categories on the entry --
    # confirm against clean_node's documentation.
    clean_node(wxr, base_data, t_node)

106 

107 

@dataclass
class TableHeader:
    """A table row header together with the number of rows it covers.

    Instances are intentionally mutable: ``rowspan`` is counted down in
    place as table rows are consumed.
    """

    # Cleaned text of the header cell.
    text: str
    # Rows the header applies to, including the row it appears in.
    rowspan: int

112 

113 

def extract_th_pron_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Extract sounds from the HTML table produced by ``th-pron``.

    The template is expanded and re-parsed, then every ``<tr>`` of every
    ``<table>`` is read: ``<th>`` cells either select the kind of data in
    the row (IPA, homophones, audio files) or become row headers, and
    ``<td>`` cells supply the values.  Row headers that are still in effect
    are attached to plain-value sounds as raw tags.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:th-pron
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)

    for table_tag in expanded_node.find_html("table"):
        active_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"

            # Keep only headers that span into this row and use up one of
            # their remaining rows.
            active_headers = [h for h in active_headers if h.rowspan > 1]
            for carried in active_headers:
                carried.rowspan -= 1

            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                # Thai labels: "(standard) IPA", "homophones", "audio file".
                if header_str.startswith("(มาตรฐาน) สัทอักษรสากล"):
                    field = "ipa"
                elif header_str.startswith("คำพ้องเสียง"):
                    field = "homophone"
                elif header_str == "ไฟล์เสียง":
                    field = "audio"
                elif header_str != "":
                    # Any other non-empty header labels the row(s) it spans;
                    # a non-numeric rowspan attribute counts as one row.
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    rowspan = (
                        int(rowspan_str)
                        if re.fullmatch(r"\d+", rowspan_str)
                        else 1
                    )
                    active_headers.append(TableHeader(header_str, rowspan))

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename == "":
                            continue
                        audio = Sound()
                        set_sound_file_url_fields(wxr, filename, audio)
                        base_data.sounds.append(audio)
                    continue

                if field == "homophone":
                    th_spans = td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    )
                    for span_tag in th_spans:
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            base_data.sounds.append(Sound(homophone=word))
                    continue

                # "ipa" or "other": the cell text itself is the value.
                cell_text = clean_node(wxr, None, td_tag)
                if cell_text == "":
                    continue
                sound = Sound()
                setattr(sound, field, cell_text)
                sound.raw_tags.extend(h.text for h in active_headers)
                translate_raw_tags(sound)
                base_data.sounds.append(sound)

    # Final pass over the expanded output with base_data as the target;
    # presumably this records categories on the entry -- confirm against
    # clean_node's documentation.
    clean_node(wxr, base_data, expanded_node)

174 

175 

def extract_lo_pron_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Extract IPA, hyphenation and rhymes from an expanded ``lo-pron``.

    The template is expanded and re-parsed into lists.  Each list item is
    scanned left to right: text labels and links switch the current
    section, qualifier spans set the raw tag used by later IPA spans in the
    same item, and the remaining spans/links supply the values.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:lo-pron
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)

    # Thai text labels: "syllabification:" and "rhymes:".
    label_sections = {
        "การแบ่งพยางค์:": "hyphenation",
        "สัมผัส:": "rhymes",
    }

    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            section = "other"
            qualifier = ""
            for child in list_item.children:
                if isinstance(child, HTMLNode) and child.tag == "span":
                    css_class = child.attrs.get("class", "")
                    if "qualifier-content" in css_class:
                        qualifier = clean_node(wxr, None, child)
                    elif css_class == "IPA":
                        ipa = clean_node(wxr, None, child)
                        if ipa == "":
                            continue
                        sound = Sound(ipa=ipa)
                        # Tags are translated only when a qualifier exists.
                        if qualifier != "":
                            sound.raw_tags.append(qualifier)
                            translate_raw_tags(sound)
                        base_data.sounds.append(sound)
                    elif (
                        child.attrs.get("lang", "") == "lo"
                        and section == "hyphenation"
                    ):
                        syllables = clean_node(wxr, None, child)
                        if syllables != "":
                            base_data.hyphenations.append(
                                Hyphenation(parts=syllables.split("-"))
                            )
                    continue

                if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
                    link_text = clean_node(wxr, None, child)
                    # Thai link text for "IPA".
                    if link_text == "สัทอักษรสากล":
                        section = "ipa"
                    elif link_text != "" and section == "rhymes":
                        base_data.sounds.append(Sound(rhymes=link_text))
                    continue

                if isinstance(child, str):
                    # Both recognised labels end with ":", so an exact
                    # lookup needs no separate check for that suffix.
                    section = label_sections.get(child.strip(), section)

    # Final pass over the expanded output with base_data as the target;
    # presumably this records categories on the entry -- confirm against
    # clean_node's documentation.
    clean_node(wxr, base_data, expanded_node)