Coverage for src/wiktextract/extractor/ms/sound.py: 78%

135 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..share import set_sound_file_url_fields 

6from .models import Hyphenation, Sound, WordEntry 

7from .tags import translate_raw_tags 

8 

9 

def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract sounds from a pronunciation section and store them.

    Sounds are gathered from every list item found in the direct child
    lists of ``level_node`` and from templates that are direct children of
    ``level_node``.  They are then merged into:

    * ``base_data`` when ``page_data`` is empty or its last entry has a
      different ``lang_code`` than ``base_data``;
    * every entry of ``page_data`` sharing the last entry's ``lang_code``
      when ``level_node`` is a level-3 heading;
    * only the last entry of ``page_data`` otherwise.

    A sound that carries hyphenations contributes only its hyphenations;
    any other sound is appended to the entry's ``sounds``.  Categories are
    appended without duplicating ones already on the entry.
    """
    sounds = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(extract_sound_list_item(wxr, list_item))
    for node in level_node.find_child(NodeKind.TEMPLATE):
        # The return value used to be discarded, so templates placed
        # directly under the heading never produced any output.
        sounds.extend(extract_sound_templates(wxr, node, []))

    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        targets = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        last_lang_code = page_data[-1].lang_code
        targets = [
            data for data in page_data if data.lang_code == last_lang_code
        ]
    else:
        targets = [page_data[-1]]

    for entry in targets:
        for sound in sounds:
            if len(sound.hyphenations) > 0:
                entry.hyphenations.extend(sound.hyphenations)
            else:
                entry.sounds.append(sound)
            for cat in sound.categories:
                # Membership must be tested against the category list; the
                # base_data branch previously tested against the entry
                # object itself.
                if cat not in entry.categories:
                    entry.categories.append(cat)

52 

53 

def extract_sound_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
    """Return the sounds described by one list item, including nested lists.

    ``a``/``accent`` templates and plain strings ending in ``:`` contribute
    raw tags that are handed to the sound templates which follow them.
    Categories collected while cleaning the ``a``/``accent`` templates are
    appended to every sound returned from this item.
    """
    collected = []
    labels = []
    cat_data = {}
    for child in list_item.children:
        # TemplateNode must be tested before the generic WikiNode branch.
        if isinstance(child, TemplateNode):
            if child.template_name not in ["a", "accent"]:
                collected.extend(extract_sound_templates(wxr, child, labels))
                continue
            label = clean_node(wxr, cat_data, child).strip("() ")
            if label != "":
                labels.append(label)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                collected.extend(extract_sound_list_item(wxr, nested_item))
        elif isinstance(child, str) and child.strip().endswith(":"):
            label = child.strip(": ")
            if label != "":
                labels.append(label)

    for sound in collected:
        sound.categories.extend(cat_data.get("categories", []))
    return collected

78 

79 

def extract_sound_templates(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Dispatch a sound-related template to its dedicated extractor.

    The checks run in a fixed order, and the first match wins.  Templates
    whose name matches none of the known names yield an empty list.
    """
    name = t_node.template_name
    lowered = name.lower()

    if name == "dewan":
        return extract_dewan_template(wxr, t_node)
    # Exact-case check: it must run before the case-insensitive
    # "-afa"/"-ipa" suffix test below, which would also match these names.
    if name in ("audio-AFA", "audio-IPA"):
        return extract_audio_ipa_template(wxr, t_node, raw_tags)
    if lowered in ("afa", "ipa") or lowered.endswith(("-afa", "-ipa")):
        return extract_ipa_template(wxr, t_node, raw_tags)
    if name in ("penyempangan", "hyphenation", "hyph"):
        return extract_hyph_template(wxr, t_node)
    if name == "audio":
        return extract_audio_template(wxr, t_node)
    if name in ("rima", "rhymes", "rhyme"):
        return extract_rhyme_template(wxr, t_node)
    return []

100 

101 

def extract_dewan_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Build a sound from the cleaned text of a ``dewan`` template.

    The leading ``Kamus Dewan:`` label is removed from the cleaned text and
    the remainder is stored in ``Sound.other`` with the raw tag
    ``Kamus Dewan``.  An empty remainder yields no sound.
    """
    cat_data = {}
    rendered = clean_node(wxr, cat_data, t_node)
    pronunciation = rendered.removeprefix("Kamus Dewan:").strip()
    if pronunciation == "":
        return []
    return [
        Sound(
            other=pronunciation,
            raw_tags=["Kamus Dewan"],
            categories=cat_data.get("categories", []),
        )
    ]

117 

118 

def extract_ipa_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Return one sound per non-empty ``<span class="IPA">`` in the template.

    The template is expanded and re-parsed so the generated HTML can be
    searched.  Each sound receives ``raw_tags`` and the categories gathered
    from the expanded template, and is passed through
    ``translate_raw_tags``.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    cat_data = {}
    # Called only to collect categories; the cleaned text is not needed.
    clean_node(wxr, cat_data, expanded)

    results = []
    for ipa_span in expanded.find_html(
        "span", attr_name="class", attr_value="IPA"
    ):
        ipa_text = clean_node(wxr, None, ipa_span)
        if ipa_text == "":
            continue
        entry = Sound(
            ipa=ipa_text,
            raw_tags=raw_tags,
            categories=cat_data.get("categories", []),
        )
        translate_raw_tags(entry)
        results.append(entry)
    return results

141 

142 

def extract_hyph_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Return hyphenation-only sounds from a hyphenation template.

    The template is expanded and every ``<span>`` whose ``lang`` attribute
    equals the template's first positional argument is read.  The text of
    each non-empty span is split on ``‧`` into hyphenation parts.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    first_arg = t_node.template_parameters.get(1, "")
    lang_code = clean_node(wxr, None, first_arg)

    results = []
    for lang_span in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        span_text = clean_node(wxr, None, lang_span)
        if span_text == "":
            continue
        hyphenation = Hyphenation(parts=span_text.split("‧"))
        results.append(Sound(hyphenations=[hyphenation]))
    return results

160 

161 

def extract_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Return a sound for the file named in an ``audio`` template.

    The file name is the template's second positional argument; when it is
    empty no sound is produced.  Categories come from cleaning the whole
    template, and the file-related fields are filled in by
    ``set_sound_file_url_fields``.
    """
    second_arg = t_node.template_parameters.get(2, "")
    audio_file = clean_node(wxr, None, second_arg)
    cat_data = {}
    # Always cleaned, even when there is no file name, as in the original
    # call order.
    clean_node(wxr, cat_data, t_node)
    if audio_file == "":
        return []

    audio = Sound(categories=cat_data.get("categories", []))
    set_sound_file_url_fields(wxr, audio_file, audio)
    return [audio]

174 

175 

def extract_rhyme_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Return one rhyme sound per non-empty link in the expanded template.

    Only link nodes that are direct children of the expanded template are
    considered.  Every sound carries the categories gathered from the
    expansion.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    cat_data = {}
    # Called only to collect categories; the cleaned text is not needed.
    clean_node(wxr, cat_data, expanded)

    results = []
    for link_node in expanded.find_child(NodeKind.LINK):
        rhyme = Sound(categories=cat_data.get("categories", []))
        rhyme_text = clean_node(wxr, None, link_node)
        if rhyme_text == "":
            continue
        rhyme.rhymes = rhyme_text
        results.append(rhyme)
    return results

192 

193 

def extract_audio_ipa_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Return a sound with both audio file and IPA from an audio-IPA template.

    The file name is the template's second positional argument and the IPA
    text its third.  No sound is produced when the file name is empty.
    Categories come from cleaning the whole template, and the file-related
    fields are filled in by ``set_sound_file_url_fields``.

    ``raw_tags`` (labels collected earlier in the same list item) are
    attached to the sound and translated, matching ``extract_ipa_template``;
    previously the parameter was accepted but silently ignored.
    """
    sounds = []
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    cats = {}
    clean_node(wxr, cats, t_node)
    if filename != "":
        ipa = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
        sound = Sound(
            ipa=ipa,
            raw_tags=raw_tags,
            categories=cats.get("categories", []),
        )
        set_sound_file_url_fields(wxr, filename, sound)
        translate_raw_tags(sound)
        sounds.append(sound)
    return sounds