Coverage for src/wiktextract/extractor/ms/sound.py: 78%

135 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..share import set_sound_file_url_fields 

6from .models import Sound, WordEntry 

7from .tags import translate_raw_tags 

8 

9 

def _merge_sounds(entry: WordEntry, sounds: list[Sound]) -> None:
    """Merge extracted sounds into a single word entry.

    A sound that carries a hyphenation overwrites ``entry.hyphenation``;
    any other sound is appended to ``entry.sounds``.  Categories attached
    to each sound are added to ``entry.categories`` without duplicates.
    """
    for sound in sounds:
        if sound.hyphenation != "":
            entry.hyphenation = sound.hyphenation
        else:
            entry.sounds.append(sound)
        for cat in sound.categories:
            if cat not in entry.categories:
                entry.categories.append(cat)


def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract sounds from a pronunciation section and store them.

    Sounds are read from the list items of the section and from templates
    that are direct children of the section node.  Nothing is returned;
    the results are merged into one or more entries:

    * ``base_data`` when ``page_data`` is empty or its last entry has a
      different language code than ``base_data``;
    * every entry in ``page_data`` that shares the last entry's language
      code when the section is a level-3 heading;
    * only the last entry of ``page_data`` otherwise.
    """
    sounds = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(extract_sound_list_item(wxr, list_item))
    for node in level_node.find_child(NodeKind.TEMPLATE):
        # The returned sounds must be kept: the helper has no other output.
        sounds.extend(extract_sound_templates(wxr, node, []))

    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        targets = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        last_lang_code = page_data[-1].lang_code
        targets = [
            data for data in page_data if data.lang_code == last_lang_code
        ]
    else:
        targets = [page_data[-1]]

    for entry in targets:
        _merge_sounds(entry, sounds)

52 

53 

def extract_sound_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
    """Collect the sounds found in one list item and its nested lists.

    ``a``/``accent`` templates and text fragments ending in ``:`` are
    gathered as raw tags and handed to the templates processed after them.
    Categories produced while cleaning the accent templates are appended
    to every sound returned from this item.
    """
    pending_tags = []
    cat_data = {}
    found = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            if child.template_name not in ["a", "accent"]:
                found.extend(extract_sound_templates(wxr, child, pending_tags))
                continue
            label = clean_node(wxr, cat_data, child).strip("() ")
            if label != "":
                pending_tags.append(label)
        elif isinstance(child, str):
            if child.strip().endswith(":"):
                label = child.strip(": ")
                if label != "":
                    pending_tags.append(label)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Nested lists are handled recursively, one item at a time.
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                found.extend(extract_sound_list_item(wxr, nested_item))

    accent_categories = cat_data.get("categories", [])
    for item in found:
        item.categories.extend(accent_categories)
    return found

78 

79 

def extract_sound_templates(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Dispatch a pronunciation template to its dedicated extractor.

    Returns the sounds produced by the matching extractor, or an empty
    list when the template name is not recognised.  The checks run in a
    fixed order, so ``audio-AFA``/``audio-IPA`` are matched before the
    generic ``-afa``/``-ipa`` suffix test.
    """
    name = t_node.template_name
    if name == "dewan":
        return extract_dewan_template(wxr, t_node)
    if name in ["audio-AFA", "audio-IPA"]:
        return extract_audio_ipa_template(wxr, t_node, raw_tags)
    if name.lower() in ["afa", "ipa"] or name.lower().endswith(
        ("-afa", "-ipa")
    ):
        return extract_ipa_template(wxr, t_node, raw_tags)
    if name in ["penyempangan", "hyphenation", "hyph"]:
        return extract_hyph_template(wxr, t_node)
    if name == "audio":
        return extract_audio_template(wxr, t_node)
    if name in ["rima", "rhymes", "rhyme"]:
        return extract_rhyme_template(wxr, t_node)
    return []

100 

101 

def extract_dewan_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract the text of a ``dewan`` template as a sound.

    The cleaned template text has its ``Kamus Dewan:`` prefix removed and
    is stored in the ``other`` field with the raw tag ``Kamus Dewan``.
    An empty list is returned when no text remains.
    """
    cat_data = {}
    cleaned = clean_node(wxr, cat_data, t_node)
    text = cleaned.removeprefix("Kamus Dewan:").strip()
    if text == "":
        return []
    return [
        Sound(
            other=text,
            raw_tags=["Kamus Dewan"],
            categories=cat_data.get("categories", []),
        )
    ]

117 

118 

def extract_ipa_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract IPA transcriptions from an expanded IPA-style template.

    The template is expanded and every ``<span class="IPA">`` with
    non-empty text becomes one sound.  Each sound receives ``raw_tags``
    and the categories gathered from the expansion, and is then passed
    through ``translate_raw_tags``.
    """
    results = []
    cat_data = {}
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    # Cleaning the whole expansion is only done to collect its categories.
    clean_node(wxr, cat_data, expanded)
    ipa_spans = expanded.find_html("span", attr_name="class", attr_value="IPA")
    for span in ipa_spans:
        ipa_text = clean_node(wxr, None, span)
        if ipa_text == "":
            continue
        entry = Sound(
            ipa=ipa_text,
            raw_tags=raw_tags,
            categories=cat_data.get("categories", []),
        )
        translate_raw_tags(entry)
        results.append(entry)
    return results

141 

142 

def extract_hyph_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract hyphenations from an expanded hyphenation template.

    The first positional template argument is used as the language code;
    every ``<span>`` in the expansion whose ``lang`` attribute equals it
    and whose text is non-empty yields one sound holding that text in
    its ``hyphenation`` field.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    first_arg = t_node.template_parameters.get(1, "")
    lang_code = clean_node(wxr, None, first_arg)
    span_texts = (
        clean_node(wxr, None, span)
        for span in expanded.find_html(
            "span", attr_name="lang", attr_value=lang_code
        )
    )
    return [Sound(hyphenation=text) for text in span_texts if text != ""]

158 

159 

def extract_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract an audio file reference from an ``audio`` template.

    The second positional argument is taken as the file name.  When it is
    non-empty, a sound carrying the template's categories is created and
    handed to ``set_sound_file_url_fields`` together with the file name.
    Otherwise an empty list is returned.
    """
    second_arg = t_node.template_parameters.get(2, "")
    filename = clean_node(wxr, None, second_arg)
    cat_data = {}
    # The template is cleaned regardless of the file name so that its
    # categories are always collected.
    clean_node(wxr, cat_data, t_node)
    if filename == "":
        return []
    audio = Sound(categories=cat_data.get("categories", []))
    set_sound_file_url_fields(wxr, filename, audio)
    return [audio]

172 

173 

def extract_rhyme_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract rhymes from an expanded rhyme template.

    Each link that is a direct child of the expanded template and has
    non-empty text yields one sound with that text in ``rhymes`` and the
    categories collected from the expansion.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    cat_data = {}
    clean_node(wxr, cat_data, expanded)
    results = []
    for link_node in expanded.find_child(NodeKind.LINK):
        rhyme = Sound(categories=cat_data.get("categories", []))
        rhyme_text = clean_node(wxr, None, link_node)
        if rhyme_text == "":
            continue
        rhyme.rhymes = rhyme_text
        results.append(rhyme)
    return results

190 

191 

def extract_audio_ipa_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Extract a combined audio and IPA sound from an audio-IPA template.

    The second positional argument is taken as the audio file name and
    the third as the IPA text.  When the file name is non-empty, one
    sound is created with the IPA, ``raw_tags`` and the template's
    categories; it is then handed to ``set_sound_file_url_fields`` and
    ``translate_raw_tags``.  Otherwise an empty list is returned.
    """
    sounds = []
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    cats = {}
    clean_node(wxr, cats, t_node)
    if filename != "":
        ipa = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
        # Attach the accent/qualifier tags gathered by the caller, as
        # extract_ipa_template does, instead of silently dropping them.
        sound = Sound(
            ipa=ipa,
            raw_tags=raw_tags,
            categories=cats.get("categories", []),
        )
        set_sound_file_url_fields(wxr, filename, sound)
        translate_raw_tags(sound)
        sounds.append(sound)
    return sounds