Coverage for src/wiktextract/extractor/ms/sound.py: 78%

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

3from ...page import clean_node

4from ...wxr_context import WiktextractContext

5from ..share import set_sound_file_url_fields

6from .models import Sound, WordEntry

7from .tags import translate_raw_tags

def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect sounds from a pronunciation section and attach them to entries.

    Sounds are gathered from every list item under ``level_node`` and from
    templates that are direct children of ``level_node``.  They are then
    merged into:

    * ``base_data`` when ``page_data`` is empty or its last entry has a
      different ``lang_code`` than ``base_data``;
    * every entry of ``page_data`` sharing the last entry's ``lang_code``
      when ``level_node`` is a level-3 heading;
    * only the last entry of ``page_data`` otherwise.
    """
    sounds = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(extract_sound_list_item(wxr, list_item))
    for node in level_node.find_child(NodeKind.TEMPLATE):
        # Keep the extracted sounds; previously the return value was
        # discarded, so templates placed directly under the heading
        # contributed nothing.
        sounds.extend(extract_sound_templates(wxr, node, []))

    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        _merge_sounds_into_entry(base_data, sounds)
    elif level_node.kind == NodeKind.LEVEL3:
        lang_code = page_data[-1].lang_code
        for data in page_data:
            if data.lang_code == lang_code:
                _merge_sounds_into_entry(data, sounds)
    else:
        _merge_sounds_into_entry(page_data[-1], sounds)


def _merge_sounds_into_entry(entry: WordEntry, sounds: list[Sound]) -> None:
    """Merge ``sounds`` into ``entry``.

    A sound with a non-empty ``hyphenation`` sets ``entry.hyphenation``
    instead of being appended to ``entry.sounds``.  Each sound's categories
    are added to ``entry.categories`` without introducing duplicates.
    """
    for sound in sounds:
        if sound.hyphenation != "":
            entry.hyphenation = sound.hyphenation
        else:
            entry.sounds.append(sound)
        for cat in sound.categories:
            # Membership must be tested against the category list, not the
            # entry object itself, for the de-duplication to work.
            if cat not in entry.categories:
                entry.categories.append(cat)

def extract_sound_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Sound]:
    """Extract sounds from one list item, recursing into nested lists.

    ``a``/``accent`` templates and plain text ending in ``:`` contribute raw
    tags that are handed to the sound templates that follow them in the same
    item.  Categories collected while cleaning the ``a``/``accent`` templates
    are appended to every sound returned from this item.
    """
    category_data = {}
    pending_tags = []
    found = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            if child.template_name not in ["a", "accent"]:
                found.extend(
                    extract_sound_templates(wxr, child, pending_tags)
                )
                continue
            label = clean_node(wxr, category_data, child).strip("() ")
            if label != "":
                pending_tags.append(label)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                found.extend(extract_sound_list_item(wxr, nested_item))
        elif isinstance(child, str) and child.strip().endswith(":"):
            # Text such as "Label:" acts as a tag for the sounds after it.
            label = child.strip(": ")
            if label != "":
                pending_tags.append(label)

    for sound in found:
        sound.categories.extend(category_data.get("categories", []))
    return found

def extract_sound_templates(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Dispatch a sound-related template to its dedicated extractor.

    Returns the sounds produced by the matching extractor, or an empty list
    when the template name is not recognised.  Only the IPA-style
    extractors receive ``raw_tags``.
    """
    name = t_node.template_name
    if name == "dewan":
        return extract_dewan_template(wxr, t_node)
    # The exact-case audio+IPA names must be tested before the
    # case-insensitive "-afa"/"-ipa" suffix check, which they also satisfy.
    if name in ["audio-AFA", "audio-IPA"]:
        return extract_audio_ipa_template(wxr, t_node, raw_tags)
    lowered = name.lower()
    if lowered in ["afa", "ipa"] or lowered.endswith(("-afa", "-ipa")):
        return extract_ipa_template(wxr, t_node, raw_tags)
    if name in ["penyempangan", "hyphenation", "hyph"]:
        return extract_hyph_template(wxr, t_node)
    if name == "audio":
        return extract_audio_template(wxr, t_node)
    if name in ["rima", "rhymes", "rhyme"]:
        return extract_rhyme_template(wxr, t_node)
    return []

100

101

def extract_dewan_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract the text of a ``dewan`` template as a single sound.

    The cleaned template text has a leading "Kamus Dewan:" removed and is
    stored in ``Sound.other`` with the raw tag "Kamus Dewan".  Returns an
    empty list when nothing remains after stripping.
    """
    category_data = {}
    cleaned = clean_node(wxr, category_data, t_node)
    text = cleaned.removeprefix("Kamus Dewan:").strip()
    if text == "":
        return []
    return [
        Sound(
            other=text,
            raw_tags=["Kamus Dewan"],
            categories=category_data.get("categories", []),
        )
    ]

117

118

def extract_ipa_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract IPA transcriptions from an expanded IPA-style template.

    The template is expanded and re-parsed; each ``<span class="IPA">`` with
    non-empty text yields one sound carrying ``raw_tags`` and the categories
    collected from the expansion.  ``translate_raw_tags`` is applied to
    every sound before it is returned.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    category_data = {}
    clean_node(wxr, category_data, root)

    results = []
    ipa_spans = root.find_html("span", attr_name="class", attr_value="IPA")
    for span in ipa_spans:
        ipa_text = clean_node(wxr, None, span)
        if ipa_text == "":
            continue
        entry = Sound(
            ipa=ipa_text,
            raw_tags=raw_tags,
            categories=category_data.get("categories", []),
        )
        translate_raw_tags(entry)
        results.append(entry)
    return results

141

142

def extract_hyph_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract hyphenation strings from an expanded hyphenation template.

    The template's first positional argument is used as the ``lang``
    attribute value to look for; each matching ``<span>`` with non-empty
    text becomes a ``Sound`` whose only populated field is ``hyphenation``.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    span_texts = (
        clean_node(wxr, None, span)
        for span in root.find_html(
            "span", attr_name="lang", attr_value=lang_code
        )
    )
    return [Sound(hyphenation=text) for text in span_texts if text != ""]

158

159

def extract_audio_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract an audio file reference from an ``audio`` template.

    The file name is read from the template's second positional argument.
    Returns a single sound populated through ``set_sound_file_url_fields``,
    or an empty list when no file name is given.
    """
    file_arg = t_node.template_parameters.get(2, "")
    filename = clean_node(wxr, None, file_arg)
    category_data = {}
    # Cleaning the whole template is done only to collect its categories.
    clean_node(wxr, category_data, t_node)
    if filename == "":
        return []
    audio = Sound(categories=category_data.get("categories", []))
    set_sound_file_url_fields(wxr, filename, audio)
    return [audio]

172

173

def extract_rhyme_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Sound]:
    """Extract rhymes from an expanded rhyme template.

    Each link that is a direct child of the expanded template and has
    non-empty text yields one sound with ``rhymes`` set to that text and
    the categories collected from the expansion.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    category_data = {}
    clean_node(wxr, category_data, root)

    results = []
    for link_node in root.find_child(NodeKind.LINK):
        rhyme = Sound(categories=category_data.get("categories", []))
        link_text = clean_node(wxr, None, link_node)
        if link_text == "":
            continue
        rhyme.rhymes = link_text
        results.append(rhyme)
    return results

190

191

def extract_audio_ipa_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Extract an audio file plus IPA from an ``audio-IPA``/``audio-AFA`` template.

    The file name is read from the second positional argument and the IPA
    text from the third.  Returns a single sound, or an empty list when no
    file name is given.

    ``raw_tags`` collected by the caller are attached to the sound and run
    through ``translate_raw_tags``, matching ``extract_ipa_template``;
    previously this parameter was accepted but silently ignored.
    """
    sounds = []
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    cats = {}
    # Cleaning the whole template is done only to collect its categories.
    clean_node(wxr, cats, t_node)
    if filename != "":
        ipa = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
        sound = Sound(
            ipa=ipa,
            # Copy so later changes to the caller's list cannot leak in.
            raw_tags=list(raw_tags),
            categories=cats.get("categories", []),
        )
        set_sound_file_url_fields(wxr, filename, sound)
        translate_raw_tags(sound)
        sounds.append(sound)
    return sounds