Coverage for src/wiktextract/extractor/cs/sound.py: 72%

1from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

3from ...page import clean_node

4from ...wxr_context import WiktextractContext

5from ..share import set_sound_file_url_fields

6from .models import Form, Hyphenation, Sound, WordEntry

7from .tags import translate_raw_tags

def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Collect pronunciation data from the list items under ``level_node``.

    Within one list item, italic text and ``Příznak2`` templates accumulate
    raw tags.  The next ``IPA``, ``IPA2`` or ``Audio`` template receives the
    accumulated tags, after which the accumulator is emptied so later
    templates in the same item start from scratch.
    """
    # Templates that produce sounds; each one consumes the pending tags.
    sound_handlers = {
        "IPA": extract_ipa_template,
        "IPA2": extract_ipa2_template,
        "Audio": extract_audio_template,
    }
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            pending_tags = []
            for child in list_item.children:
                if isinstance(child, TemplateNode):
                    handler = sound_handlers.get(child.template_name)
                    if handler is not None:
                        handler(wxr, base_data, child, pending_tags)
                        pending_tags.clear()
                    elif child.template_name == "Příznak2":
                        pending_tags.extend(
                            extract_příznak2_template(wxr, child)
                        )
                    continue
                if (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.ITALIC
                ):
                    # Italic text is treated as a single raw tag.
                    italic_text = clean_node(wxr, None, child)
                    if italic_text != "":
                        pending_tags.append(italic_text)

def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    raw_tags: list[str],
):
    """Expand an ``IPA`` template and append one sound per transcription.

    The template is expanded, every ``<span class="IPA">`` in the result is
    cleaned to text, and that text is split on commas.  Each non-empty piece
    becomes a ``Sound`` carrying ``raw_tags``, appended to
    ``base_data.sounds``.
    """
    # https://cs.wiktionary.org/wiki/Šablona:IPA
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(template_wikitext, expand_all=True)
    ipa_spans = expanded_node.find_html(
        "span", attr_name="class", attr_value="IPA"
    )
    for span_tag in ipa_spans:
        span_text = clean_node(wxr, None, span_tag)
        for piece in map(str.strip, span_text.split(",")):
            if piece == "":
                continue
            sound = Sound(ipa=piece, raw_tags=raw_tags)
            translate_raw_tags(sound)
            base_data.sounds.append(sound)
    # Second pass over the whole expansion with base_data as the target;
    # presumably this lets clean_node record page-level data on it --
    # TODO confirm against clean_node.
    clean_node(wxr, base_data, expanded_node)

def extract_ipa2_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    raw_tags: list[str],
):
    """Append a sound built from the first argument of an ``IPA2`` template.

    The cleaned argument is wrapped in square brackets before being stored.
    Nothing is appended when the argument cleans to an empty string.
    """
    # https://cs.wiktionary.org/wiki/Šablona:IPA2
    first_arg = t_node.template_parameters.get(1, "")
    ipa = clean_node(wxr, None, first_arg)
    if ipa == "":
        return
    sound = Sound(ipa=f"[{ipa}]", raw_tags=raw_tags)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)

def extract_audio_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
    raw_tags: list[str],
):
    """Append a sound for the file named by an ``Audio`` template.

    The first template argument is cleaned to text and passed to
    ``set_sound_file_url_fields`` together with a new ``Sound`` carrying
    ``raw_tags``.  Nothing is appended when the argument cleans to an empty
    string.
    """
    # https://cs.wiktionary.org/wiki/Šablona:Audio
    first_arg = t_node.template_parameters.get(1, "")
    file_name = clean_node(wxr, None, first_arg)
    if file_name == "":
        return
    sound = Sound(raw_tags=raw_tags)
    set_sound_file_url_fields(wxr, file_name, sound)
    translate_raw_tags(sound)
    base_data.sounds.append(sound)

def extract_příznak2_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[str]:
    """Return the comma-separated labels rendered by a ``Příznak2`` template.

    The cleaned template text has surrounding parentheses and spaces
    stripped, is split on commas, and empty pieces are dropped.
    """
    label_text = clean_node(wxr, None, t_node).strip("() ")
    stripped_labels = (label.strip() for label in label_text.split(","))
    return [label for label in stripped_labels if label != ""]

100

def extract_hyphenation_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Append one ``Hyphenation`` per list item found under ``level_node``.

    The cleaned text of each list item is split on ``-``; the parts are
    stripped and empty ones discarded.  Items that yield no parts are skipped.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            item_text = clean_node(wxr, None, list_item.children)
            stripped_parts = (part.strip() for part in item_text.split("-"))
            parts = [part for part in stripped_parts if part]
            if not parts:
                continue
            base_data.hyphenations.append(Hyphenation(parts=parts))

110

111

def extract_homophone_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Append a homophone ``Sound`` for every link in the section's lists.

    Only link nodes that are direct children of a list item are considered;
    links whose cleaned text is empty are ignored.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for link_node in list_item.find_child(NodeKind.LINK):
                link_text = clean_node(wxr, None, link_node)
                if link_text == "":
                    continue
                base_data.sounds.append(Sound(homophone=link_text))

121

122

def extract_transcript_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Extract transcription data from the list items under ``level_node``.

    Each child of a list item is handled in one of three ways:

    * ``Hiragana`` / ``Rómadži`` / ``Kana`` templates are passed to
      ``extract_ja_transcript_template``.
    * ``Pinyin`` / ``Švarný`` / ``bopomofo`` templates are passed to
      ``extract_zh_transcript_template``.
    * A non-empty italic node is used as a label (trailing ``:`` removed).
      Everything after it in the list item is cleaned to text and stored on
      a new ``Sound``, and the rest of the item is not scanned any further.
    """
    ja_template_names = ("Hiragana", "Rómadži", "Kana")
    zh_template_names = ("Pinyin", "Švarný", "bopomofo")
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for index, node in enumerate(list_item.children):
                is_template = isinstance(node, TemplateNode)
                if is_template and node.template_name in ja_template_names:
                    extract_ja_transcript_template(wxr, word_entry, node)
                elif is_template and node.template_name in zh_template_names:
                    extract_zh_transcript_template(wxr, word_entry, node)
                elif isinstance(node, WikiNode) and (
                    node.kind == NodeKind.ITALIC
                ):
                    label = clean_node(wxr, None, node).removesuffix(":")
                    if label == "":
                        continue
                    sound = Sound(raw_tags=[label])
                    # The transcription is whatever follows the label.
                    trailing_text = clean_node(
                        wxr, None, list_item.children[index + 1 :]
                    )
                    if label in ("Pinyin", "Bopomofo"):
                        sound.zh_pron = trailing_text
                    else:
                        sound.other = trailing_text
                    translate_raw_tags(sound)
                    word_entry.sounds.append(sound)
                    # The remainder was consumed above; stop scanning.
                    break

158

159

def extract_ja_transcript_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Expand a transcript template and store its spans as forms.

    Every ``<span>`` in the expansion that has a non-empty ``class`` not
    ending in ``-title`` and non-empty text becomes a ``Form``.  The span
    text is the form and the class value is its single raw tag.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_template = wxr.wtp.parse(template_wikitext, expand_all=True)
    for span_tag in expanded_template.find_html("span"):
        css_class = span_tag.attrs.get("class", "")
        if css_class.endswith("-title") or css_class == "":
            continue
        form_text = clean_node(wxr, None, span_tag)
        if form_text == "":
            continue
        form = Form(form=form_text, raw_tags=[css_class])
        translate_raw_tags(form)
        word_entry.forms.append(form)
    # Second pass over the whole expansion with word_entry as the target;
    # presumably this lets clean_node record page-level data on it --
    # TODO confirm against clean_node.
    clean_node(wxr, word_entry, expanded_template)

175

176

def extract_zh_transcript_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Expand a transcript template and store its ``lang="zh"`` spans as sounds.

    Spans whose class ends in ``-title`` update the current label (trailing
    ``:`` removed).  Each later span with ``lang="zh"`` and non-empty text
    becomes a ``Sound`` whose ``zh_pron`` is the span text, tagged with the
    most recent label when there is one.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_template = wxr.wtp.parse(template_wikitext, expand_all=True)
    current_label = ""
    for span_tag in expanded_template.find_html("span"):
        if span_tag.attrs.get("class", "").endswith("-title"):
            current_label = clean_node(wxr, None, span_tag).removesuffix(":")
            continue
        if span_tag.attrs.get("lang", "") != "zh":
            continue
        pron = clean_node(wxr, None, span_tag)
        if pron == "":
            continue
        sound = Sound(zh_pron=pron)
        if current_label != "":
            sound.raw_tags.append(current_label)
            translate_raw_tags(sound)
        word_entry.sounds.append(sound)

Coverage for src/wiktextract/extractor/cs/sound.py: 72%

118 statements