Coverage for src/wiktextract/extractor/de/pronunciation.py: 55%

90 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from typing import Union 

2 

3from mediawiki_langcodes import code_to_name 

4from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..share import create_audio_url_dict 

9from .models import Sound, WordEntry 

10 

11 

def extract_pronunciation(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
):
    """Collect sound data from the lists of a pronunciation section.

    For every LIST child of ``level_node`` a fresh working list of ``Sound``
    objects is built. Each list item is expected to start with a template
    ("IPA", "Hörbeispiele" or "Reime") followed by at least one more child;
    the remaining children are handed to the matching ``process_*`` helper.
    Entries that still equal a default ``Sound`` afterwards are discarded and
    the rest is appended to ``word_entry.sounds``.

    Anything that does not fit this layout is only reported through
    ``wxr.wtp.debug`` and otherwise ignored.
    """
    for pron_list in level_node.find_child(NodeKind.LIST):
        sounds: list[Sound] = [Sound()]

        # Report everything inside the list that is not a list item.
        for stray_node in pron_list.invert_find_child(NodeKind.LIST_ITEM):
            wxr.wtp.debug(
                "Found unexpected non-list-item node in pronunciation "
                f"section: {stray_node}",
                sortid="extractor/de/pronunciation/extract_pronunciation/28",
            )

        for item in pron_list.find_child(NodeKind.LIST_ITEM):
            parts = list(item.filter_empty_str_child())
            if not parts:
                continue

            head = parts[0]
            tail = parts[1:]
            # A usable item is "<template> <something...>"; a lone template
            # or a non-template first child is reported and skipped.
            if not (
                isinstance(head, WikiNode)
                and head.kind == NodeKind.TEMPLATE
                and tail
            ):
                wxr.wtp.debug(
                    "Found unexpected non-template node in pronunciation "
                    f"section: {head}",
                    sortid="extractor/de/pronunciation/43",
                )
                continue

            name = head.template_name
            if name == "IPA":
                process_ipa(wxr, sounds, tail)
            elif name == "Hörbeispiele":
                # Audio examples always start in a fresh Sound entry.
                sounds.append(Sound())
                process_hoerbeispiele(wxr, sounds, tail)
            elif name == "Reime":
                process_rhymes(wxr, sounds, tail, word_entry)
            else:
                wxr.wtp.debug(
                    f"Unexpected template in pronunciation section: {head} "
                    f"with content {tail}",
                    sortid="extractor/de/pronunciation/58)",
                )

        # Keep only entries that differ from a default-constructed Sound.
        filled = [
            sound
            for sound in sounds
            if sound.model_dump(exclude_defaults=True)
        ]
        if filled:
            word_entry.sounds.extend(filled)

    for stray_node in level_node.invert_find_child(NodeKind.LIST):
        wxr.wtp.debug(
            f"Unexpected non-list node in pronunciation section: {stray_node}",
            sortid="extractor/de/pronunciation/extract_pronunciation/64",
        )

75 

76 

def process_ipa(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    nodes: list[Union[WikiNode, str]],
):
    """Process the children that follow an "IPA" head template.

    Each node is classified in this order: a "Lautschrift" template is
    forwarded to ``process_lautschrift_template``; any other tag-like node
    is added as a raw tag to the last entry of ``sound_data``; a "," or ";"
    separator string starts a new ``Sound`` entry; everything else is
    reported via ``wxr.wtp.debug``.
    """
    for child in nodes:
        # The Lautschrift check must come first: is_tag_node() also accepts
        # templates.
        if is_template_node_with_name(child, "Lautschrift"):
            process_lautschrift_template(wxr, sound_data, child)
            continue
        if is_tag_node(child):
            append_tag(wxr, sound_data[-1], child)
            continue
        if is_new_sound_data_entry_sep(child):
            sound_data.append(Sound())
            continue
        wxr.wtp.debug(
            "Found unexpected non-Lautschrift node in IPA section: "
            f"{child}",
            sortid="extractor/de/pronunciation/process_ipa/57",
        )

94 

95 

def process_lautschrift_template(
    wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
) -> None:
    """Store the data of one "Lautschrift" template in ``sound_data``.

    The first positional template argument is used as the ``ipa`` value
    (empty string when missing). When the ``spr`` argument is present and
    truthy it is stored as ``lang_code`` together with the language name
    returned by ``code_to_name(lang_code, "de")``.
    """
    params = node.template_parameters

    # Key order (lang_code, lang, ipa) mirrors the order in which the values
    # are later applied to the Sound entry.
    fields = {}
    spr = params.get("spr")
    if spr:
        fields["lang_code"] = spr
        fields["lang"] = code_to_name(spr, "de")
    fields["ipa"] = params.get(1, "")

    add_sound_data_without_appending_to_existing_properties(
        wxr, sound_data, fields
    )

120 

121 

def process_hoerbeispiele(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    nodes: list[Union[str, WikiNode]],
):
    """Process the children that follow a "Hörbeispiele" head template.

    Each node is classified in this order: an "Audio" template is forwarded
    to ``process_audio_template``; any other tag-like node is added as a raw
    tag to the last entry of ``sound_data``; a "," or ";" separator string
    starts a new ``Sound`` entry; everything else is reported via
    ``wxr.wtp.debug``.
    """
    for child in nodes:
        # The Audio check must come first: is_tag_node() also accepts
        # templates.
        if is_template_node_with_name(child, "Audio"):
            process_audio_template(wxr, sound_data, child)
            continue
        if is_tag_node(child):
            append_tag(wxr, sound_data[-1], child)
            continue
        if is_new_sound_data_entry_sep(child):
            sound_data.append(Sound())
            continue
        wxr.wtp.debug(
            f"Found unexpected node in Hoerbeispiele section: {child}",
            sortid="extractor/de/pronunciation/process_hoerbeispiele/193",
        )

139 

140 

def process_audio_template(
    wxr: WiktextractContext, sound_data: list[Sound], node: WikiNode
):
    """Store the audio file named by an "Audio" template in ``sound_data``.

    The first positional template argument, stripped of surrounding
    whitespace, is taken as the file name. Nothing is stored when it is
    empty; otherwise the dict built by ``create_audio_url_dict`` is added.
    """
    file_name = node.template_parameters.get(1, "").strip()
    if not file_name:
        return
    audio_fields = create_audio_url_dict(file_name)
    add_sound_data_without_appending_to_existing_properties(
        wxr, sound_data, audio_fields
    )

149 

150 

def process_rhymes(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    nodes: list[WikiNode],
    word_entry: WordEntry,
):
    """Append one ``Sound(rhymes=...)`` entry per non-empty "Reim" template.

    Nodes that are not "Reim" template nodes are silently skipped, as are
    templates whose cleaned text is empty.
    """
    for candidate in nodes:
        if not isinstance(candidate, TemplateNode):
            continue
        if candidate.template_name != "Reim":
            continue
        # https://de.wiktionary.org/wiki/Vorlage:Reime
        rhyme_text = clean_node(wxr, word_entry, candidate)
        if rhyme_text == "":
            continue
        sound_data.append(Sound(rhymes=rhyme_text))

163 

164 

def is_template_node_with_name(node: Union[WikiNode, str], template_name: str):
    """Return True if ``node`` is a template node named ``template_name``."""
    if not isinstance(node, WikiNode):
        return False
    if node.kind != NodeKind.TEMPLATE:
        return False
    return node.template_name == template_name

171 

172 

def add_sound_data_without_appending_to_existing_properties(
    wxr: WiktextractContext,
    sound_data: list[Sound],
    new_sound_data: dict,
):
    """Write ``new_sound_data`` into the last entry of ``sound_data``.

    A new ``Sound`` is appended first if the current last entry already has
    a non-default value for any key in ``new_sound_data``, so existing
    values are never overwritten or extended. An empty ``sound_data`` list
    is seeded with a fresh ``Sound`` instead of raising ``IndexError``.

    String values are assigned with ``setattr``; any other value is passed
    to ``.extend()`` of the existing attribute. Keys that are not fields of
    the model are reported via ``wxr.wtp.debug`` and skipped.
    """
    if not sound_data:
        sound_data.append(Sound())
    else:
        # Dump the model once instead of once per key.
        populated = sound_data[-1].model_dump(exclude_defaults=True)
        if any(key in populated for key in new_sound_data):
            sound_data.append(Sound())

    target = sound_data[-1]
    # Read model_fields from the class: instance access is deprecated in
    # recent Pydantic releases.
    known_fields = type(target).model_fields
    for key, value in new_sound_data.items():
        if key not in known_fields:
            wxr.wtp.debug(
                f"Unexpected key {key} for Sound",
                sortid="extractor/de/pronunciation/196",
            )
            continue
        if isinstance(value, str):
            setattr(target, key, value)
        else:
            # Non-string values are extended onto the existing attribute.
            getattr(target, key).extend(value)

198 

199 

def is_tag_node(node: Union[WikiNode, str]):
    """Return True if ``node`` is a WikiNode of kind TEMPLATE or ITALIC."""
    if not isinstance(node, WikiNode):
        return False
    return node.kind in (NodeKind.TEMPLATE, NodeKind.ITALIC)

205 

206 

def append_tag(wxr: WiktextractContext, sound_data: Sound, node: WikiNode):
    """Append the cleaned text of ``node`` to ``sound_data.raw_tags``.

    Nothing is appended when the cleaned text is empty.
    """
    tag_text = clean_node(wxr, None, node)
    if tag_text == "":
        return
    sound_data.raw_tags.append(tag_text)

211 

212 

def is_new_sound_data_entry_sep(node: Union[WikiNode, str]):
    """Return True if ``node`` is a string that is just "," or ";".

    Surrounding whitespace is ignored; non-string nodes are never
    separators.
    """
    if not isinstance(node, str):
        return False
    return node.strip() in (",", ";")