Coverage for src/wiktextract/extractor/ja/sound.py: 90%

122 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import itertools 

2 

3from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import set_sound_file_url_fields 

8from .models import Sound, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_sound_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect sounds from a pronunciation section and attach them to entries.

    When ``base_data.lang_code`` is ``"zh"`` the section is handed to
    ``extract_zh_sounds``; otherwise every template found recursively under
    ``level_node`` is passed to ``process_sound_template``, which may also
    record categories.

    The collected sounds and categories are appended to ``base_data`` when
    the section is a level-3 heading, and to every entry of ``page_data``
    whose ``lang_code`` equals that of ``base_data``.
    """
    # NOTE(review): block nesting was reconstructed from a flattened listing;
    # confirm that the page_data pass is meant to run for every heading level
    # and not only for level-3 sections.
    collected: list[Sound] = []
    category_data: dict[str, list[str]] = {}
    lang_code = base_data.lang_code

    if lang_code != "zh":
        for template in level_node.find_child_recursively(NodeKind.TEMPLATE):
            process_sound_template(wxr, template, collected, category_data)
    else:
        extract_zh_sounds(wxr, level_node, collected)

    categories = category_data.get("categories", [])

    # base_data (if applicable) is updated first, then matching page entries.
    recipients = [base_data] if level_node.kind == NodeKind.LEVEL3 else []
    recipients.extend(
        entry for entry in page_data if entry.lang_code == lang_code
    )
    for entry in recipients:
        entry.sounds.extend(collected)
        entry.categories.extend(categories)

36 

37 

def _clean_positional_args(wxr: WiktextractContext, template_node: TemplateNode):
    """Yield the cleaned text of positional parameters 1, 2, 3, ...

    Iteration stops at the first positional index missing from the
    template's parameters. Values are cleaned lazily, one per step.
    """
    params = template_node.template_parameters
    positions = itertools.takewhile(
        lambda position: position in params, itertools.count(1)
    )
    for position in positions:
        yield clean_node(wxr, None, params[position])


def process_sound_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
    cats: dict[str, list[str]],
) -> None:
    """Convert one pronunciation template into ``Sound`` items.

    Handled template names: ``音声`` (audio file in parameter 2, optional raw
    tag in parameter 3), ``IPA`` / ``X-SAMPA`` (one sound per non-empty
    positional parameter), ``homophones`` (all non-empty positional
    parameters gathered into one sound), ``ja-pron`` and
    ``ja-accent-common`` (delegated to their dedicated processors).

    New items are appended to ``sounds``. Regardless of the template name,
    the template is finally cleaned with ``cats`` as the category
    collector.
    """
    name = template_node.template_name

    if name == "音声":
        params = template_node.template_parameters
        audio_file = clean_node(wxr, None, params.get(2, ""))
        # An empty value or "-" means that no audio file was supplied.
        if audio_file != "" and audio_file != "-":
            sound = Sound()
            raw_tag = clean_node(wxr, None, params.get(3, ""))
            if raw_tag:
                sound.raw_tags.append(raw_tag)
            set_sound_file_url_fields(wxr, audio_file, sound)
            sounds.append(sound)
    elif name in ("IPA", "X-SAMPA"):
        for ipa in _clean_positional_args(wxr, template_node):
            if not ipa:
                continue
            sound = Sound(ipa=ipa)
            if name == "X-SAMPA":
                sound.tags.append("X-SAMPA")
            sounds.append(sound)
    elif name == "homophones":
        homophones = [
            text for text in _clean_positional_args(wxr, template_node) if text
        ]
        if homophones:
            sounds.append(Sound(homophones=homophones))
    elif name == "ja-pron":
        process_ja_pron_template(wxr, template_node, sounds)
    elif name == "ja-accent-common":
        process_ja_accent_common_template(wxr, template_node, sounds)

    # Runs for every template so that category data is gathered into cats.
    clean_node(wxr, cats, template_node)

87 

88 

# Link texts looked up by process_ja_pron_template, mapped to the tag that is
# appended to the sound when a link with that text is found in a list item.
JA_PRON_ACCENTS: dict[str, str] = {
    "中高型": "Nakadaka",
    "平板型": "Heiban",
    "頭高型": "Atamadaka",
    "尾高型": "Odaka",
}

95 

96 

# (substring looked for in a span's class attribute, Sound field to fill),
# checked in this order; only the first match is applied to a span.
_JA_PRON_SPAN_FIELDS = (
    ("IPA", "ipa"),
    ("Latn", "roman"),
    ("Jpan", "form"),
)


def _ja_pron_sound_from_list_item(wxr: WiktextractContext, list_item) -> Sound:
    """Build a ``Sound`` from one list item of an expanded ja-pron template."""
    sound = Sound()
    for span in list_item.find_html_recursively("span"):
        classes = span.attrs.get("class", "")
        if "qualifier-content" in classes:
            qualifier = clean_node(wxr, None, span)
            if qualifier:
                sound.raw_tags.append(qualifier)
            continue
        for marker, field_name in _JA_PRON_SPAN_FIELDS:
            if marker in classes:
                setattr(sound, field_name, clean_node(wxr, None, span))
                break

    # Links that are direct children of the list item may name an accent type.
    for link in list_item.find_child(NodeKind.LINK):
        accent_tag = JA_PRON_ACCENTS.get(clean_node(wxr, None, link))
        if accent_tag is not None:
            sound.tags.append(accent_tag)
    return sound


def process_ja_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract sounds from a ``ja-pron`` template.

    The template is rendered back to wikitext, re-parsed with full
    expansion, and each list item of the result that does not contain a
    table is turned into a ``Sound``; a sound is kept only if at least one
    of its fields differs from the model defaults. Afterwards the ``a`` and
    ``audio`` template parameters are each checked for an audio file name,
    and every non-empty one yields an additional sound.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-pron
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    for list_item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        if list_item.contain_node(NodeKind.TABLE):
            continue
        sound = _ja_pron_sound_from_list_item(wxr, list_item)
        if sound.model_dump(exclude_defaults=True):
            sounds.append(sound)

    params = template_node.template_parameters
    for audio_arg in ("a", "audio"):
        audio_file = clean_node(wxr, None, params.get(audio_arg, ""))
        if audio_file:
            audio_sound = Sound()
            set_sound_file_url_fields(wxr, audio_file, audio_sound)
            sounds.append(audio_sound)

138 

139 

# Values of the first parameter of the ja-accent-common template, mapped to
# the tag that process_ja_accent_common_template appends to the sound.
JA_ACCENT_COMMON_TYPES: dict[str, str] = {
    "h": "Heiban",
    "a": "Atamadaka",
    "n": "Nakadaka",
    "o": "Odaka",
}

146 

147 

def _first_nonempty_clean_text(wxr: WiktextractContext, nodes) -> str:
    """Return the cleaned text of the first node whose text is non-empty.

    Nodes are cleaned one at a time and iteration stops at the first hit;
    an empty string is returned when no node produces any text.
    """
    cleaned_texts = (clean_node(wxr, None, node) for node in nodes)
    return next((text for text in cleaned_texts if text != ""), "")


def process_ja_accent_common_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sounds: list[Sound],
) -> None:
    """Extract one sound from a ``ja-accent-common`` template.

    The template is rendered back to wikitext and re-parsed with full
    expansion. The first link with non-empty text becomes a raw tag, the
    first ``span`` with non-empty text becomes the form, and the first
    template parameter is translated through ``JA_ACCENT_COMMON_TYPES``
    into a tag when it is a known key. The sound is appended to ``sounds``
    only if its form is not the empty string.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:ja-accent-common
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    sound = Sound()

    link_text = _first_nonempty_clean_text(
        wxr, expanded.find_child_recursively(NodeKind.LINK)
    )
    if link_text != "":
        sound.raw_tags.append(link_text)

    form_text = _first_nonempty_clean_text(
        wxr, expanded.find_html_recursively("span")
    )
    if form_text != "":
        sound.form = form_text

    accent_key = clean_node(
        wxr, None, template_node.template_parameters.get(1, "")
    )
    accent_tag = JA_ACCENT_COMMON_TYPES.get(accent_key)
    if accent_tag is not None:
        sound.tags.append(accent_tag)

    if sound.form != "":
        sounds.append(sound)

175 

176 

def extract_zh_sounds(
    wxr: WiktextractContext, level_node: LevelNode, sounds: list[Sound]
) -> None:
    """Build one ``Sound`` per list item found under ``level_node``.

    Each list item's children are split at the first ``":"`` that occurs
    inside a plain-string child: everything before it is the tag part and
    everything after it is the pronunciation part. When no string child
    contains a colon, all children go to the tag part and the pronunciation
    part is empty. The cleaned pronunciation is stored in ``zh_pron``, the
    cleaned tag text becomes the single raw tag, ``translate_raw_tags`` is
    applied, and the sound is appended to ``sounds``.
    """
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        tag_part = []
        value_part = []
        remaining = iter(list_item.children)
        for child in remaining:
            if isinstance(child, str) and ":" in child:
                before, _, after = child.partition(":")
                tag_part.append(before)
                value_part.append(after)
                # Everything following the first colon is pronunciation.
                value_part.extend(remaining)
                break
            tag_part.append(child)

        pronunciation = clean_node(wxr, None, value_part)
        tag_text = clean_node(wxr, None, tag_part)
        sound = Sound(zh_pron=pronunciation, raw_tags=[tag_text])
        translate_raw_tags(sound)
        sounds.append(sound)