Coverage for src/wiktextract/extractor/ko/example.py: 96%

99 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from ..share import calculate_bold_offsets, set_sound_file_url_fields 

7from .models import Example, Sense, Sound 

8 

9 

def extract_example_list_item(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
    lang_code: str,
    parent_example: Example | None = None,
) -> None:
    """Turn one example list item into `Example` data appended to `sense`.

    The children of `list_item` are routed as follows:

    * a ``lang`` template is handled by `extract_example_lang_template`;
      every later child that is not one of the recognised templates is
      collected as translation;
    * a template whose name starts with "따옴" or "지봉유설" supplies
      `example.ref`;
    * a "예문", "ux" or "uxi" template is handled by `extract_ux_template`
      and ends the scan;
    * a nested list ends the scan (nested lists are processed afterwards);
    * a link whose first argument starts with "File:" becomes a sound;
    * anything else is collected as example text.

    If `lang_code` is "zh" and the example text contains "/", two copies of
    the example are stored, tagged "Traditional Chinese" and
    "Simplified Chinese". Nested list items are then processed recursively.

    Args:
        wxr: Extraction context.
        sense: Sense that receives the extracted examples.
        list_item: List item node to read.
        lang_code: Language code of the entry being processed.
        parent_example: Example to keep filling instead of creating a new
            one; used when the parent list item produced no text.
    """
    example = parent_example if parent_example is not None else Example()
    text_nodes = []
    translation_nodes = []
    seen_lang_template = False

    for child in list_item.children:
        if isinstance(child, TemplateNode):
            template_name = child.template_name
            if template_name == "lang":
                seen_lang_template = True
                extract_example_lang_template(wxr, example, child, lang_code)
                continue
            if template_name.startswith(("따옴", "지봉유설")):
                ref_text = clean_node(wxr, None, child)
                example.ref = ref_text.strip("() ").removeprefix("따옴◄")
                continue
            if template_name in ("예문", "ux", "uxi"):
                extract_ux_template(wxr, sense, example, child)
                break

        # NOTE(review): this check runs before the nested-list check, so a
        # nested list that follows a `lang` template is collected as
        # translation as well - confirm this is intended.
        if seen_lang_template:
            translation_nodes.append(child)
            continue

        if not isinstance(child, WikiNode):
            text_nodes.append(child)
            continue

        if child.kind == NodeKind.LIST:
            break

        if child.kind == NodeKind.LINK and child.largs and child.largs[0]:
            link_target = child.largs[0][0]
            if isinstance(link_target, str) and link_target.startswith(
                "File:"
            ):
                sound = Sound()
                file_name = link_target.removeprefix("File:").strip()
                set_sound_file_url_fields(wxr, file_name, sound)
                if sound.audio != "":
                    example.sounds.append(sound)
                # File links never contribute to the example text.
                continue

        text_nodes.append(child)

    plain_text = clean_node(wxr, sense, text_nodes)
    if plain_text != "":
        example.text = plain_text
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(text_nodes)),
            plain_text,
            example,
            "bold_text_offsets",
        )

    translation = clean_node(wxr, sense, translation_nodes)
    if translation != "":
        example.translation = translation

    if example.text != "":
        if lang_code == "zh" and "/" in example.text:
            # Only the first half of the offsets is kept - presumably the
            # bold spans are repeated in both script variants; TODO confirm.
            half = len(example.bold_text_offsets) // 2
            example.bold_text_offsets = example.bold_text_offsets[:half]
            variant_tags = ("Traditional Chinese", "Simplified Chinese")
            variant_texts = example.text.split("/", 1)
            for variant_tag, variant_text in zip(variant_tags, variant_texts):
                variant = example.model_copy(deep=True)
                variant.text = variant_text
                variant.tags.append(variant_tag)
                sense.examples.append(variant)
        else:
            sense.examples.append(example)

    for nested_list in list_item.find_child(NodeKind.LIST):
        for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
            # An example that still has no text is handed down so that the
            # nested item fills it in; otherwise start a fresh example.
            next_example = example if example.text == "" else Example()
            extract_example_list_item(
                wxr, sense, nested_item, lang_code, next_example
            )

98 

99 

def extract_example_lang_template(
    wxr: WiktextractContext,
    example: Example,
    node: TemplateNode,
    lang_code: str,
) -> None:
    """Fill `example` from a ``lang`` template.

    Positional argument 2 provides the example text and argument 4 the
    translation; bold offsets are calculated for both. For "ja" the text
    argument is expanded first and ruby annotations are split off into
    `example.ruby`. For "zh" a parenthesised tail ending the text is moved
    to `example.roman`.

    Args:
        wxr: Extraction context.
        example: Example object that is modified in place.
        node: The ``lang`` template node.
        lang_code: Language code of the entry being processed.
    """
    # https://ko.wiktionary.org/wiki/틀:lang
    params = node.template_parameters

    text_source = params.get(2, "")
    if lang_code == "ja":
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(text_source), expand_all=True
        )
        example.ruby, text_source = extract_ruby(wxr, expanded.children)
    example.text = clean_node(wxr, None, text_source)
    calculate_bold_offsets(
        wxr,
        wxr.wtp.parse(wxr.wtp.node_to_wikitext(text_source)),
        example.text,
        example,
        "bold_text_offsets",
    )

    translation_source = params.get(4, "")
    example.translation = clean_node(wxr, None, translation_source)
    calculate_bold_offsets(
        wxr,
        wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_source)),
        example.translation,
        example,
        "bold_translation_offsets",
    )

    has_roman_tail = (
        lang_code == "zh"
        and "(" in example.text
        and example.text.endswith(")")
    )
    if has_roman_tail:
        # Everything from the first "(" onwards is treated as romanization.
        # NOTE(review): bold_text_offsets were computed on the text before
        # this cut and are not recalculated here - confirm that is fine.
        sentence, _, roman_tail = example.text.partition("(")
        example.roman = roman_tail.strip("() ")
        example.text = sentence.strip()

146 

147 

def extract_ux_template(
    wxr: WiktextractContext,
    sense: Sense,
    example: Example,
    t_node: TemplateNode,
) -> None:
    """Fill `example` from a "예문"/"ux"/"uxi" usage-example template.

    For Japanese (first argument "ja") the template is read from its
    expanded HTML: the span with class "Jpan" gives ruby and text, the span
    with class "tr" gives the romanization, argument 4 the translation,
    ``lit`` the literal meaning and ``ref`` the reference. For every other
    language argument 2 is the text, argument 3 the translation, ``footer``
    the note, and ``출처`` then ``source`` the reference. A reference that is
    already set on `example` is never overwritten.

    Args:
        wxr: Extraction context.
        sense: Sense passed to `clean_node` for the top-level links of the
            expanded template.
        example: Example object that is modified in place.
        t_node: The usage-example template node.
    """
    # https://ko.wiktionary.org/wiki/틀:ux
    # https://ko.wiktionary.org/wiki/모듈:usex/templates

    def fill_field(field: str, offsets_field: str, source) -> None:
        # Store the cleaned text of `source` in `field`, then record the
        # offsets of its bold parts in `offsets_field`.
        setattr(example, field, clean_node(wxr, None, source))
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(source)),
            getattr(example, field),
            example,
            offsets_field,
        )

    params = t_node.template_parameters
    lang_code = params.get(1, "")
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )

    if lang_code == "ja":
        for span_tag in expanded_node.find_html_recursively("span"):
            span_class = span_tag.attrs.get("class", "")
            if span_class == "Jpan":
                example.ruby, no_ruby = extract_ruby(wxr, span_tag)
                fill_field("text", "bold_text_offsets", no_ruby)
            elif span_class == "tr":
                fill_field("roman", "bold_roman_offsets", span_tag)
        fill_field(
            "translation", "bold_translation_offsets", params.get(4, "")
        )
        fill_field(
            "literal_meaning", "bold_literal_offsets", params.get("lit", "")
        )
        ref_keys = ("ref",)
    else:
        fill_field("text", "bold_text_offsets", params.get(2, ""))
        fill_field(
            "translation", "bold_translation_offsets", params.get(3, "")
        )
        example.note = clean_node(wxr, None, params.get("footer", ""))
        ref_keys = ("출처", "source")

    # Try the reference parameters in order while no reference is set.
    for ref_key in ref_keys:
        if example.ref == "":
            example.ref = clean_node(wxr, None, params.get(ref_key, ""))

    # The return value is discarded; the call is presumably made for the
    # data clean_node records on `sense` (e.g. categories) - TODO confirm.
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)