Coverage for src/wiktextract/extractor/ko/example.py: 58%

163 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from ..share import calculate_bold_offsets, set_sound_file_url_fields 

7from .models import Example, Sense, Sound 

8from .tags import translate_raw_tags 

9 

10 

def extract_example_list_item(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
    lang_code: str,
    parent_example: Example | None = None,
) -> None:
    """Extract one example from a wiki list item and attach it to *sense*.

    Dispatches on the templates found in the list item: ``lang`` (inline
    example text, possibly with ruby), quote-source templates (``따옴…``,
    ``지봉유설``), usex templates (``예문``/``ux``/``uxi``) and Chinese
    usex templates (``zh-x``/``zh-usex``).  Plain nodes before the first
    ``lang`` template are treated as example text; nodes after it as the
    translation.  Nested sub-lists are processed recursively.

    ``parent_example`` lets a nested list item continue filling a
    partially-built example from its parent list item.
    """
    example = Example() if parent_example is None else parent_example
    e_text_nodes = []  # plain nodes that form the example text
    e_tr_nodes = []  # nodes after a "lang" template: the translation
    after_lang_template = False
    for node in list_item.children:
        if isinstance(node, TemplateNode) and node.template_name == "lang":
            after_lang_template = True
            extract_example_lang_template(wxr, example, node, lang_code)
        elif isinstance(node, TemplateNode) and node.template_name.startswith(
            ("따옴", "지봉유설")
        ):
            # Quote-source template; "따옴◄" is a rendering artifact prefix.
            example.ref = (
                clean_node(wxr, None, node).strip("() ").removeprefix("따옴◄")
            )
        elif isinstance(node, TemplateNode) and node.template_name in [
            "예문",
            "ux",
            "uxi",
        ]:
            # Usex template carries the whole example; ignore the rest.
            extract_ux_template(wxr, sense, example, node)
            break
        elif isinstance(node, TemplateNode) and node.template_name in [
            "zh-x",
            "zh-usex",
        ]:
            # zh-x adds its examples to the sense directly; ignore the rest.
            extract_template_zh_x(wxr, sense, node)
            break
        elif after_lang_template:
            e_tr_nodes.append(node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Nested lists are handled after the loop; stop collecting text.
            break
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LINK
            and len(node.largs) > 0
            and len(node.largs[0]) > 0
            and isinstance(node.largs[0][0], str)
            and node.largs[0][0].startswith("File:")
        ):
            # Audio file link for this example.
            sound = Sound()
            sound_file = node.largs[0][0].removeprefix("File:").strip()
            set_sound_file_url_fields(wxr, sound_file, sound)
            if sound.audio != "":
                example.sounds.append(sound)
        else:
            e_text_nodes.append(node)

    e_text = clean_node(wxr, sense, e_text_nodes)
    if e_text != "":
        example.text = e_text
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(e_text_nodes)),
            e_text,
            example,
            "bold_text_offsets",
        )
    e_tr = clean_node(wxr, sense, e_tr_nodes)
    if e_tr != "":
        example.translation = e_tr

    if len(example.text) > 0:
        if lang_code == "zh" and "/" in example.text:
            # "traditional/simplified" pair: the two halves share the first
            # half of the bold offsets, then the text is split into two
            # examples, one per script.
            example.bold_text_offsets = example.bold_text_offsets[
                : len(example.bold_text_offsets) // 2
            ]
            for index, text in enumerate(example.text.split("/", 1)):
                new_example = example.model_copy(deep=True)
                new_example.text = text
                new_example.tags.append(
                    "Traditional-Chinese"
                    if index == 0
                    else "Simplified-Chinese"
                )
                sense.examples.append(new_example)
        else:
            sense.examples.append(example)

    # Recurse into child lists; if this item produced no text, let the
    # child continue filling the current example instead of a new one.
    for nested_list in list_item.find_child(NodeKind.LIST):
        for nested_list_item in nested_list.find_child(NodeKind.LIST_ITEM):
            extract_example_list_item(
                wxr,
                sense,
                nested_list_item,
                lang_code,
                example if example.text == "" else Example(),
            )

105 

106 

def extract_example_lang_template(
    wxr: WiktextractContext,
    example: Example,
    node: TemplateNode,
    lang_code: str,
) -> None:
    """Fill *example* from a ``lang`` template.

    https://ko.wiktionary.org/wiki/틀:lang

    For Japanese the second argument is expanded and ruby annotations are
    separated from the text; for other languages the second argument is
    the text and the fourth the translation.  For Chinese, a trailing
    parenthesized romanization is split off into ``example.roman``.
    """
    # https://ko.wiktionary.org/wiki/틀:lang
    if lang_code == "ja":
        # Expand the wikitext so ruby markup becomes visible, then strip it.
        example.ruby, text_nodes = extract_ruby(
            wxr,
            wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node.template_parameters.get(2, "")),
                expand_all=True,
            ).children,
        )
        example.text = clean_node(wxr, None, text_nodes)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(text_nodes)),
            example.text,
            example,
            "bold_text_offsets",
        )
    else:
        second_arg = node.template_parameters.get(2, "")
        example.text = clean_node(wxr, None, second_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(second_arg)),
            example.text,
            example,
            "bold_text_offsets",
        )
        tr_arg = node.template_parameters.get(4, "")
        example.translation = clean_node(wxr, None, tr_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(tr_arg)),
            example.translation,
            example,
            "bold_translation_offsets",
        )
    # Chinese text like "漢字 (hànzì)": move the parenthesized part to roman.
    if lang_code == "zh" and "(" in example.text and example.text.endswith(")"):
        roman_start_index = example.text.index("(")
        example.roman = example.text[roman_start_index:].strip("() ")
        example.text = example.text[:roman_start_index].strip()

153 

154 

def extract_ux_template(
    wxr: WiktextractContext,
    sense: Sense,
    example: Example,
    t_node: TemplateNode,
) -> None:
    """Fill *example* from a ``예문``/``ux``/``uxi`` template.

    https://ko.wiktionary.org/wiki/틀:ux
    https://ko.wiktionary.org/wiki/모듈:usex/templates

    For Japanese the template is expanded and the text ("Jpan" span, with
    ruby stripped) and romanization ("tr" span) are read from the HTML;
    translation, literal meaning and reference come from the template
    arguments.  For other languages text/translation are plain template
    arguments 2 and 3.  Category links in the expansion are recorded on
    *sense* via ``clean_node``.
    """
    # https://ko.wiktionary.org/wiki/틀:ux
    # https://ko.wiktionary.org/wiki/모듈:usex/templates
    lang_code = t_node.template_parameters.get(1, "")
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    if lang_code == "ja":
        for span_tag in expanded_node.find_html_recursively("span"):
            span_class = span_tag.attrs.get("class", "")
            if span_class == "Jpan":
                # Japanese text with ruby annotations separated out.
                example.ruby, no_ruby = extract_ruby(wxr, span_tag)
                example.text = clean_node(wxr, None, no_ruby)
                calculate_bold_offsets(
                    wxr,
                    wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby)),
                    example.text,
                    example,
                    "bold_text_offsets",
                )
            elif span_class == "tr":
                # Romanization span.
                example.roman = clean_node(wxr, None, span_tag)
                calculate_bold_offsets(
                    wxr,
                    wxr.wtp.parse(wxr.wtp.node_to_wikitext(span_tag)),
                    example.roman,
                    example,
                    "bold_roman_offsets",
                )
        tr_arg = t_node.template_parameters.get(4, "")
        example.translation = clean_node(wxr, None, tr_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(tr_arg)),
            example.translation,
            example,
            "bold_translation_offsets",
        )
        lit_arg = t_node.template_parameters.get("lit", "")
        example.literal_meaning = clean_node(wxr, None, lit_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(lit_arg)),
            example.literal_meaning,
            example,
            "bold_literal_offsets",
        )
        # Don't overwrite a reference already set by a quote template.
        if example.ref == "":
            example.ref = clean_node(
                wxr, None, t_node.template_parameters.get("ref", "")
            )
    else:
        second_arg = t_node.template_parameters.get(2, "")
        example.text = clean_node(wxr, None, second_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(second_arg)),
            example.text,
            example,
            "bold_text_offsets",
        )
        third_arg = t_node.template_parameters.get(3, "")
        example.translation = clean_node(wxr, None, third_arg)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(third_arg)),
            example.translation,
            example,
            "bold_translation_offsets",
        )
        example.note = clean_node(
            wxr, None, t_node.template_parameters.get("footer", "")
        )
        # Reference fallbacks: "출처" (source in Korean), then "source".
        if example.ref == "":
            example.ref = clean_node(
                wxr, None, t_node.template_parameters.get("출처", "")
            )
        if example.ref == "":
            example.ref = clean_node(
                wxr, None, t_node.template_parameters.get("source", "")
            )

    # Record category links produced by the expanded template.
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)

244 

245 

def extract_template_zh_x(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Process a ``zh-x``/``zh-usex`` template and append its examples
    to *sense*.

    The template is expanded and parsed; examples are read from ``<dl>``
    blocks when present, otherwise from top-level spans.  The template's
    second argument supplies the shared translation.
    """
    rendered = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )

    collected = []
    for dl_tag in rendered.find_html("dl"):
        collected.extend(extract_zh_x_dl_tag(wxr, dl_tag))
    if not collected:
        # No <dl> layout in the expansion; scan the bare spans instead.
        collected.extend(extract_zh_x_no_dl_tag(wxr, rendered))

    tr_arg = t_node.template_parameters.get(2, "")
    tr_text = clean_node(wxr, None, tr_arg)
    for ex in collected:
        ex.translation = tr_text
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(tr_arg)),
            tr_text,
            ex,
            "bold_translation_offsets",
        )
        translate_raw_tags(ex)

    # Record category links emitted by the expanded template.
    for link_node in rendered.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)

    sense.examples.extend(collected)

277 

278 

def extract_zh_x_dl_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode
) -> list[Example]:
    """Parse the ``<dl>`` layout of an expanded ``zh-x`` template.

    Direct ``<span>`` children either carry example text (those with a
    "lang" attribute) or bracketed tag labels for the preceding example;
    ``<dd>`` children carry the romanization and tag labels shared by
    all examples.
    """
    results = []
    # Phase 1: example-text spans and per-example bracketed labels.
    for span_tag in dl_tag.find_html("span"):
        if "lang" not in span_tag.attrs:
            # e.g. "[Classical Chinese]" — attach to the last example.
            label_str = clean_node(wxr, None, span_tag).strip("[] ")
            for label in label_str.split(","):
                label = label.strip()
                if label != "" and len(results) > 0:
                    results[-1].raw_tags.append(label)
            continue
        e_text = clean_node(wxr, None, span_tag)
        if e_text == "":
            continue
        new_example = Example(text=e_text)
        calculate_bold_offsets(
            wxr, span_tag, e_text, new_example, "bold_text_offsets"
        )
        results.append(new_example)

    # Phase 2: romanization and shared labels from the <dd> children.
    for dd_tag in dl_tag.find_html("dd"):
        for span_tag in dd_tag.find_html("span"):
            if "Latn" in span_tag.attrs.get("lang", ""):
                roman_text = clean_node(wxr, None, span_tag)
                for ex in results:
                    ex.roman = roman_text
                    calculate_bold_offsets(
                        wxr, span_tag, roman_text, ex, "bold_roman_offsets"
                    )
            else:
                label = clean_node(wxr, None, span_tag).strip("[] ")
                if label != "":
                    for ex in results:
                        ex.raw_tags.append(label)
    return results

313 

314 

def extract_zh_x_no_dl_tag(
    wxr: WiktextractContext, expanded_node: WikiNode
) -> list[Example]:
    """Parse a ``zh-x`` expansion that has no ``<dl>`` wrapper.

    Spans with lang "zh-Hant"/"zh-Hans" each become one example (tagged
    by script); a "zh-Latn" span supplies the romanization for every
    example collected so far.
    """
    results = []
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang == "zh-Latn":
            # Romanization applies to all previously collected examples.
            roman_text = clean_node(wxr, None, span_tag)
            for ex in results:
                ex.roman = roman_text
                calculate_bold_offsets(
                    wxr, span_tag, roman_text, ex, "bold_roman_offsets"
                )
        elif span_lang in ("zh-Hant", "zh-Hans"):
            new_example = Example(text=clean_node(wxr, None, span_tag))
            new_example.tags.append(
                "Traditional-Chinese"
                if span_lang == "zh-Hant"
                else "Simplified-Chinese"
            )
            # Only keep the example when the span produced real text.
            if new_example.text != "":
                calculate_bold_offsets(
                    wxr,
                    span_tag,
                    new_example.text,
                    new_example,
                    "bold_text_offsets",
                )
                results.append(new_example)

    return results