Coverage for src/wiktextract/extractor/en/example.py: 60%

182 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-11 10:26 +0000

1from copy import deepcopy 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...tags import valid_tags 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from ..share import calculate_bold_offsets 

10from .type_utils import ExampleData, SenseData 

11 

12 

def extract_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sense_data: SenseData,
    parent_data: ExampleData,
) -> list[ExampleData]:
    """Collect example/quotation data from one gloss-list item.

    Dispatches on the name of each template found directly under the
    list item: Chinese ("zh-x" family), Japanese ("ja-usex" family),
    quotation ("quote-*"/"RQ:*"/"quote") and generic usage-example
    templates each have a dedicated extractor.  ``parent_data`` carries
    inherited fields (tags, etc.) merged into every produced example.
    """
    collected: list[ExampleData] = []
    for t_node in list_item.find_child(NodeKind.TEMPLATE):
        t_name = t_node.template_name
        if t_name in ("zh-x", "zh-usex", "zh-q"):
            collected.extend(
                extract_template_zh_x(wxr, t_node, sense_data, parent_data)
            )
        elif t_name in ("ja-usex", "ja-x", "ja-ux"):
            collected.append(
                extract_template_ja_usex(wxr, t_node, sense_data, parent_data)
            )
        elif t_name.startswith(("quote-", "RQ:")) or t_name == "quote":
            quote_data = extract_quote_templates(wxr, t_node, sense_data)
            if not list_item.contain_node(NodeKind.LIST):
                collected.append(quote_data)
            else:
                # Child list items hold the quoted text; the quote
                # template itself contributes the shared citation data,
                # which is passed down as the new parent.
                for child_item in list_item.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    for field in ("tags", "raw_tags"):
                        if field not in quote_data:
                            quote_data[field] = []
                    collected.extend(
                        extract_example_list_item(
                            wxr, child_item, sense_data, quote_data
                        )
                    )
        elif t_name in (
            "ux",
            "usex",
            "uxi",
            "ko-usex",
            "koex",
            "ko-x",
            "th-usex",
            "th-x",
            "th-xi",
            "uxa",
            "collocation",
            "co",
            "coi",
        ):
            inherited = deepcopy(parent_data)
            if t_name in ("collocation", "co", "coi"):
                inherited["tags"].append("collocation")
            collected.append(
                extract_ux_template(wxr, t_node, sense_data, inherited)
            )

    return collected

86 

87 

def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, sense_data: SenseData
) -> ExampleData:
    """Extract a quotation from a "quote-*", "RQ:*" or "quote" template.

    Expands the template and pulls the citation source, quoted text,
    translation and transliteration out of the rendered HTML by CSS
    class.  Returns a new ExampleData of type "quote" (empty fields are
    removed by ``clean_example_empty_data``).
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Run clean_node over the whole expansion so categories/links are
    # recorded on the sense before we pick out individual spans.
    clean_node(wxr, sense_data, expanded_node)
    example_data = ExampleData(
        text="", ref="", english="", roman="", type="quote"
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "cited-source" == span_class:
            example_data["ref"] = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            example_data["text"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["text"],
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in span_class:
            example_data["english"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["english"],
                example_data,
                "bold_english_offsets",
            )
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data["roman"] = clean_node(wxr, None, i_tag)
        # BUG FIX: offsets must be computed from the <i> tag holding the
        # transliteration; previously this passed ``span_tag``, a stale
        # (or unbound) variable from the loop above.
        calculate_bold_offsets(
            wxr,
            i_tag,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
        break  # only the first transliteration element is used
    clean_example_empty_data(example_data)
    return example_data

134 

135 

def extract_template_ja_usex(
    wxr: WiktextractContext,
    node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    # https://en.wiktionary.org/wiki/Template:ja-usex
    """Extract a Japanese usage example rendered by {{ja-usex}}.

    Fills ``example_data`` in place with text (ruby stripped), ruby
    annotations, romanization, translation and literal meaning, plus
    the corresponding bold-offset fields, then returns it.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded)  # record categories on the sense
    # The Japanese text (with furigana) lives in a span of class "Jpan".
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, without_ruby = extract_ruby(wxr, jpan_span)
        example_data["text"] = clean_node(wxr, None, without_ruby)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(without_ruby)),
            example_data["text"],
            example_data,
            "bold_text_offsets",
        )
        example_data["ruby"] = ruby_data
    # Romanization is in a span of class "tr".
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data["roman"] = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
    # Third positional template argument: English translation.
    translation_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(3, "")),
        expand_all=True,
    )
    example_data["english"] = clean_node(wxr, None, translation_arg)
    calculate_bold_offsets(
        wxr,
        translation_arg,
        example_data["english"],
        example_data,
        "bold_english_offsets",
    )
    # Named argument "lit": literal meaning.
    literal_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get("lit", "")),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, literal_arg)
    calculate_bold_offsets(
        wxr,
        literal_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    clean_example_empty_data(example_data)
    return example_data

197 

198 

def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense_data: SenseData | None,
    parent_example: ExampleData,
) -> list[ExampleData]:
    # https://en.wiktionary.org/wiki/Template:zh-x
    """Extract Chinese usage examples from {{zh-x}}/{{zh-usex}}/{{zh-q}}.

    Returns one ExampleData per script variant (traditional and
    simplified).  Examples with a source line are rendered inside a
    <dl> tag and handled by ``extract_zh_x_dl_span_tag``; plain
    single-line examples are handled by the fallback branch below.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)  # record categories
    has_dl_tag = False
    results = []
    example_data = deepcopy(parent_example)
    # Second positional argument: English translation.
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data["english"] = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data["english"],
        example_data,
        "bold_english_offsets",
    )
    # Named argument "lit": literal meaning.
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, lit_arg)
    # BUG FIX: literal-meaning bold offsets were previously computed
    # from ``tr_arg`` (the translation) instead of ``lit_arg``; compare
    # the parallel code in extract_template_ja_usex.
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("From:"):
                example_data["ref"] = dd_text.removeprefix("From:")
            elif not dd_text.startswith("(literally,"):
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data["roman"] = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data["roman"],
                        example_data,
                        "bold_roman_offsets",
                    )
                    # Bracketed span texts like "[MSC]" are raw tags.
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data["raw_tags"].append(
                                span_text.strip("[]")
                            )
                    break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data["roman"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["roman"],
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data["raw_tags"].append(span_text.strip("[]"))
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = deepcopy(example_data)
                    new_example["text"] = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example["tags"].append(
                        "Traditional Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified Chinese"
                    )
                    clean_example_empty_data(new_example)
                    results.append(new_example)
    return results

308 

309 

def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: ExampleData
) -> list[ExampleData]:
    # process example text span tag and dialect span tag
    """Walk the <span> children of a zh-x <dl> (or nested vsHide span).

    Creates one ExampleData per Chinese text span, recurses into
    collapsed ("vsHide") spans, and attaches dialect names (small-font
    links) as raw tags to the most recently created example.
    """
    extracted: list[ExampleData] = []
    seen_hide = False
    for child_span in dl_tag.find_html("span"):
        lang_attr = child_span.attrs.get("lang", "")
        if lang_attr in ("zh-Hant", "zh-Hans"):
            item = deepcopy(example)
            item["text"] = clean_node(wxr, None, child_span)
            calculate_bold_offsets(
                wxr,
                child_span,
                item["text"],
                item,
                "bold_text_offsets",
            )
            extracted.append(item)
        elif "vsHide" in child_span.attrs.get("class", ""):
            # Template arg "collapsed=y" wraps hidden text in a vsHide
            # span; the first hidden block extends the example we just
            # created, later ones start from the inherited example.
            if seen_hide or not extracted:
                base = example
            else:
                base = extracted[-1]
            extracted.extend(
                extract_zh_x_dl_span_tag(wxr, child_span, base)
            )
            seen_hide = True
        elif "font-size:x-small" in child_span.attrs.get("style", ""):
            # Small-font wiki links name the dialect; keep as raw tags.
            for link in child_span.find_child_recursively(NodeKind.LINK):
                tag_text = clean_node(wxr, None, link)
                if len(tag_text) > 0:
                    target = extracted[-1] if len(extracted) > 0 else example
                    target["raw_tags"].append(tag_text)

    if dl_tag.tag == "dl":  # finalize only at the outermost (non-recursive) call
        for item in extracted:
            clean_example_empty_data(item)
    return extracted

354 

355 

# Maps abbreviated script labels emitted by {{zh-x}} (seen as raw tags)
# to the canonical tag names used elsewhere in the extractor.
ZH_X_TAGS = {
    "trad.": "Traditional Chinese",
    "simp.": "Simplified Chinese",
}

360 

361 

def clean_example_empty_data(data: ExampleData) -> None:
    # remove empty data and convert raw tags
    """Normalize an ExampleData in place.

    Promotes recognized raw tags to tags, sets ``type`` ("quote" when a
    citation is present, otherwise "example") and deletes empty fields.
    """
    remaining = []
    for raw_tag in data.get("raw_tags", []):
        canonical = ZH_X_TAGS.get(raw_tag)
        if canonical is not None:
            data["tags"].append(canonical)
        elif raw_tag in valid_tags:
            data["tags"].append(raw_tag)
        else:
            remaining.append(raw_tag)
    data["raw_tags"] = remaining
    # A non-empty citation marks the example as a quotation.
    data["type"] = "quote" if len(data.get("ref", "")) > 0 else "example"
    for field in list(data.keys()):
        if len(data[field]) == 0:
            del data[field]

381 

382 

def extract_ux_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Extract a {{ux}}-family usage example from its rendered HTML.

    Fills text/roman/english/literal_meaning (plus the matching
    bold-offset fields) from elements carrying the corresponding "e-*"
    CSS classes, and collects qualifier text as raw tags.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded)  # record categories on the sense
    # (CSS class, ExampleData field, bold-offset field); checked in
    # order — "e-transliteration" must precede "e-translation".
    dispatch = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "english", "bold_english_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        classes = html_node.attrs.get("class", "")
        for css_class, field, offset_field in dispatch:
            if css_class in classes:
                example_data[field] = clean_node(wxr, None, html_node)
                calculate_bold_offsets(
                    wxr,
                    html_node,
                    example_data[field],
                    example_data,
                    offset_field,
                )
                break
        else:
            if "qualifier-content" in classes:
                qualifier = clean_node(wxr, None, html_node)
                if qualifier != "":
                    example_data["raw_tags"].append(qualifier)

    clean_example_empty_data(example_data)
    return example_data