Coverage for src/wiktextract/extractor/en/example.py: 59%

189 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1from copy import deepcopy 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...tags import valid_tags 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from ..share import calculate_bold_offsets 

10from .type_utils import ExampleData, SenseData 

11 

12 

def extract_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sense_data: SenseData,
    parent_data: ExampleData,
) -> list[ExampleData]:
    """Dispatch each example template found under a sense list item.

    Inspects the template children of ``list_item`` and routes them to the
    matching template extractor; returns all extracted examples.
    """
    collected: list[ExampleData] = []
    for t_node in list_item.find_child(NodeKind.TEMPLATE):
        t_name = t_node.template_name
        if t_name in ("zh-x", "zh-usex", "zh-q", "zh-co"):
            collected.extend(
                extract_template_zh_x(wxr, t_node, sense_data, parent_data)
            )
        elif t_name in ("ja-usex", "ja-x", "ja-ux"):
            collected.append(
                extract_template_ja_usex(wxr, t_node, sense_data, parent_data)
            )
        elif t_name == "quote" or t_name.startswith(("quote-", "RQ:")):
            q_example = extract_quote_templates(wxr, t_node, sense_data)
            if not list_item.contain_node(NodeKind.LIST):
                collected.append(q_example)
            else:
                # Nested list items carry the quotation's extra lines
                # (translation etc.); recurse with the quote as parent data.
                for child_item in list_item.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    for key in ("tags", "raw_tags"):
                        q_example.setdefault(key, [])
                    collected.extend(
                        extract_example_list_item(
                            wxr, child_item, sense_data, q_example
                        )
                    )
        elif t_name in (
            "ux",
            "usex",
            "uxi",
            "ko-usex",
            "koex",
            "ko-x",
            "th-usex",
            "th-x",
            "th-xi",
            "uxa",
            "collocation",
            "co",
            "coi",
        ):
            data_copy = deepcopy(parent_data)
            if t_name in ("collocation", "co", "coi"):
                data_copy["tags"].append("collocation")
            collected.append(
                extract_ux_template(wxr, t_node, sense_data, data_copy)
            )

    return collected

86 

87 

def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, sense_data: SenseData
) -> ExampleData:
    """Extract a quotation from a "quote-*", "RQ:*" or "quote" template.

    The template is expanded and the resulting HTML is scanned by CSS
    class: "cited-source" → ref, "e-quotation" → text (ruby stripped),
    "e-translation" → translation, "e-transliteration" → roman.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)  # collect category links
    example_data = ExampleData(
        text="", ref="", english="", roman="", type="quote"
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "cited-source" == span_class:
            example_data["ref"] = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
            if len(ruby_data) > 0:
                example_data["ruby"] = ruby_data
            example_data["text"] = clean_node(wxr, None, node_without_ruby)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["text"],
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in span_class:
            example_data["translation"] = clean_node(
                wxr, None, span_tag
            )
            example_data["english"] = example_data[
                "translation"
            ]  # DEPRECATED for "translation"
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["translation"],
                example_data,
                "bold_translation_offsets",
            )
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data["roman"] = clean_node(wxr, None, i_tag)
        calculate_bold_offsets(
            wxr,
            # BUG FIX: was `span_tag`, the leftover variable of the previous
            # loop — unbound when no span existed, and otherwise the wrong
            # node for computing the transliteration's bold offsets.
            i_tag,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
        break
    clean_example_empty_data(example_data)
    return example_data

142 

143 

def extract_template_ja_usex(
    wxr: WiktextractContext,
    node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    # https://en.wiktionary.org/wiki/Template:ja-usex
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded)

    # Japanese example text: strip ruby annotations, keep them separately.
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, no_ruby = extract_ruby(wxr, jpan_span)
        example_data["text"] = clean_node(wxr, None, no_ruby)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby)),
            example_data["text"],
            example_data,
            "bold_text_offsets",
        )
        example_data["ruby"] = ruby_data

    # Romanization span.
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data["roman"] = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )

    # Third positional template argument holds the translation.
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(3, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, tr_arg)
    example_data["english"] = example_data[
        "translation"
    ]  # DEPRECATED for "translation"
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )

    # Optional literal translation from the "lit" argument.
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get("lit", "")),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )

    clean_example_empty_data(example_data)
    return example_data

208 

209 

def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense_data: SenseData | None,
    parent_example: ExampleData,
) -> list[ExampleData]:
    """Extract examples from a "zh-x" family template.

    https://en.wiktionary.org/wiki/Template:zh-x

    Returns a list because a single template usually expands to both a
    Traditional-Chinese and a Simplified-Chinese example.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)  # collect category links
    has_dl_tag = False
    results = []
    example_data = deepcopy(parent_example)
    # Second positional argument holds the translation.
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, tr_arg)
    example_data["english"] = example_data["translation"]
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )
    # Optional literal translation from the "lit" argument.
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        # BUG FIX: was `tr_arg` — literal-meaning bold offsets were computed
        # against the translation parse tree instead of the "lit" argument
        # (cf. the parallel code in extract_template_ja_usex).
        lit_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    # A <dl> tag is emitted when the template has a source reference;
    # its <dd> children carry the source line, romanization and tags.
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("From:"):
                example_data["ref"] = dd_text.removeprefix("From:")
            elif not dd_text.startswith("(literally,"):
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data["roman"] = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data["roman"],
                        example_data,
                        "bold_roman_offsets",
                    )
                    # Bracketed spans hold dialect/register labels.
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data["raw_tags"].append(
                                span_text.strip("[]")
                            )
                    break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data["roman"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["roman"],
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data["raw_tags"].append(span_text.strip("[]"))
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = deepcopy(example_data)
                    new_example["text"] = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example["tags"].append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    clean_example_empty_data(new_example)
                    results.append(new_example)
    return results

320 

321 

def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: ExampleData
) -> list[ExampleData]:
    # process example text span tag and dialect span tag
    results = []
    is_first_hide = True
    for child_span in dl_tag.find_html("span"):
        lang_attr = child_span.attrs.get("lang", "")
        if lang_attr in ("zh-Hant", "zh-Hans"):
            new_data = deepcopy(example)
            new_data["text"] = clean_node(wxr, None, child_span)
            calculate_bold_offsets(
                wxr,
                child_span,
                new_data["text"],
                new_data,
                "bold_text_offsets",
            )
            results.append(new_data)
        elif "vsHide" in child_span.attrs.get("class", ""):
            # template has arg "collapsed=y"; the first hidden section
            # continues the most recently extracted example if one exists.
            if is_first_hide and len(results) > 0:
                base_data = results[-1]
            else:
                base_data = example
            results.extend(
                extract_zh_x_dl_span_tag(wxr, child_span, base_data)
            )
            is_first_hide = False
        elif "font-size:x-small" in child_span.attrs.get("style", ""):
            # small-print links are dialect labels; attach them to the
            # latest example, or to the parent data if none exists yet
            for link_node in child_span.find_child_recursively(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    target = results[-1] if len(results) > 0 else example
                    target["raw_tags"].append(raw_tag)

    # only finalize at the outermost (dl) level, not on nested span recursion
    if dl_tag.tag == "dl":
        for data in results:
            clean_example_empty_data(data)
    return results

366 

367 

# Script labels emitted by the {{zh-x}} template family, mapped to the
# canonical tag names used in extracted data.
ZH_X_TAGS = {
    "trad.": "Traditional-Chinese",
    "simp.": "Simplified-Chinese",
}

372 

373 

def clean_example_empty_data(data: ExampleData) -> None:
    """Normalize an example in place.

    Converts recognized raw tags into tags, sets the example "type"
    ("quote" when a reference is present, else "example"), and removes
    all empty fields.
    """
    leftover_raw_tags = []
    for raw_tag in data.get("raw_tags", []):
        if raw_tag in ZH_X_TAGS:
            data["tags"].append(ZH_X_TAGS[raw_tag])
        elif raw_tag in valid_tags:
            data["tags"].append(raw_tag)
        else:
            leftover_raw_tags.append(raw_tag)
    data["raw_tags"] = leftover_raw_tags
    data["type"] = "quote" if len(data.get("ref", "")) > 0 else "example"
    # drop every empty field (empty strings, lists, etc.)
    for key in [k for k, v in data.items() if len(v) == 0]:
        del data[key]

393 

394 

# Ordered (class marker, target field, bold-offset field) dispatch for the
# HTML classes emitted by the "ux" template family; order mirrors the
# original first-match precedence.
_UX_CLASS_FIELDS = (
    ("e-example", "text", "bold_text_offsets"),
    ("e-transliteration", "roman", "bold_roman_offsets"),
    ("e-translation", "translation", "bold_translation_offsets"),
    ("e-literally", "literal_meaning", "bold_literal_offsets"),
)


def extract_ux_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Fill ``example_data`` from an expanded "ux"/"co" family template.

    HTML nodes are matched by class name: example text, transliteration,
    translation, literal meaning, and qualifier labels.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded)
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        class_attr = html_node.attrs.get("class", "")
        matched = None
        for marker, field, offset_field in _UX_CLASS_FIELDS:
            if marker in class_attr:
                matched = (field, offset_field)
                break
        if matched is not None:
            field, offset_field = matched
            value = clean_node(wxr, None, html_node)
            example_data[field] = value
            if field == "translation":
                example_data["english"] = value  # DEPRECATED for "translation"
            calculate_bold_offsets(
                wxr,
                html_node,
                value,
                example_data,
                offset_field,
            )
        elif "qualifier-content" in class_attr:
            raw_tag = clean_node(wxr, None, html_node)
            if raw_tag != "":
                example_data["raw_tags"].append(raw_tag)

    clean_example_empty_data(example_data)
    return example_data