Coverage for src/wiktextract/extractor/en/example.py: 53%

199 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-06-23 09:14 +0000

1from copy import deepcopy 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...tags import valid_tags 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from ..share import calculate_bold_offsets 

10from .type_utils import ExampleData, SenseData 

11 

12 

def extract_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sense_data: SenseData,
    parent_data: ExampleData,
) -> list[ExampleData]:
    """Collect example entries from the templates directly under a list item.

    Each direct template child of ``list_item`` is dispatched on its name:
    Chinese ``zh-*`` templates, quotation templates (``quote``, ``quote-*``,
    ``RQ:*``) and usage-example/collocation templates. Other templates are
    ignored.

    Args:
        wxr: Extraction context.
        list_item: The list item node whose templates are inspected.
        sense_data: Forwarded unchanged to the per-template extractors.
        parent_data: Entry the new examples inherit from. ``tags`` and
            ``raw_tags`` are added to it in place when missing.

    Returns:
        The extracted example entries, in template order.
    """
    zh_names = ("zh-x", "zh-usex", "zh-q", "zh-co")
    collocation_names = ("collocation", "co", "coi")
    usage_names = (
        "ux",
        "usex",
        "uxi",
        "ko-usex",
        "koex",
        "ko-x",
        "th-usex",
        "th-x",
        "th-xi",
        "uxa",
        "collocation",
        "co",
        "coi",
        "ja-usex",
        "ja-x",
        "ja-ux",
    )

    # The extractors below append to these lists, so make sure they exist.
    parent_data.setdefault("tags", [])
    parent_data.setdefault("raw_tags", [])

    collected = []
    for t_node in list_item.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name

        if name in zh_names:
            collected.extend(
                extract_template_zh_x(wxr, t_node, sense_data, parent_data)
            )
            continue

        if name == "quote" or name.startswith(("quote-", "RQ:")):
            quote_data = extract_quote_templates(wxr, t_node, sense_data)
            if not list_item.contain_node(NodeKind.LIST):
                collected.append(quote_data)
                continue
            # The quotation has a nested list: the nested items produce the
            # examples and use the quotation entry as their parent data.
            for nested_item in list_item.find_child_recursively(
                NodeKind.LIST_ITEM
            ):
                quote_data.setdefault("tags", [])
                quote_data.setdefault("raw_tags", [])
                collected.extend(
                    extract_example_list_item(
                        wxr, nested_item, sense_data, quote_data
                    )
                )
            continue

        if name in usage_names:
            # Work on a copy so tags added here do not leak into siblings.
            inherited = deepcopy(parent_data)
            if name in collocation_names:
                inherited["tags"].append("collocation")
            collected.append(
                extract_ux_template(wxr, t_node, sense_data, inherited)
            )

    return collected

85 

86 

def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, sense_data: SenseData
) -> ExampleData:
    """Build an example entry from a quotation template.

    The template is serialised back to wikitext, parsed with
    ``expand_all=True``, and the resulting tree is scanned for ``<span>``
    and ``<i>`` elements whose ``class`` attribute identifies the part of
    the quotation they hold (source, quoted text, translation,
    transliteration).

    Args:
        wxr: Extraction context; the parser is reached through ``wxr.wtp``.
        node: The quotation template node.
        sense_data: Passed to ``clean_node`` together with the expanded tree.

    Returns:
        The example entry after ``clean_example_empty_data`` has processed it.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Return value is unused; presumably this call lets clean_node record
    # data from the expansion on sense_data -- TODO confirm in clean_node.
    clean_node(wxr, sense_data, expanded_node)
    example_data = ExampleData(
        text="", ref="", english="", roman="", type="quote"
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "cited-source" == span_class:
            example_data["ref"] = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
            if len(ruby_data) > 0:
                example_data["ruby"] = ruby_data
            example_data["text"] = clean_node(wxr, None, node_without_ruby)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["text"],
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in span_class:
            example_data["translation"] = clean_node(wxr, None, span_tag)
            example_data["english"] = example_data[
                "translation"
            ]  # DEPRECATED for "translation"
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["translation"],
                example_data,
                "bold_translation_offsets",
            )
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data["roman"] = clean_node(wxr, None, i_tag)
        # Offsets must come from the transliteration node itself, not from
        # whatever <span> the previous loop happened to visit last.
        calculate_bold_offsets(
            wxr,
            i_tag,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
        # Only the first transliteration element is used.
        break
    clean_example_empty_data(example_data)
    return example_data

141 

142 

def extract_template_ja_usex(
    wxr: WiktextractContext,
    node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Fill an example entry from a Japanese usage-example template.

    The example text and ruby come from ``<span class="Jpan">`` children of
    the expanded template, the romanisation from ``<span class="tr">``
    descendants, and the translation and literal meaning from template
    arguments ``3`` and ``lit``.

    Args:
        wxr: Extraction context.
        node: The template node.
        sense_data: Passed to ``clean_node`` together with the expanded tree.
        example_data: Entry to fill; it is modified in place.

    Returns:
        ``example_data`` after ``clean_example_empty_data`` has processed it.
    """
    # https://en.wiktionary.org/wiki/Template:ja-usex

    def expand_argument(key):
        # Parse one template argument (empty string when it is absent).
        raw_value = node.template_parameters.get(key, "")
        return wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(raw_value), expand_all=True
        )

    template_tree = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    clean_node(wxr, sense_data, template_tree)

    for jpan_span in template_tree.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby, ruby_free_node = extract_ruby(wxr, jpan_span)
        text = clean_node(wxr, None, ruby_free_node)
        example_data["text"] = text
        # Bold offsets are computed on a re-parse of the ruby-free node.
        reparsed = wxr.wtp.parse(wxr.wtp.node_to_wikitext(ruby_free_node))
        calculate_bold_offsets(
            wxr, reparsed, text, example_data, "bold_text_offsets"
        )
        example_data["ruby"] = ruby

    for tr_span in template_tree.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        roman = clean_node(wxr, None, tr_span)
        example_data["roman"] = roman
        calculate_bold_offsets(
            wxr, tr_span, roman, example_data, "bold_roman_offsets"
        )

    translation_tree = expand_argument(3)
    translation = clean_node(wxr, None, translation_tree)
    example_data["translation"] = translation
    example_data["english"] = translation  # DEPRECATED for "translation"
    calculate_bold_offsets(
        wxr,
        translation_tree,
        translation,
        example_data,
        "bold_translation_offsets",
    )

    literal_tree = expand_argument("lit")
    literal = clean_node(wxr, None, literal_tree)
    example_data["literal_meaning"] = literal
    calculate_bold_offsets(
        wxr, literal_tree, literal, example_data, "bold_literal_offsets"
    )

    clean_example_empty_data(example_data)
    return example_data

207 

208 

def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense_data: SenseData | None,
    parent_example: ExampleData,
) -> list[ExampleData]:
    """Extract example entries from a Chinese usage-example template.

    The translation and literal meaning are read from template arguments
    ``2`` and ``lit``. The expanded template is then handled in one of two
    ways: when it contains ``<dl>`` elements, each one is processed together
    with its ``<dd>`` children (source, romanisation, bracketed tags);
    otherwise the top-level ``<span>`` elements are read directly.

    Args:
        wxr: Extraction context.
        template_node: The template node.
        sense_data: Passed to ``clean_node`` together with the expanded tree.
        parent_example: Entry the results inherit from; it is deep-copied and
            never modified.

    Returns:
        One entry per ``zh-Hant``/``zh-Hans`` text span found.
    """
    # https://en.wiktionary.org/wiki/Template:zh-x
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)
    has_dl_tag = False
    results = []
    example_data = deepcopy(parent_example)
    # Code below appends to both lists; create them on the private copy so a
    # parent entry without these keys does not cause a KeyError.
    example_data.setdefault("tags", [])
    example_data.setdefault("raw_tags", [])

    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, tr_arg)
    example_data["english"] = example_data["translation"]
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, lit_arg)
    # The offsets must be computed from the literal-meaning argument itself,
    # not from the translation argument.
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )

    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("From:"):
                example_data["ref"] = dd_text.removeprefix("From:")
            elif not dd_text.startswith("(literally,"):
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data["roman"] = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data["roman"],
                        example_data,
                        "bold_roman_offsets",
                    )
                    # Bracketed spans such as "[Pinyin]" are kept as raw tags.
                    for tag_span in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, tag_span)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data["raw_tags"].append(
                                span_text.strip("[]")
                            )
                    # Only the first Latn span of this <dd> is used.
                    break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data["roman"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["roman"],
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data["raw_tags"].append(span_text.strip("[]"))
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = add_zh_hant_hans_spans(
                        wxr,
                        example_data,
                        example_text,
                        span_tag,
                        span_lang,
                    )
                    results.append(new_example)
    return results

311 

312 

def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: ExampleData
) -> list[ExampleData]:
    """Extract example entries from the ``<span>`` children of a zh-x block.

    Three kinds of span are handled: text spans (``lang`` is ``zh-Hant`` or
    ``zh-Hans``) produce a new entry; spans whose class contains ``vsHide``
    are processed recursively; spans styled ``font-size:x-small`` contribute
    the text of their links as raw tags.

    Args:
        wxr: Extraction context.
        dl_tag: The ``<dl>`` element, or a ``vsHide`` span on recursive calls.
        example: Entry that new entries are copied from. Raw tags found
            before any text span are appended to it in place.

    Returns:
        The entries found. They get a final ``clean_example_empty_data`` pass
        only when ``dl_tag`` is a real ``<dl>`` element.
    """
    # process example text span tag and dialect span tag
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            new_example = add_zh_hant_hans_spans(
                wxr,
                example,
                clean_node(wxr, None, span_tag),
                span_tag,
                span_lang,
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # The first hidden span inherits from the entry just produced,
            # later ones from the original entry.
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            for link_node in span_tag.find_child_recursively(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    target = results[-1] if len(results) > 0 else example
                    # Entries in ``results`` have already been through
                    # clean_example_empty_data, which deletes an empty
                    # "raw_tags" list, so the key may be missing here.
                    target.setdefault("raw_tags", []).append(raw_tag)

    if dl_tag.tag == "dl":
        # Convert the raw tags collected above; skipped on recursive calls.
        for data in results:
            clean_example_empty_data(data)
    return results

355 

356 

# Labels shown by the zh-x template mapped to the tag stored in the output.
ZH_X_TAGS = {
    "trad.": "Traditional-Chinese",
    "simp.": "Simplified-Chinese",
    "Taiwanese Mandarin": "Taiwanese-Mandarin",
    "MSC": "Standard-Chinese",
    "Literary Chinese": "Literary-Chinese",
    "Classical Chinese": "Classical-Chinese",
    "Guangzhou Cantonese": "Guangzhou-Cantonese",
}


def clean_example_empty_data(data: ExampleData) -> None:
    """Normalise an example entry in place.

    Raw tags found in ``ZH_X_TAGS`` or ``valid_tags`` are moved to ``tags``
    (translated through ``ZH_X_TAGS`` in the first case); the rest stay in
    ``raw_tags``. ``type`` becomes ``"quotation"`` when ``ref`` is non-empty
    and ``"example"`` otherwise. Finally every field whose value has length
    zero is removed.

    Args:
        data: The entry to normalise. A missing ``tags`` list is created on
            demand, so entries that only carry ``raw_tags`` are accepted.
    """
    # remove empty data and convert raw tags
    remaining_raw_tags = []
    for raw_tag in data.get("raw_tags", []):
        if raw_tag in ZH_X_TAGS:
            data.setdefault("tags", []).append(ZH_X_TAGS[raw_tag])
        elif raw_tag in valid_tags:
            data.setdefault("tags", []).append(raw_tag)
        else:
            remaining_raw_tags.append(raw_tag)
    data["raw_tags"] = remaining_raw_tags

    data["type"] = "quotation" if len(data.get("ref", "")) > 0 else "example"

    # Collect the keys first: a dict must not change size while iterated.
    empty_keys = [key for key, value in data.items() if len(value) == 0]
    for key in empty_keys:
        del data[key]

387 

388 

def extract_ux_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Fill an example entry from a usage-example template.

    The template is expanded and every HTML node in the result is classified
    by its ``class`` attribute: example text, transliteration, translation,
    literal meaning or qualifier. The first matching category wins, in that
    order.

    Args:
        wxr: Extraction context.
        t_node: The template node.
        sense_data: Passed to ``clean_node`` together with the expanded tree.
        example_data: Entry to fill; it is modified in place and must already
            contain a ``raw_tags`` list.

    Returns:
        ``example_data`` after ``clean_example_empty_data`` has processed it.
    """
    # (class marker, field to fill, field receiving the bold offsets),
    # checked in this order after the "e-example" case.
    plain_fields = (
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )

    template_tree = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, sense_data, template_tree)

    for element in template_tree.find_child_recursively(NodeKind.HTML):
        css_classes = element.attrs.get("class", "")
        if len(css_classes) == 0:
            continue

        if "e-example" in css_classes:
            # extract ruby in Japanese template
            if t_node.template_name in ("ja-usex", "ja-x", "ja-ux"):
                ruby, ruby_free_node = extract_ruby(wxr, element)
                text = clean_node(wxr, None, ruby_free_node)
                example_data["text"] = text
                offsets_source = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(ruby_free_node)
                )
                calculate_bold_offsets(
                    wxr, offsets_source, text, example_data, "bold_text_offsets"
                )
                example_data["ruby"] = ruby
            else:
                text = clean_node(wxr, None, element)
                example_data["text"] = text
                calculate_bold_offsets(
                    wxr, element, text, example_data, "bold_text_offsets"
                )
            continue

        for marker, field, offsets_field in plain_fields:
            if marker not in css_classes:
                continue
            cleaned = clean_node(wxr, None, element)
            example_data[field] = cleaned
            if field == "translation":
                example_data["english"] = cleaned  # DEPRECATED for "translation"
            calculate_bold_offsets(
                wxr, element, cleaned, example_data, offsets_field
            )
            break
        else:
            # None of the plain fields matched; last category is a qualifier.
            if "qualifier-content" in css_classes:
                qualifier = clean_node(wxr, None, element)
                if qualifier != "":
                    example_data["raw_tags"].append(qualifier)

    clean_example_empty_data(example_data)
    return example_data

462 

463 

def add_zh_hant_hans_spans(
    wxr: WiktextractContext,
    example_data: ExampleData,
    example_text: str,
    span_tag: HTMLNode,
    span_lang: str,
) -> ExampleData:
    """Create the entry for one Chinese text span.

    Args:
        wxr: Extraction context.
        example_data: Entry to copy from; it is not modified.
        example_text: Cleaned text of the span.
        span_tag: The span node, used to compute the bold offsets.
        span_lang: ``lang`` attribute of the span. ``"zh-Hant"`` yields the
            ``Traditional-Chinese`` tag, any other value
            ``Simplified-Chinese``.

    Returns:
        A deep copy of ``example_data`` with text, bold offsets and script
        tag set, after ``clean_example_empty_data`` has processed it.
    """
    new_example = deepcopy(example_data)
    new_example["text"] = example_text
    calculate_bold_offsets(
        wxr,
        span_tag,
        example_text,
        new_example,
        "bold_text_offsets",
    )
    # The source entry may have no "tags" list (for example after its empty
    # fields were removed), so create it on demand instead of raising.
    new_example.setdefault("tags", []).append(
        "Traditional-Chinese"
        if span_lang == "zh-Hant"
        else "Simplified-Chinese"
    )
    clean_example_empty_data(new_example)
    return new_example