Coverage for src / wiktextract / extractor / en / example.py: 59%

189 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-11 03:38 +0000

1from copy import deepcopy 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...tags import valid_tags 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from ..share import calculate_bold_offsets 

10from .type_utils import ExampleData, SenseData 

11 

12 

def extract_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sense_data: SenseData,
    parent_data: ExampleData,
) -> list[ExampleData]:
    """Extract example sentences/quotations from one gloss list item.

    Dispatches on the name of each template that is a direct child of
    *list_item*: Chinese ("zh-x" family), Japanese ("ja-usex" family),
    quotation templates ("quote-*", "RQ:*", "quote") and generic usage
    example templates ("ux" family).  Quotation templates may carry the
    quoted passage in a nested list, which is processed recursively with
    the quote's data as the parent.

    Returns a list of ``ExampleData`` dicts; *parent_data* supplies
    inherited fields such as tags.
    """
    examples = []
    for template_node in list_item.find_child(NodeKind.TEMPLATE):
        if template_node.template_name in ("zh-x", "zh-usex", "zh-q", "zh-co"):
            examples.extend(
                extract_template_zh_x(
                    wxr,
                    template_node,
                    sense_data,
                    parent_data,
                )
            )
        elif template_node.template_name in ("ja-usex", "ja-x", "ja-ux"):
            examples.append(
                extract_template_ja_usex(
                    wxr,
                    template_node,
                    sense_data,
                    parent_data,
                )
            )
        elif (
            template_node.template_name.startswith(("quote-", "RQ:"))
            or template_node.template_name == "quote"
        ):
            q_example = extract_quote_templates(wxr, template_node, sense_data)
            if list_item.contain_node(NodeKind.LIST):
                # The quoted passage lives in a child list; seed the tag
                # fields once (hoisted out of the loop — it is invariant)
                # so nested items can safely extend them.
                for key in ("tags", "raw_tags"):
                    if key not in q_example:
                        q_example[key] = []
                for next_list_item in list_item.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    examples.extend(
                        extract_example_list_item(
                            wxr, next_list_item, sense_data, q_example
                        )
                    )
            else:
                examples.append(q_example)
        elif template_node.template_name in [
            # Fix: the original list contained "uxa" twice.
            "ux",
            "usex",
            "uxi",
            "uxa",
            "ko-usex",
            "koex",
            "ko-x",
            "th-usex",
            "th-x",
            "th-xi",
            "collocation",
            "co",
            "coi",
        ]:
            copy_of_parent_data = deepcopy(parent_data)
            if template_node.template_name in ("collocation", "co", "coi"):
                copy_of_parent_data["tags"].append("collocation")
            examples.append(
                extract_ux_template(
                    wxr,
                    template_node,
                    sense_data,
                    copy_of_parent_data,
                )
            )

    return examples

87 

88 

def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, sense_data: SenseData
) -> ExampleData:
    """Extract one quotation from a "quote-*"/"RQ:*"/"quote" template.

    The template is expanded to HTML and the relevant elements are
    selected by CSS class: "cited-source" (reference), "e-quotation"
    (quoted text, possibly containing ruby), "e-translation", and an
    ``<i class="e-transliteration">`` for the romanization.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Collect any categories added by the expansion into the sense.
    clean_node(wxr, sense_data, expanded_node)
    example_data = ExampleData(
        text="", ref="", english="", roman="", type="quote"
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "cited-source" == span_class:
            example_data["ref"] = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
            if len(ruby_data) > 0:
                example_data["ruby"] = ruby_data
            example_data["text"] = clean_node(wxr, None, node_without_ruby)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["text"],
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in span_class:
            example_data["translation"] = clean_node(
                wxr, None, span_tag
            )  # DEPRECATED for "translation"
            example_data["english"] = example_data[
                "translation"
            ]  # DEPRECATED for "translation"
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["translation"],
                example_data,
                "bold_translation_offsets",
            )
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data["roman"] = clean_node(wxr, None, i_tag)
        # Fix: offsets must be computed from i_tag, the node this text
        # came from.  The original passed span_tag, a leftover from the
        # loop above (and a NameError when that loop never ran).
        calculate_bold_offsets(
            wxr,
            i_tag,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
        break  # only the first transliteration is used
    clean_example_empty_data(example_data)
    return example_data

143 

144 

def extract_template_ja_usex(
    wxr: WiktextractContext,
    node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Extract a Japanese usage example from a ja-usex/ja-x/ja-ux template.

    https://en.wiktionary.org/wiki/Template:ja-usex
    Fills *example_data* in place and returns it.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded)
    # The Japanese text sits in a span with class="Jpan"; peel off the
    # ruby annotations before cleaning the text.
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, no_ruby = extract_ruby(wxr, jpan_span)
        example_data["text"] = clean_node(wxr, None, no_ruby)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby)),
            example_data["text"],
            example_data,
            "bold_text_offsets",
        )
        example_data["ruby"] = ruby_data
    # Romanization is rendered in a span with class="tr".
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data["roman"] = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
    # Third positional template parameter: English translation.
    translation_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(3, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, translation_root)
    example_data["english"] = example_data[
        "translation"
    ]  # DEPRECATED for "translation"
    calculate_bold_offsets(
        wxr,
        translation_root,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )
    # "lit=" template parameter: literal meaning.
    literal_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get("lit", "")),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, literal_root)
    calculate_bold_offsets(
        wxr,
        literal_root,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    clean_example_empty_data(example_data)
    return example_data

209 

210 

def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense_data: SenseData | None,
    parent_example: ExampleData,
) -> list[ExampleData]:
    """Extract examples from a Chinese example template (zh-x family).

    https://en.wiktionary.org/wiki/Template:zh-x

    One template usually yields two examples (Traditional- and
    Simplified-Chinese script).  Sourced quotes render a ``<dl>`` tag
    carrying the reference, romanization and dialect tags; plain
    examples render everything on a single line.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)
    has_dl_tag = False
    results = []
    example_data = deepcopy(parent_example)
    # Second positional parameter: English translation.
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, tr_arg)
    example_data["english"] = example_data["translation"]
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )
    # "lit=" parameter: literal meaning.
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, lit_arg)
    # Fix: bold offsets for the literal meaning must be computed from
    # lit_arg; the original mistakenly reused tr_arg here (compare the
    # ja-usex extractor, which uses its lit argument).
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("From:"):
                example_data["ref"] = dd_text.removeprefix("From:")
            elif not dd_text.startswith("(literally,"):
                # Latin-script span inside <dd> is the romanization.
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data["roman"] = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data["roman"],
                        example_data,
                        "bold_roman_offsets",
                    )
                # Bracketed span text like "[MSC]" marks dialect tags.
                for span_tag in dd_tag.find_html_recursively("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text.startswith("[") and span_text.endswith(
                        "]"
                    ):
                        example_data["raw_tags"].append(
                            span_text.strip("[]")
                        )
                        break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data["roman"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["roman"],
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data["raw_tags"].append(span_text.strip("[]"))
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    # One ExampleData per script form.
                    new_example = deepcopy(example_data)
                    new_example["text"] = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example["tags"].append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    clean_example_empty_data(new_example)
                    results.append(new_example)
    return results

321 

322 

def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: ExampleData
) -> list[ExampleData]:
    # process example text span tag and dialect span tag
    #
    # Walks the direct <span> children of *dl_tag*.  A span with
    # lang="zh-Hant"/"zh-Hans" becomes a new ExampleData copied from
    # *example*; a "vsHide" span (rendered when the template has
    # "collapsed=y") is recursed into; a "font-size:x-small" span holds
    # dialect links that become raw tags.  Returns the collected examples.
    results = []
    # Tracks whether the next "vsHide" span is the first one seen; the
    # first hidden block extends the most recent example, later ones
    # fall back to *example*.
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            new_example = deepcopy(example)
            new_example["text"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example["text"],
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # Recurse into the hidden block; base the nested examples on
            # the last collected example when this is the first hidden
            # block and there is one, otherwise on *example*.
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # Small-print span: dialect/variety links become raw tags on
            # the most recent example (or on *example* if none yet).
            for link_node in span_tag.find_child_recursively(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1]["raw_tags"].append(raw_tag)
                    else:
                        example["raw_tags"].append(raw_tag)

    # Only normalize at the top of the recursion (the actual <dl> tag),
    # not inside nested "vsHide" spans.
    if dl_tag.tag == "dl":
        for data in results:
            clean_example_empty_data(data)
    return results

367 

368 

# Maps raw tag strings emitted by the zh-x template family to the
# canonical tag names used by this extractor (see clean_example_empty_data).
ZH_X_TAGS = {
    "trad.": "Traditional-Chinese",
    "simp.": "Simplified-Chinese",
    "Taiwanese Mandarin": "Taiwanese-Mandarin",
    "MSC": "Standard-Chinese",
    "Literary Chinese": "Literary-Chinese",
    "Classical Chinese": "Classical-Chinese",
    "Guangzhou Cantonese": "Guangzhou-Cantonese",
}

378 

379 

def clean_example_empty_data(data: ExampleData) -> None:
    """Normalize *data* in place: convert known raw tags to canonical
    tags, set the example "type", and drop empty fields."""
    leftovers = []
    for raw_tag in data.get("raw_tags", []):
        if raw_tag in ZH_X_TAGS:
            # zh-x specific alias -> canonical tag name
            data["tags"].append(ZH_X_TAGS[raw_tag])
        elif raw_tag in valid_tags:
            data["tags"].append(raw_tag)
        else:
            leftovers.append(raw_tag)
    data["raw_tags"] = leftovers
    # An example with a source reference is a quotation.
    data["type"] = "quote" if len(data.get("ref", "")) > 0 else "example"
    # Remove keys whose values are empty (snapshot the keys first so the
    # dict is not mutated while being iterated).
    for key in [k for k, v in data.items() if len(v) == 0]:
        del data[key]

399 

400 

def extract_ux_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Extract a usage example from a ux-family template.

    Expands the template and reads its HTML output by CSS class
    ("e-example", "e-transliteration", "e-translation", "e-literally",
    "qualifier-content").  Fills *example_data* in place and returns it.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded)

    def grab(field: str, offset_field: str, source_node) -> None:
        # Clean source_node into example_data[field] and record the
        # bold offsets for that field.
        example_data[field] = clean_node(wxr, None, source_node)
        calculate_bold_offsets(
            wxr,
            source_node,
            example_data[field],
            example_data,
            offset_field,
        )

    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        css = html_node.attrs.get("class", "")
        if "e-example" in css:
            grab("text", "bold_text_offsets", html_node)
        elif "e-transliteration" in css:
            grab("roman", "bold_roman_offsets", html_node)
        elif "e-translation" in css:
            grab("translation", "bold_translation_offsets", html_node)
            example_data["english"] = example_data[
                "translation"
            ]  # DEPRECATED for "translation"
        elif "e-literally" in css:
            grab("literal_meaning", "bold_literal_offsets", html_node)
        elif "qualifier-content" in css:
            raw_tag = clean_node(wxr, None, html_node)
            if raw_tag != "":
                example_data["raw_tags"].append(raw_tag)

    clean_example_empty_data(example_data)
    return example_data