Coverage for src / wiktextract / extractor / zh / example.py: 97%

213 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from ..share import calculate_bold_offsets 

7from .linkage import process_linkage_templates_in_gloss 

8from .models import Example, Form, Sense, WordEntry 

9from .tags import translate_raw_tags 

10 

# Lower-cased linkage template names (as they appear inside gloss lists)
# mapped to the linkage field they populate on the word entry.
LINKAGE_TEMPLATES = {
    name: field
    for field, names in (
        ("synonyms", ("syn", "synonyms")),
        ("antonyms", ("ant", "antonyms", "antonym")),
        ("hypernyms", ("hyper", "hypernyms")),
        ("hyponyms", ("hypo", "hyponyms")),
        ("coordinate_terms", ("cot", "coo", "coord", "coordinate terms")),
    )
    for name in names
}

26 

27 

def extract_example_list_item(
    wxr: WiktextractContext,
    sense_data: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    parent_example: Example | None = None,
) -> None:
    """Extract one example list item that appears under a gloss.

    Dispatches on the list item's content:

    * a nested list with plain text → ``extract_plain_text_example_list``;
    * known example/quotation templates → the matching ``extract_template_*``
      helper (each template is also passed through ``clean_node`` with
      ``sense_data`` so its category links are recorded);
    * linkage templates (``syn``, ``ant``, …) → linkage data attached to
      ``word_entry`` rather than to the sense;
    * inline alternative-form templates → forms on ``word_entry``.

    Nested list items are handled by recursing with the current
    ``example_data`` as ``parent_example``; recursive calls mutate that
    shared object, and only the top-level call (``parent_example is None``)
    appends a new ``Example`` to ``sense_data.examples``.
    """
    # Reuse the parent's Example when recursing so nested items fill in
    # fields (ref/text/…) of the same example.
    example_data = parent_example or Example(text="")
    if list_item.contain_node(NodeKind.LIST) and not all(
        isinstance(n, TemplateNode)
        for n in list_item.invert_find_child(NodeKind.LIST)
    ):
        # plain text in the nested list, not using any template
        # https://zh.wiktionary.org/wiki/%, the second example
        extract_plain_text_example_list(
            wxr, sense_data, list_item, word_entry, example_data
        )
    elif list_item.contain_node(NodeKind.TEMPLATE):
        # parse example templates
        for child in list_item.find_child(NodeKind.TEMPLATE):
            template_name = child.template_name
            if (
                template_name.startswith(("quote-", "RQ:"))
                or template_name == "quote"
            ):
                extract_quote_templates(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["ja-x", "ja-usex"]:
                extract_template_ja_usex(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["zh-x", "zh-usex", "zh-q", "zh-co"]:
                # zh-x may yield several Examples (Traditional/Simplified),
                # so they are appended here instead of at the end.
                sense_data.examples.extend(
                    extract_template_zh_x(wxr, child, example_data)
                )
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in [
                "ux",
                "eg",
                "usex",
                "uxi",
                "collocation",
                "co",
                "coi",
                "ko-usex",
                "ko-x",
                "koex",
                "th-usex",
                "th-x",
                "th-xi",
                "uxa",
            ]:
                extract_template_ux(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name == "Q":
                extract_template_Q(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name.lower() in LINKAGE_TEMPLATES:
                # Linkage data belongs to the word entry, tagged with the
                # current sense's gloss text.
                process_linkage_templates_in_gloss(
                    wxr,
                    word_entry,
                    child,
                    LINKAGE_TEMPLATES[template_name.lower()],
                    " ".join(sense_data.glosses),
                )
            elif template_name.lower() in ["inline alt forms", "alti"]:
                extract_inline_alt_forms_template(wxr, word_entry, child)

        # Recurse into any nested list items, sharing this example.
        for next_list_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, sense_data, next_list_item, word_entry, example_data
            )
    elif not list_item.contain_node(NodeKind.LIST):
        # No templates and no nested list: the item itself is the text.
        example_data.text = clean_node(wxr, None, list_item.children)

    # Only the outermost call appends; recursive calls mutated the shared
    # parent_example in place.
    if len(example_data.text) > 0 and parent_example is None:
        sense_data.examples.append(example_data)

106 

107 

def extract_plain_text_example_list(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    example_data: Example,
) -> None:
    """Handle an example whose reference line precedes a nested list.

    Everything before the nested list becomes ``example_data.ref``; each
    item of the nested list is then processed as a child of the same
    example via ``extract_example_list_item``.
    """
    for child_index, sub_list in list_item.find_child(
        NodeKind.LIST, with_index=True
    ):
        # The text preceding the nested list is the citation/reference.
        example_data.ref = clean_node(
            wxr, None, list_item.children[:child_index]
        )
        for sub_list_item in sub_list.find_child(NodeKind.LIST_ITEM):
            extract_example_list_item(
                wxr, sense, sub_list_item, word_entry, example_data
            )

123 

124 

def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Process `quote-*` and "RQ:*" templates.

    Scans the expanded HTML for the cited source, the quotation text
    (ruby annotations stripped), the translation, and the first
    transliteration.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for span_node in expanded_node.find_html_recursively("span"):
        css_class = span_node.attrs.get("class", "")
        if css_class == "cited-source":
            example_data.ref = clean_node(wxr, None, span_node)
            continue
        if "e-quotation" in css_class:
            # Strip ruby before cleaning the text; bold offsets are still
            # computed against the full span.
            example_data.ruby, no_ruby = extract_ruby(wxr, span_node)
            example_data.text = clean_node(wxr, None, no_ruby)
            calculate_bold_offsets(
                wxr,
                span_node,
                example_data.text,
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in css_class:
            example_data.translation = clean_node(wxr, None, span_node)
            calculate_bold_offsets(
                wxr,
                span_node,
                example_data.translation,
                example_data,
                "bold_translation_offsets",
            )
    for tr_node in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data.roman = clean_node(wxr, None, tr_node)
        calculate_bold_offsets(
            wxr,
            tr_node,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
        break  # only the first transliteration is kept

169 

170 

def extract_template_ja_usex(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Process Japanese usage example templates ("ja-x", "ja-usex")."""
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    # Japanese text, with ruby annotations split out.
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, stripped = extract_ruby(wxr, jpan_span)
        example_data.text = clean_node(wxr, None, stripped)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(stripped)),
            example_data.text,
            example_data,
            "bold_text_offsets",
        )
        example_data.ruby = ruby_data
    # Romanization.
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
    # Translation comes from the third positional template argument.
    translation_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(3, "")),
        expand_all=True,
    )
    example_data.translation = clean_node(wxr, None, translation_arg)
    calculate_bold_offsets(
        wxr,
        translation_arg,
        example_data.translation,
        example_data,
        "bold_translation_offsets",
    )
    # Optional literal meaning from the "lit" argument.
    literal_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get("lit", "")),
        expand_all=True,
    )
    example_data.literal_meaning = clean_node(wxr, None, literal_arg)
    calculate_bold_offsets(
        wxr,
        literal_arg,
        example_data.literal_meaning,
        example_data,
        "bold_literal_offsets",
    )

225 

226 

def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    parent_example: Example,
) -> list[Example]:
    """Process Chinese example templates ("zh-x", "zh-usex", "zh-q", "zh-co").

    Returns a list of new ``Example`` objects (one per script form, e.g.
    Traditional and Simplified), each a deep copy of ``parent_example``
    enriched with translation, literal meaning, romanization, reference
    and dialect tags.  Examples with a source are rendered inside a
    ``<dl>`` tag; single-line examples without a source are handled by
    the ``not has_dl_tag`` branch.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    results = []
    # Copy so the shared parent example is not mutated.
    example_data = parent_example.model_copy(deep=True)
    # Second positional argument is the translation.
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data.translation = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data.translation,
        example_data,
        "bold_translation_offsets",
    )
    # Optional "lit" argument holds the literal meaning.
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data.literal_meaning = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data.literal_meaning,
        example_data,
        "bold_literal_offsets",
    )
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("出自:"):
                # "出自:" prefix marks the quotation source line.
                example_data.ref = dd_text.removeprefix("出自:")
            elif not dd_text.startswith("(字面義為"):
                # Skip the literal-meaning line; otherwise take the first
                # Latin-script span as romanization.
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data.roman = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                    # Bracketed spans like "[Pinyin]" become raw tags.
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data.raw_tags.append(span_text.strip("[]"))
                    break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data.roman = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.roman,
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data.raw_tags.append(span_text.strip("[]"))
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                # One Example per script form, deep-copied from the shared
                # data gathered above.
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = example_data.model_copy(deep=True)
                    new_example.text = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example.tags.append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    translate_raw_tags(new_example)
                    results.append(new_example)
    return results

331 

332 

def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
) -> list[Example]:
    # process example text span tag and dialect span tag
    #
    # Walks the direct <span> children of `dl_tag` (which may itself be a
    # nested "vsHide" span when recursing) and builds one Example per
    # Chinese-script span, attaching dialect raw tags to the most recently
    # created example.  Recurses into collapsed ("vsHide") spans; the first
    # such span continues the last example instead of starting from the
    # shared base `example`.
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            # One example per script form, copied from the shared base.
            new_example = example.model_copy(deep=True)
            new_example.text = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example.text,
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # Small-print links hold dialect names; attach them to the
            # latest example (or to the base if none was created yet).
            for link_node in span_tag.find_child(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1].raw_tags.append(raw_tag)
                    else:
                        example.raw_tags.append(raw_tag)

    # Only translate tags at the outermost (real <dl>) level, after all
    # recursion has finished collecting raw tags.
    if dl_tag.tag == "dl":
        for data in results:
            translate_raw_tags(data)
    return results

377 

378 

def extract_template_ux(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Process generic usage example templates ("ux", "co", "ko-x", …).

    https://zh.wiktionary.org/wiki/Template:ux
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    # (css class marker, Example field, bold offsets field) — checked in
    # order; the first matching marker wins for each HTML node.
    field_specs = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        class_names = html_node.attrs.get("class", "")
        for marker, field, offsets_field in field_specs:
            if marker in class_names:
                value = clean_node(wxr, None, html_node)
                setattr(example_data, field, value)
                calculate_bold_offsets(
                    wxr, html_node, value, example_data, offsets_field
                )
                break
        else:
            if "qualifier-content" in class_names:
                # Qualifiers are "、"-separated raw tags.
                example_data.raw_tags.extend(
                    clean_node(wxr, None, html_node).split("、")
                )
                translate_raw_tags(example_data)

429 

430 

def extract_template_Q(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:Q
    #
    # The expanded template renders a "wiktQuote" div: reference nodes,
    # then a <dl> holding the quotation details.  The reference text is
    # taken from everything before the <dl>; the transliteration comes
    # from the first "e-transliteration" <i> inside it.  Text,
    # translation and literal meaning are read from the raw template
    # arguments instead of the HTML.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for div_tag in expanded_node.find_html(
        "div", attr_name="class", attr_value="wiktQuote"
    ):
        ref_nodes = []
        for child in div_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "dl":
                for i_tag in child.find_html_recursively(
                    "i", attr_name="class", attr_value="e-transliteration"
                ):
                    example_data.roman = clean_node(wxr, None, i_tag)
                    calculate_bold_offsets(
                        wxr,
                        i_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                # Stop collecting reference nodes once the <dl> is reached.
                break
            ref_nodes.append(child)
        ref_text = clean_node(wxr, None, ref_nodes)
        if len(ref_text) > 0:
            example_data.ref = ref_text
    # Map template arguments directly onto Example fields.
    for t_arg, field in (
        ("quote", "text"),
        ("t", "translation"),
        ("trans", "translation"),
        ("lit", "literal_meaning"),
    ):
        t_arg_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                node.template_parameters.get(t_arg, "")
            ),
            expand_all=True,
        )
        value = clean_node(wxr, None, t_arg_node)
        if len(value) > 0:
            setattr(example_data, field, value)
            calculate_bold_offsets(
                wxr,
                t_arg_node,
                value,
                example_data,
                # e.g. "literal_meaning" -> "bold_literal_offsets"
                "bold_" + field.split("_")[0] + "_offsets",
            )

483 

def extract_inline_alt_forms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Process "inline alt forms"/"alti" templates into alternative forms.

    Each span whose ``lang`` matches the template's first argument becomes
    a ``Form`` tagged "alternative" (with the last sense's gloss text as
    its sense); a preceding qualifier span becomes that form's raw tag,
    and a following "tr Latn" span becomes its romanization.
    """
    gloss_text = " ".join(word_entry.senses[-1].glosses)
    new_forms = []
    pending_raw_tag = ""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_node in expanded.find_html_recursively("span"):
        css_class = span_node.attrs.get("class", "")
        span_lang = span_node.attrs.get("lang", "")
        if "qualifier-content" in css_class:
            # Qualifier applies to the next form word.
            pending_raw_tag = clean_node(wxr, None, span_node)
        elif span_lang == lang_code:
            word = clean_node(wxr, None, span_node)
            if word == "":
                continue
            form = Form(form=word, sense=gloss_text, tags=["alternative"])
            if pending_raw_tag != "":
                form.raw_tags.append(pending_raw_tag)
                pending_raw_tag = ""
            translate_raw_tags(form)
            new_forms.append(form)
        elif css_class == "tr Latn" and len(new_forms) > 0:
            # Romanization of the form just added.
            new_forms[-1].roman = clean_node(wxr, None, span_node)
    word_entry.forms.extend(new_forms)