Coverage for src/wiktextract/extractor/zh/example.py: 97%

209 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from ..share import calculate_bold_offsets 

7from .linkage import process_linkage_templates_in_gloss 

8from .models import Example, Form, Sense, WordEntry 

9from .tags import translate_raw_tags 

10 

# Maps linkage template names (matched lower-cased) that may appear under a
# gloss to the WordEntry linkage field they populate.
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "antonym": "antonyms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
    "cot": "coordinate_terms",
    "coo": "coordinate_terms",
    "coord": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
}

26 

27 

def extract_example_list_item(
    wxr: WiktextractContext,
    sense_data: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    parent_example: Example | None = None,
) -> None:
    """Extract one example list item found under a sense gloss.

    Dispatches on the template(s) used in the list item and fills an
    ``Example`` model.  Nested list items are processed recursively and
    share the same ``Example`` object (``parent_example``), so e.g. a
    translation line can extend the quotation above it.

    :param sense_data: sense the finished examples (and any category
        links emitted by the templates) are attached to.
    :param parent_example: set only on recursive calls; when given, this
        call updates the parent's data in place instead of appending.
    """
    example_data = parent_example or Example()
    if list_item.contain_node(NodeKind.LIST) and not all(
        isinstance(n, TemplateNode)
        for n in list_item.invert_find_child(NodeKind.LIST)
    ):
        # plain text in the nested list, not using any template
        # https://zh.wiktionary.org/wiki/%, the second example
        extract_plain_text_example_list(wxr, list_item, example_data)
    else:
        # parse example templates
        for child in list_item.find_child(NodeKind.TEMPLATE):
            template_name = child.template_name
            if (
                template_name.startswith(("quote-", "RQ:"))
                or template_name == "quote"
            ):
                extract_quote_templates(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["ja-x", "ja-usex"]:
                extract_template_ja_usex(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["zh-x", "zh-usex", "zh-q", "zh-co"]:
                # zh-x expands to several examples (Traditional and
                # Simplified lines); they are appended here directly
                sense_data.examples.extend(
                    extract_template_zh_x(wxr, child, example_data)
                )
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in [
                "ux",
                "eg",
                "usex",
                "uxi",
                "collocation",
                "co",
                "coi",
                "ko-usex",
                "ko-x",
                "koex",
                "th-usex",
                "th-x",
                "th-xi",
            ]:
                extract_template_ux(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name == "Q":
                extract_template_Q(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name.lower() in LINKAGE_TEMPLATES:
                # linkage templates (e.g. {{syn}}) placed under a gloss
                process_linkage_templates_in_gloss(
                    wxr,
                    word_entry,
                    child,
                    LINKAGE_TEMPLATES[template_name.lower()],
                    " ".join(sense_data.glosses),
                )
            elif template_name.lower() in ["inline alt forms", "alti"]:
                extract_inline_alt_forms_template(wxr, word_entry, child)

        # recurse into nested list items (e.g. a translation line under
        # a quotation), reusing the current example_data
        for next_list_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, sense_data, next_list_item, word_entry, example_data
            )

    # only the top-level call appends; recursive calls mutate in place
    if len(example_data.text) > 0 and parent_example is None:
        sense_data.examples.append(example_data)

101 

102 

def extract_plain_text_example_list(
    wxr: WiktextractContext, list_item: WikiNode, example_data: Example
) -> None:
    """Handle an example written as plain wikitext with a nested list.

    The nodes before the nested list become the reference text; the
    first item of the nested list supplies the example text itself.
    """
    for idx, child_list in list_item.find_child(
        NodeKind.LIST, with_index=True
    ):
        ref_nodes = list_item.children[:idx]
        example_data.ref = clean_node(wxr, None, ref_nodes)
        first_item = child_list.children[0]
        example_data.text = clean_node(wxr, None, first_item.children)

113 

114 

def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `quote-*` and "RQ:*" templates.

    Reads the expanded HTML: the source citation, quotation text (with
    ruby stripped) and translation come from classed <span> tags; the
    first transliteration comes from an <i class="e-transliteration">.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for span_node in expanded.find_html_recursively("span"):
        css_class = span_node.attrs.get("class", "")
        if css_class == "cited-source":
            example_data.ref = clean_node(wxr, None, span_node)
        elif "e-quotation" in css_class:
            # strip ruby first so text offsets match the plain text
            ruby, no_ruby = extract_ruby(wxr, span_node)
            example_data.ruby = ruby
            example_data.text = clean_node(wxr, None, no_ruby)
            calculate_bold_offsets(
                wxr,
                span_node,
                example_data.text,
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in css_class:
            example_data.translation = clean_node(wxr, None, span_node)
            calculate_bold_offsets(
                wxr,
                span_node,
                example_data.translation,
                example_data,
                "bold_translation_offsets",
            )
    for tr_node in expanded.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data.roman = clean_node(wxr, None, tr_node)
        calculate_bold_offsets(
            wxr,
            tr_node,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
        break  # only the first transliteration is used

159 

160 

def extract_template_ja_usex(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Process Japanese usage example templates ("ja-x", "ja-usex").

    Example text comes from the <span class="Jpan"> element (ruby is
    stripped before computing bold offsets); the transliteration from
    <span class="tr">; the translation and literal meaning from the
    template's third positional and "lit" arguments.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby, without_ruby = extract_ruby(wxr, jpan_span)
        example_data.text = clean_node(wxr, None, without_ruby)
        calculate_bold_offsets(
            wxr,
            # re-parse the ruby-free nodes so offsets match the text
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(without_ruby)),
            example_data.text,
            example_data,
            "bold_text_offsets",
        )
        example_data.ruby = ruby
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
    # translation (3rd positional arg) and literal meaning ("lit" arg)
    # are handled identically: expand, clean, record bold offsets
    for param, attr, offsets_field in (
        (3, "translation", "bold_translation_offsets"),
        ("lit", "literal_meaning", "bold_literal_offsets"),
    ):
        arg_root = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(node.template_parameters.get(param, "")),
            expand_all=True,
        )
        value = clean_node(wxr, None, arg_root)
        setattr(example_data, attr, value)
        calculate_bold_offsets(
            wxr, arg_root, value, example_data, offsets_field
        )

215 

216 

def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    parent_example: Example,
) -> list[Example]:
    """Process Chinese example templates ("zh-x", "zh-usex", "zh-q", "zh-co").

    Returns a list of examples because a single template usually expands
    to both a Traditional-Chinese and a Simplified-Chinese line, each of
    which becomes its own ``Example``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    results = []
    example_data = parent_example.model_copy(deep=True)
    # second positional argument holds the translation
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data.translation = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data.translation,
        example_data,
        "bold_translation_offsets",
    )
    # "lit" argument holds the literal meaning
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data.literal_meaning = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data.literal_meaning,
        example_data,
        "bold_literal_offsets",
    )
    # multi-line example with a source reference renders as a <dl> list
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("出自:"):
                # "出自:" prefixes the source/reference line
                example_data.ref = dd_text.removeprefix("出自:")
            elif not dd_text.startswith("(字面義為"):
                # romanization is in a <span lang="Latn">
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data.roman = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                # bracketed span text like "[Classical]" is a raw tag
                for span_tag in dd_tag.find_html_recursively("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text.startswith("[") and span_text.endswith(
                        "]"
                    ):
                        example_data.raw_tags.append(span_text.strip("[]"))
                        break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data.roman = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.roman,
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data.raw_tags.append(span_text.strip("[]"))
        # each zh-Hant/zh-Hans span becomes a separate example
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = example_data.model_copy(deep=True)
                    new_example.text = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example.tags.append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    translate_raw_tags(new_example)
                    results.append(new_example)
    return results

321 

322 

def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
) -> list[Example]:
    # process example text span tag and dialect span tag
    #
    # Each zh-Hant/zh-Hans span becomes a new Example copied from
    # `example`.  Collapsed ("vsHide") spans are processed recursively;
    # the first hidden block extends the most recent example, later ones
    # start from the base example again.  Small-font link spans carry
    # dialect labels that become raw tags.
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            new_example = example.model_copy(deep=True)
            new_example.text = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example.text,
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # dialect/register labels rendered as small links
            for link_node in span_tag.find_child(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1].raw_tags.append(raw_tag)
                    else:
                        example.raw_tags.append(raw_tag)

    # translate tags once, only at the top-level (dl) call, after all
    # nested spans have contributed their raw tags
    if dl_tag.tag == "dl":
        for data in results:
            translate_raw_tags(data)
    return results

367 

368 

def extract_template_ux(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:ux
    """Process generic usage example templates ("ux", "co", "ko-x", ...).

    Maps the expanded template's classed HTML elements onto the Example
    fields; qualifier content becomes raw tags.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # (class marker, Example attribute, bold-offsets field), checked in
    # the same order as the original elif chain
    dispatch = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        classes = html_node.attrs.get("class", "")
        matched = False
        for marker, attr, offsets_field in dispatch:
            if marker in classes:
                value = clean_node(wxr, None, html_node)
                setattr(example_data, attr, value)
                calculate_bold_offsets(
                    wxr, html_node, value, example_data, offsets_field
                )
                matched = True
                break
        if not matched and "qualifier-content" in classes:
            example_data.raw_tags.extend(
                clean_node(wxr, None, html_node).split("、")
            )
            translate_raw_tags(example_data)

419 

420 

def extract_template_Q(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:Q
    #
    # Everything in the quote <div> before the first <dl> child is the
    # reference; the <dl> may carry a transliteration.  The quotation
    # text, translation and literal meaning come from the template's
    # named arguments rather than the rendered HTML.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for div_tag in expanded_node.find_html(
        "div", attr_name="class", attr_value="wiktQuote"
    ):
        ref_nodes = []
        for child in div_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "dl":
                for i_tag in child.find_html_recursively(
                    "i", attr_name="class", attr_value="e-transliteration"
                ):
                    example_data.roman = clean_node(wxr, None, i_tag)
                    calculate_bold_offsets(
                        wxr,
                        i_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                # stop at the first <dl>: later children are not ref text
                break
            ref_nodes.append(child)
        ref_text = clean_node(wxr, None, ref_nodes)
        if len(ref_text) > 0:
            example_data.ref = ref_text
        # (template argument, Example field) pairs; "t" and "trans" are
        # aliases for the translation
        for t_arg, field in (
            ("quote", "text"),
            ("t", "translation"),
            ("trans", "translation"),
            ("lit", "literal_meaning"),
        ):
            t_arg_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(
                    node.template_parameters.get(t_arg, "")
                ),
                expand_all=True,
            )
            value = clean_node(wxr, None, t_arg_node)
            if len(value) > 0:
                setattr(example_data, field, value)
                calculate_bold_offsets(
                    wxr,
                    t_arg_node,
                    value,
                    example_data,
                    "bold_" + field.split("_")[0] + "_offsets",
                )

472 

473 

def extract_inline_alt_forms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract {{inline alt forms}} / {{alti}} into the entry's forms.

    Each word span in the template's language becomes a Form tagged
    "alternative"; a preceding qualifier span is attached to the next
    form as a raw tag, and a "tr Latn" span supplies the romanization of
    the most recent form.
    """
    gloss_text = " ".join(word_entry.senses[-1].glosses)
    new_forms = []
    pending_tag = ""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_node in expanded.find_html_recursively("span"):
        css_class = span_node.attrs.get("class", "")
        node_lang = span_node.attrs.get("lang", "")
        if "qualifier-content" in css_class:
            # remember the qualifier for the form that follows it
            pending_tag = clean_node(wxr, None, span_node)
        elif node_lang == lang_code:
            word_text = clean_node(wxr, None, span_node)
            if word_text == "":
                continue
            new_form = Form(
                form=word_text, sense=gloss_text, tags=["alternative"]
            )
            if pending_tag != "":
                new_form.raw_tags.append(pending_tag)
                pending_tag = ""
            translate_raw_tags(new_form)
            new_forms.append(new_form)
        elif css_class == "tr Latn" and len(new_forms) > 0:
            new_forms[-1].roman = clean_node(wxr, None, span_node)
    word_entry.forms.extend(new_forms)

500 word_entry.forms.extend(forms)