Coverage for src/wiktextract/extractor/zh/example.py: 97%

208 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from ..share import calculate_bold_offsets 

7from .linkage import process_linkage_templates_in_gloss 

8from .models import Example, Form, Sense, WordEntry 

9from .tags import translate_raw_tags 

10 

# Maps linkage template names (matched case-insensitively in
# extract_example_list_item) to the WordEntry linkage field they fill.
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "antonym": "antonyms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
    "cot": "coordinate_terms",
    "coo": "coordinate_terms",
    "coord": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
}

26 

27 

def extract_example_list_item(
    wxr: WiktextractContext,
    sense_data: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    parent_example: Example | None = None,
) -> None:
    """Extract one example list item under a gloss.

    Dispatches on the template(s) found in the list item, filling
    ``example_data`` (and, for some templates, ``sense_data`` /
    ``word_entry`` directly).  Recurses into nested list items, passing
    the current example down so children can add to it.  Only the
    top-level call (``parent_example is None``) appends the accumulated
    example to ``sense_data.examples``.
    """
    # Reuse the parent's example when recursing so nested list items
    # enrich the same Example object.
    example_data = parent_example or Example()
    if list_item.contain_node(NodeKind.LIST) and not all(
        isinstance(n, TemplateNode)
        for n in list_item.invert_find_child(NodeKind.LIST)
    ):
        # plain text in the nested list, not using any template
        # https://zh.wiktionary.org/wiki/%, the second example
        extract_plain_text_example_list(wxr, list_item, example_data)
    else:
        # parse example templates
        for child in list_item.find_child(NodeKind.TEMPLATE):
            template_name = child.template_name
            if (
                template_name.startswith(("quote-", "RQ:"))
                or template_name == "quote"
            ):
                extract_quote_templates(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["ja-x", "ja-usex"]:
                extract_template_ja_usex(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["zh-x", "zh-usex", "zh-q", "zh-co"]:
                # zh-x expands into one example per script variant, so the
                # results go straight onto the sense.
                sense_data.examples.extend(
                    extract_template_zh_x(wxr, child, example_data)
                )
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in [
                "ux",
                "eg",
                "usex",
                "uxi",
                "collocation",
                "co",
                "coi",
                "ko-usex",
                "ko-x",
                "koex",
                "th-usex",
                "th-x",
                "th-xi",
            ]:
                extract_template_ux(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name == "Q":
                extract_template_Q(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name.lower() in LINKAGE_TEMPLATES:
                # Linkage templates under a gloss attach to the word entry,
                # tagged with the joined gloss text as the sense.
                process_linkage_templates_in_gloss(
                    wxr,
                    word_entry,
                    child,
                    LINKAGE_TEMPLATES[template_name.lower()],
                    " ".join(sense_data.glosses),
                )
            elif template_name.lower() in ["inline alt forms", "alti"]:
                extract_inline_alt_forms_template(wxr, word_entry, child)

        # Recurse into nested list items; they share this example.
        for next_list_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, sense_data, next_list_item, word_entry, example_data
            )

    # Only the outermost call saves the example, and only if any text
    # was actually extracted.
    if len(example_data.text) > 0 and parent_example is None:
        sense_data.examples.append(example_data)

101 

102 

def extract_plain_text_example_list(
    wxr: WiktextractContext, list_item: WikiNode, example_data: Example
) -> None:
    """Handle an example written as plain wikitext with a nested list.

    The text before the nested list is the source reference; the first
    item of the nested list is the example text itself.
    """
    for idx, sub_list in list_item.find_child(NodeKind.LIST, with_index=True):
        ref_nodes = list_item.children[:idx]
        example_data.ref = clean_node(wxr, None, ref_nodes)
        first_item = sub_list.children[0]
        example_data.text = clean_node(wxr, None, first_item.children)

113 

114 

def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Process `quote-*` and "RQ:*" templates.

    Reads the expanded HTML: the citation span becomes ``ref``, the
    quotation span becomes ``text``, the translation span becomes
    ``translation``, and the first transliteration ``<i>`` becomes
    ``roman``.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        if css_class == "cited-source":
            example_data.ref = clean_node(wxr, None, span)
        elif "e-quotation" in css_class:
            quote_text = clean_node(wxr, None, span)
            example_data.text = quote_text
            calculate_bold_offsets(
                wxr, span, quote_text, example_data, "bold_text_offsets"
            )
        elif "e-translation" in css_class:
            tr_text = clean_node(wxr, None, span)
            example_data.translation = tr_text
            calculate_bold_offsets(
                wxr, span, tr_text, example_data, "bold_translation_offsets"
            )
    for italic in expanded.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        roman_text = clean_node(wxr, None, italic)
        example_data.roman = roman_text
        calculate_bold_offsets(
            wxr, italic, roman_text, example_data, "bold_roman_offsets"
        )
        break  # only the first transliteration is used

158 

159 

def extract_template_ja_usex(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Process Japanese example templates ``ja-x`` / ``ja-usex``.

    Takes the example text (with ruby stripped into ``example_data.ruby``)
    from the expanded ``span.Jpan`` element, the romanization from the
    ``span.tr`` element, and the translation / literal meaning from the
    raw template arguments (3 and ``lit``).
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        # Separate ruby (furigana) annotations from the base text.
        ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
        example_data.text = clean_node(wxr, None, node_without_ruby)
        # Re-parse the ruby-free nodes so bold offsets are computed
        # against the same text that was cleaned above.
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(node_without_ruby)),
            example_data.text,
            example_data,
            "bold_text_offsets",
        )
        example_data.ruby = ruby_data
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data.roman = clean_node(wxr, None, span_tag)
        calculate_bold_offsets(
            wxr,
            span_tag,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
    # Translation comes from the third positional template argument.
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(3, "")),
        expand_all=True,
    )
    example_data.translation = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data.translation,
        example_data,
        "bold_translation_offsets",
    )
    # Literal meaning comes from the "lit" named argument.
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get("lit", "")),
        expand_all=True,
    )
    example_data.literal_meaning = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data.literal_meaning,
        example_data,
        "bold_literal_offsets",
    )

214 

215 

def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    parent_example: Example,
) -> list[Example]:
    """Process Chinese example templates ``zh-x`` / ``zh-usex`` / ``zh-q`` /
    ``zh-co``.

    The template expands into one example per script variant
    (Traditional / Simplified), so a list of Example objects is
    returned.  Translation and literal meaning are taken from the raw
    template arguments (2 and ``lit``); the rest is read from the
    expanded HTML.  Sourced (multi-line) examples expand to a ``<dl>``
    element; single-line examples do not.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    results = []
    # Work on a copy so the caller's example is not mutated.
    example_data = parent_example.model_copy(deep=True)
    # Translation: second positional template argument.
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data.translation = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data.translation,
        example_data,
        "bold_translation_offsets",
    )
    # Literal meaning: "lit" named argument.
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data.literal_meaning = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data.literal_meaning,
        example_data,
        "bold_literal_offsets",
    )
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("出自:"):
                # "出自:" marks the source/reference line.
                example_data.ref = dd_text.removeprefix("出自:")
            elif not dd_text.startswith("(字面義為"):
                # Not the literal-meaning line: this dd holds the
                # romanization plus bracketed dialect/register labels.
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data.roman = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                    # Bracketed span texts like "[官話]" are raw tags.
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data.raw_tags.append(span_text.strip("[]"))
                    break  # only the first Latn span is used
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data.roman = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.roman,
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data.raw_tags.append(span_text.strip("[]"))
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    # One Example per script variant, each a deep copy
                    # sharing the translation/roman/tags gathered above.
                    new_example = example_data.model_copy(deep=True)
                    new_example.text = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example.tags.append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    translate_raw_tags(new_example)
                    results.append(new_example)
    return results

320 

321 

def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
) -> list[Example]:
    """Extract examples from the span tags inside a ``zh-x`` ``<dl>``.

    Called recursively for collapsed ("vsHide") sections.  Returns one
    Example per Chinese-script span; dialect links in the small-font
    span become raw tags on the most recent example (or on ``example``
    if none has been created yet).
    """
    # process example text span tag and dialect span tag
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            new_example = example.model_copy(deep=True)
            new_example.text = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example.text,
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # The first hidden section continues the last visible example
            # (so its tags attach there); later ones start from the base.
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # Small-font span holds wikilinks naming the dialect/region.
            for link_node in span_tag.find_child(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1].raw_tags.append(raw_tag)
                    else:
                        example.raw_tags.append(raw_tag)

    # Only the outermost (real <dl>) call translates raw tags, after all
    # recursive calls have finished adding them.
    if dl_tag.tag == "dl":
        for data in results:
            translate_raw_tags(data)
    return results

366 

367 

def extract_template_ux(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Process usage-example templates rendered with e-* CSS classes.

    https://zh.wiktionary.org/wiki/Template:ux

    Maps the expanded HTML elements onto Example fields: e-example →
    text, e-transliteration → roman, e-translation → translation,
    e-literally → literal_meaning; qualifier-content spans become raw
    tags.
    """
    # Ordered (marker, Example field, offsets field) dispatch table;
    # order mirrors the original elif priority.
    field_map = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        class_names = html_node.attrs.get("class", "")
        for marker, field, offsets_field in field_map:
            if marker in class_names:
                value = clean_node(wxr, None, html_node)
                setattr(example_data, field, value)
                calculate_bold_offsets(
                    wxr, html_node, value, example_data, offsets_field
                )
                break
        else:
            if "qualifier-content" in class_names:
                # Qualifiers are "、"-separated labels.
                labels = clean_node(wxr, None, html_node).split("、")
                example_data.raw_tags.extend(labels)
                translate_raw_tags(example_data)

418 

419 

def extract_template_Q(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Process the quotation template ``Q``.

    https://zh.wiktionary.org/wiki/Template:Q

    The citation text is everything in the ``div.wiktQuote`` before the
    ``<dl>``; the transliteration comes from the ``<dl>``; quote,
    translation and literal meaning are read from the raw template
    arguments.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for div_tag in expanded_node.find_html(
        "div", attr_name="class", attr_value="wiktQuote"
    ):
        ref_nodes = []
        for child in div_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "dl":
                for i_tag in child.find_html_recursively(
                    "i", attr_name="class", attr_value="e-transliteration"
                ):
                    example_data.roman = clean_node(wxr, None, i_tag)
                    calculate_bold_offsets(
                        wxr,
                        i_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                # Everything after the <dl> is not part of the citation;
                # stop collecting ref nodes.
                break
            ref_nodes.append(child)
        ref_text = clean_node(wxr, None, ref_nodes)
        if len(ref_text) > 0:
            example_data.ref = ref_text
        # Pull the remaining fields from the raw template arguments;
        # "t" and "trans" are aliases for the translation.
        for t_arg, field in (
            ("quote", "text"),
            ("t", "translation"),
            ("trans", "translation"),
            ("lit", "literal_meaning"),
        ):
            t_arg_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(
                    node.template_parameters.get(t_arg, "")
                ),
                expand_all=True,
            )
            value = clean_node(wxr, None, t_arg_node)
            if len(value) > 0:
                setattr(example_data, field, value)
                # e.g. "literal_meaning" -> "bold_literal_offsets"
                calculate_bold_offsets(
                    wxr,
                    t_arg_node,
                    value,
                    example_data,
                    "bold_" + field.split("_")[0] + "_offsets",
                )

472 

def extract_inline_alt_forms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract ``{{inline alt forms}}`` / ``{{alti}}`` into Form entries.

    Each span in the template's language becomes an "alternative" form
    attached to the last sense's gloss text; a preceding qualifier span
    becomes a raw tag on the next form, and a "tr Latn" span supplies
    the romanization of the previous form.
    """
    gloss_text = " ".join(word_entry.senses[-1].glosses)
    new_forms = []
    pending_raw_tag = ""
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    # First positional argument is the language code of the forms.
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded.find_html_recursively("span"):
        css_class = span_tag.attrs.get("class", "")
        span_lang = span_tag.attrs.get("lang", "")
        if "qualifier-content" in css_class:
            # Remember the qualifier; it applies to the next form span.
            pending_raw_tag = clean_node(wxr, None, span_tag)
        elif span_lang == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                form = Form(form=word, sense=gloss_text, tags=["alternative"])
                if pending_raw_tag != "":
                    form.raw_tags.append(pending_raw_tag)
                    pending_raw_tag = ""
                translate_raw_tags(form)
                new_forms.append(form)
        elif css_class == "tr Latn" and len(new_forms) > 0:
            new_forms[-1].roman = clean_node(wxr, None, span_tag)
    word_entry.forms.extend(new_forms)