Coverage for src/wiktextract/extractor/zh/example.py: 97%

213 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-06 08:01 +0000

1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from ..share import calculate_bold_offsets 

7from .linkage import process_linkage_templates_in_gloss 

8from .models import Example, Form, Sense, WordEntry 

9from .tags import translate_raw_tags 

10 

# Maps linkage template names (lowercased, including common aliases) to the
# linkage field name passed to process_linkage_templates_in_gloss().
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "antonym": "antonyms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
    "cot": "coordinate_terms",
    "coo": "coordinate_terms",
    "coord": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
}

26 

27 

def extract_example_list_item(
    wxr: WiktextractContext,
    sense_data: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    parent_example: Example | None = None,
) -> None:
    """Extract one example list item under a gloss.

    Dispatches on the template(s) found in the list item; recurses into
    nested list items so multi-line examples (text + translation lines)
    accumulate into the same Example. When called without a
    ``parent_example``, a non-empty result is appended to
    ``sense_data.examples``; nested calls mutate the parent's Example
    instead.
    """
    example_data = parent_example or Example()
    if list_item.contain_node(NodeKind.LIST) and not all(
        isinstance(n, TemplateNode)
        for n in list_item.invert_find_child(NodeKind.LIST)
    ):
        # plain text in the nested list, not using any template
        # https://zh.wiktionary.org/wiki/%, the second example
        extract_plain_text_example_list(
            wxr, sense_data, list_item, word_entry, example_data
        )
    elif list_item.contain_node(NodeKind.TEMPLATE):
        # parse example templates
        for child in list_item.find_child(NodeKind.TEMPLATE):
            template_name = child.template_name
            if (
                template_name.startswith(("quote-", "RQ:"))
                or template_name == "quote"
            ):
                extract_quote_templates(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["ja-x", "ja-usex"]:
                extract_template_ja_usex(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["zh-x", "zh-usex", "zh-q", "zh-co"]:
                # zh templates produce one Example per script variant
                # (traditional/simplified), hence extend() here
                sense_data.examples.extend(
                    extract_template_zh_x(wxr, child, example_data)
                )
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in [
                "ux",
                "eg",
                "usex",
                "uxi",
                "collocation",
                "co",
                "coi",
                "ko-usex",
                "ko-x",
                "koex",
                "th-usex",
                "th-x",
                "th-xi",
            ]:
                extract_template_ux(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name == "Q":
                extract_template_Q(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name.lower() in LINKAGE_TEMPLATES:
                # linkage templates in example position attach to the
                # word entry, not to the example
                process_linkage_templates_in_gloss(
                    wxr,
                    word_entry,
                    child,
                    LINKAGE_TEMPLATES[template_name.lower()],
                    " ".join(sense_data.glosses),
                )
            elif template_name.lower() in ["inline alt forms", "alti"]:
                extract_inline_alt_forms_template(wxr, word_entry, child)

        # nested list items (e.g. the translation line of a quote) are
        # merged into the same example_data
        for next_list_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, sense_data, next_list_item, word_entry, example_data
            )
    elif not list_item.contain_node(NodeKind.LIST):
        # no templates and no nested list: treat the whole item as text
        example_data.text = clean_node(wxr, None, list_item.children)

    if len(example_data.text) > 0 and parent_example is None:
        sense_data.examples.append(example_data)

105 

106 

def extract_plain_text_example_list(
    wxr: WiktextractContext,
    sense: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    example_data: Example,
) -> None:
    """Handle a plain-text example whose reference line precedes a nested
    list containing the example body (no template involved).

    Everything before the nested list becomes the reference; each nested
    list item is then processed as a child example of ``example_data``.
    """
    for list_index, sub_list in list_item.find_child(
        NodeKind.LIST, with_index=True
    ):
        # the nodes preceding the nested list form the citation/reference
        example_data.ref = clean_node(
            wxr, None, list_item.children[:list_index]
        )
        for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
            extract_example_list_item(
                wxr, sense, sub_item, word_entry, example_data
            )

122 

123 

def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `quote-*` and "RQ:*" templates.

    Reads the expanded HTML: "cited-source" span -> ref, "e-quotation"
    span -> text (ruby stripped), "e-translation" span -> translation,
    and the first "e-transliteration" <i> tag -> roman. Mutates
    ``example_data`` in place.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "cited-source" == span_class:
            example_data.ref = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            # ruby annotations (furigana) are split out before cleaning
            example_data.ruby, node_without_ruby = extract_ruby(wxr, span_tag)
            example_data.text = clean_node(wxr, None, node_without_ruby)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.text,
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in span_class:
            example_data.translation = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.translation,
                example_data,
                "bold_translation_offsets",
            )
    # only the first transliteration <i> tag is used
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data.roman = clean_node(wxr, None, i_tag)
        calculate_bold_offsets(
            wxr,
            i_tag,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
        break

168 

169 

def extract_template_ja_usex(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Process Japanese example templates ("ja-x", "ja-usex").

    From the expanded HTML: "Jpan" span -> text (with ruby split out),
    "tr" span -> roman. The translation (positional arg 3) and literal
    meaning ("lit" arg) are taken from the raw template parameters.
    Mutates ``example_data`` in place.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        # separate furigana ruby from the base text before cleaning
        ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
        example_data.text = clean_node(wxr, None, node_without_ruby)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(node_without_ruby)),
            example_data.text,
            example_data,
            "bold_text_offsets",
        )
        example_data.ruby = ruby_data
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data.roman = clean_node(wxr, None, span_tag)
        calculate_bold_offsets(
            wxr,
            span_tag,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(3, "")),
        expand_all=True,
    )
    translation = clean_node(wxr, None, tr_arg)
    # only set fields when the template argument is present; this matches
    # extract_template_Q() and avoids bold-offset work on empty strings
    if len(translation) > 0:
        example_data.translation = translation
        calculate_bold_offsets(
            wxr,
            tr_arg,
            example_data.translation,
            example_data,
            "bold_translation_offsets",
        )
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get("lit", "")),
        expand_all=True,
    )
    literal = clean_node(wxr, None, lit_arg)
    if len(literal) > 0:
        example_data.literal_meaning = literal
        calculate_bold_offsets(
            wxr,
            lit_arg,
            example_data.literal_meaning,
            example_data,
            "bold_literal_offsets",
        )

224 

225 

def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    parent_example: Example,
) -> list[Example]:
    """Process Chinese example templates ("zh-x", "zh-usex", "zh-q", "zh-co").

    Returns a list of Examples: the template usually renders the example
    once per script (traditional and simplified), so one Example is
    produced per variant, each copied from ``parent_example`` with the
    shared translation/literal-meaning already filled in.

    Two layouts are handled: a <dl> layout (example with source/roman in
    <dd> tags, possibly collapsed), and a single-line layout without <dl>.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    results = []
    example_data = parent_example.model_copy(deep=True)
    # translation is positional arg 2; literal meaning is the "lit" arg
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data.translation = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data.translation,
        example_data,
        "bold_translation_offsets",
    )
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data.literal_meaning = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data.literal_meaning,
        example_data,
        "bold_literal_offsets",
    )
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("出自:"):
                # source/citation line
                example_data.ref = dd_text.removeprefix("出自:")
            elif not dd_text.startswith("(字面義為"):
                # skip the literal-meaning dd; this dd holds the
                # romanization and bracketed dialect/style tags
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data.roman = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                for span_tag in dd_tag.find_html_recursively("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text.startswith("[") and span_text.endswith(
                        "]"
                    ):
                        example_data.raw_tags.append(span_text.strip("[]"))
                break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            # first Latn span is the romanization
            example_data.roman = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.roman,
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data.raw_tags.append(span_text.strip("[]"))
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    # one Example per script variant
                    new_example = example_data.model_copy(deep=True)
                    new_example.text = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example.tags.append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    translate_raw_tags(new_example)
                    results.append(new_example)
    return results

330 

331 

def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
) -> list[Example]:
    # process example text span tag and dialect span tag
    # Recursive: also called on nested "vsHide" spans (when the template
    # has "collapsed=y"). Returns one Example per script-variant span.
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            # example text in one script variant
            new_example = example.model_copy(deep=True)
            new_example.text = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example.text,
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # the first hidden block continues the previous variant's
            # Example; later ones start again from the base example
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # dialect/region labels rendered as small links; attach them
            # to the most recent Example (or the base one if none yet)
            for link_node in span_tag.find_child(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1].raw_tags.append(raw_tag)
                    else:
                        example.raw_tags.append(raw_tag)

    # translate tags only at the top of the recursion (the real <dl>)
    if dl_tag.tag == "dl":
        for data in results:
            translate_raw_tags(data)
    return results

376 

377 

def extract_template_ux(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Process generic usage-example templates ("ux", "usex", "co", …).

    https://zh.wiktionary.org/wiki/Template:ux

    Walks the expanded HTML and fills ``example_data`` fields from the
    CSS classes the template emits; "qualifier-content" spans become raw
    tags (split on the Chinese enumeration comma).
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # (class marker, Example attribute, bold-offset field), checked in
    # the same order as the original elif chain; first match wins
    class_to_field = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )
    for html_node in expanded_node.find_child_recursively(NodeKind.HTML):
        class_names = html_node.attrs.get("class", "")
        for marker, attr, offset_field in class_to_field:
            if marker in class_names:
                value = clean_node(wxr, None, html_node)
                setattr(example_data, attr, value)
                calculate_bold_offsets(
                    wxr, html_node, value, example_data, offset_field
                )
                break
        else:
            if "qualifier-content" in class_names:
                example_data.raw_tags.extend(
                    clean_node(wxr, None, html_node).split("、")
                )
                translate_raw_tags(example_data)

428 

429 

def extract_template_Q(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:Q
    # The rendered div holds the citation nodes first, then a <dl> with
    # the quote body; quote/translation/literal text is read from the raw
    # template args instead of the HTML.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for div_tag in expanded_node.find_html(
        "div", attr_name="class", attr_value="wiktQuote"
    ):
        ref_nodes = []
        for child in div_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "dl":
                # the <dl> marks the end of the citation part; pull the
                # transliteration out of it and stop collecting
                for i_tag in child.find_html_recursively(
                    "i", attr_name="class", attr_value="e-transliteration"
                ):
                    example_data.roman = clean_node(wxr, None, i_tag)
                    calculate_bold_offsets(
                        wxr,
                        i_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                break
            ref_nodes.append(child)
        ref_text = clean_node(wxr, None, ref_nodes)
        if len(ref_text) > 0:
            example_data.ref = ref_text
    # text fields come from the template parameters themselves
    for t_arg, field in (
        ("quote", "text"),
        ("t", "translation"),
        ("trans", "translation"),
        ("lit", "literal_meaning"),
    ):
        t_arg_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                node.template_parameters.get(t_arg, "")
            ),
            expand_all=True,
        )
        value = clean_node(wxr, None, t_arg_node)
        if len(value) > 0:
            setattr(example_data, field, value)
            calculate_bold_offsets(
                wxr,
                t_arg_node,
                value,
                example_data,
                "bold_" + field.split("_")[0] + "_offsets",
            )

482 

def extract_inline_alt_forms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Process "inline alt forms" / "alti" templates.

    Creates alternative-form Form entries (tagged "alternative") for the
    last sense's gloss; a preceding qualifier span becomes a raw tag on
    the next form, and a "tr Latn" span sets the previous form's roman.
    """
    gloss_text = " ".join(word_entry.senses[-1].glosses)
    collected: list = []
    pending_raw_tag = ""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # positional arg 1 is the language code the form spans are tagged with
    target_lang = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded.find_html_recursively("span"):
        css_class = span_tag.attrs.get("class", "")
        if "qualifier-content" in css_class:
            # qualifier applies to the following form
            pending_raw_tag = clean_node(wxr, None, span_tag)
        elif span_tag.attrs.get("lang", "") == target_lang:
            if (word := clean_node(wxr, None, span_tag)) != "":
                new_form = Form(
                    form=word, sense=gloss_text, tags=["alternative"]
                )
                if pending_raw_tag != "":
                    new_form.raw_tags.append(pending_raw_tag)
                    pending_raw_tag = ""
                translate_raw_tags(new_form)
                collected.append(new_form)
        elif css_class == "tr Latn" and len(collected) > 0:
            collected[-1].roman = clean_node(wxr, None, span_tag)
    word_entry.forms.extend(collected)