Coverage for src/wiktextract/extractor/zh/example.py: 97%

184 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from ..share import calculate_bold_offsets 

7from .linkage import process_linkage_templates_in_gloss 

8from .models import Example, Sense, WordEntry 

9from .tags import translate_raw_tags 

10 

# Maps linkage template names (and their short aliases) that may appear in a
# gloss example list to the WordEntry linkage field they populate; used by
# extract_example_list_item() to route "syn"/"ant"/… templates to
# process_linkage_templates_in_gloss().
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
}

21 

22 

def extract_example_list_item(
    wxr: WiktextractContext,
    sense_data: Sense,
    list_item: WikiNode,
    word_entry: WordEntry,
    parent_example: Example | None = None,
) -> None:
    """Extract example data from one list item under a gloss.

    Dispatches on the template(s) found in ``list_item``: quotation templates
    ("quote-*", "RQ:*"), language-specific usage-example templates ("ja-x",
    "zh-x", "ux", …), the "Q" quotation template, and linkage templates
    ("syn", "ant", …).  Results are appended to ``sense_data.examples``
    (linkage data goes to ``word_entry`` instead).  Recurses into nested list
    items so that a child item (e.g. a translation line) can extend the
    example started by its parent; ``parent_example`` carries that
    partially-filled example down the recursion.
    """
    # Reuse the parent's partially-filled example when recursing; otherwise
    # start a fresh one.
    example_data = parent_example or Example()
    if list_item.contain_node(NodeKind.LIST) and not all(
        isinstance(n, TemplateNode)
        for n in list_item.invert_find_child(NodeKind.LIST)
    ):
        # plain text in the nested list, not using any template
        # https://zh.wiktionary.org/wiki/%, the second example
        extract_plain_text_example_list(wxr, list_item, example_data)
    else:
        # parse example templates
        for child in list_item.find_child(NodeKind.TEMPLATE):
            template_name = child.template_name
            if (
                template_name.startswith(("quote-", "RQ:"))
                or template_name == "quote"
            ):
                extract_quote_templates(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["ja-x", "ja-usex"]:
                extract_template_ja_usex(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["zh-x", "zh-usex", "zh-q", "zh-co"]:
                # zh-x may expand to several Example objects (one per
                # Traditional/Simplified script form), so extend directly.
                sense_data.examples.extend(
                    extract_template_zh_x(wxr, child, example_data)
                )
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in [
                "ux",
                "eg",
                "usex",
                "uxi",
                "collocation",
                "co",
                "coi",
                "ko-usex",
                "ko-x",
                "koex",
                "th-usex",
                "th-x",
                "th-xi",
            ]:
                extract_template_ux(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name == "Q":
                extract_template_Q(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in LINKAGE_TEMPLATES:
                process_linkage_templates_in_gloss(
                    wxr,
                    word_entry,
                    child,
                    LINKAGE_TEMPLATES[template_name],
                    sense_data.glosses[0]
                    if len(sense_data.glosses) > 0
                    else "",
                )
            else:
                # Unknown template: fall back to its plain-text expansion.
                example_data.text = clean_node(wxr, None, child)

    # Recurse into nested list items, passing the current example down so a
    # child line can add e.g. a translation to it.
    for next_list_item in list_item.find_child_recursively(
        NodeKind.LIST_ITEM
    ):
        extract_example_list_item(
            wxr, sense_data, next_list_item, word_entry, example_data
        )

    # Only the top-level call appends, after all children have contributed;
    # zh-x examples were already appended above.
    if len(example_data.text) > 0 and parent_example is None:
        sense_data.examples.append(example_data)

98 

99 

def extract_plain_text_example_list(
    wxr: WiktextractContext, list_item: WikiNode, example_data: Example
) -> None:
    """Handle a plain-text example written as a nested list.

    The content of ``list_item`` before its nested list is the reference
    (source) line; the first item of the nested list is the example text.
    """
    for idx, sub_list in list_item.find_child(NodeKind.LIST, with_index=True):
        ref_nodes = list_item.children[:idx]
        example_data.ref = clean_node(wxr, None, ref_nodes)
        text_nodes = sub_list.children[0].children
        example_data.text = clean_node(wxr, None, text_nodes)

110 

111 

def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `quote-*` and "RQ:*" templates.

    Reads the expanded HTML: the citation source, quoted text, and its
    translation come from ``<span>`` tags; the transliteration (first one
    only) comes from an ``<i class="e-transliteration">`` tag.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        # Exact match for the source span; substring match for the others.
        if css_class == "cited-source":
            example_data.ref = clean_node(wxr, None, span)
        elif "e-quotation" in css_class:
            example_data.text = clean_node(wxr, None, span)
            calculate_bold_offsets(
                wxr, span, example_data.text, example_data, "bold_text_offsets"
            )
        elif "e-translation" in css_class:
            example_data.translation = clean_node(wxr, None, span)
            calculate_bold_offsets(
                wxr,
                span,
                example_data.translation,
                example_data,
                "bold_translation_offsets",
            )
    for translit_tag in expanded.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data.roman = clean_node(wxr, None, translit_tag)
        calculate_bold_offsets(
            wxr,
            translit_tag,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
        break  # only the first transliteration is used

155 

156 

def extract_template_ja_usex(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """Extract a Japanese usage example from a "ja-x"/"ja-usex" template.

    The Japanese text (with ruby annotations separated out) and the
    romanization are read from the expanded HTML; the translation (third
    positional argument) and literal meaning ("lit") are read straight from
    the template parameters.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    # Example text: the span with class "Jpan"; ruby is stripped so the
    # plain text and ruby pairs are stored separately.
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, no_ruby_node = extract_ruby(wxr, jpan_span)
        example_data.text = clean_node(wxr, None, no_ruby_node)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby_node)),
            example_data.text,
            example_data,
            "bold_text_offsets",
        )
        example_data.ruby = ruby_data
    # Romanization: span with class "tr".
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
    # Translation and literal meaning use the same parse/clean/offset steps.
    for param, attr, offset_field in (
        (3, "translation", "bold_translation_offsets"),
        ("lit", "literal_meaning", "bold_literal_offsets"),
    ):
        parsed_arg = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(node.template_parameters.get(param, "")),
            expand_all=True,
        )
        setattr(example_data, attr, clean_node(wxr, None, parsed_arg))
        calculate_bold_offsets(
            wxr,
            parsed_arg,
            getattr(example_data, attr),
            example_data,
            offset_field,
        )

211 

212 

def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    parent_example: Example,
) -> list[Example]:
    """Extract Chinese usage examples from "zh-x"/"zh-usex"/"zh-q"/"zh-co".

    One template usually produces two Example objects — one for the
    Traditional and one for the Simplified script form.  Translation (second
    positional argument) and literal meaning ("lit") are read from the
    template parameters; text, romanization, source reference, and dialect
    tags come from the expanded HTML.  Returns the list of extracted
    examples.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    results = []
    # Copy so the shared parent example is not mutated.
    example_data = parent_example.model_copy(deep=True)
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data.translation = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data.translation,
        example_data,
        "bold_translation_offsets",
    )
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data.literal_meaning = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data.literal_meaning,
        example_data,
        "bold_literal_offsets",
    )
    # Multi-line layout with a source: example metadata lives in <dd> tags
    # inside a <dl>.
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("出自:"):
                # "出自:" prefix marks the source/reference line.
                example_data.ref = dd_text.removeprefix("出自:")
            elif not dd_text.startswith("(字面義為"):
                # Not the literal-meaning line: this <dd> holds the
                # romanization and bracketed dialect tags.
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data.roman = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                for span_tag in dd_tag.find_html_recursively("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text.startswith("[") and span_text.endswith(
                        "]"
                    ):
                        example_data.raw_tags.append(span_text.strip("[]"))
                break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        # First Latn span is the romanization.
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data.roman = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data.roman,
                example_data,
                "bold_roman_offsets",
            )
            break
        # Bracketed span text, e.g. "[Classical Chinese]", becomes raw tags.
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data.raw_tags.append(span_text.strip("[]"))
        # One Example per script form (Traditional / Simplified).
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = example_data.model_copy(deep=True)
                    new_example.text = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example.tags.append(
                        "Traditional Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified Chinese"
                    )
                    translate_raw_tags(new_example)
                    results.append(new_example)
    return results

317 

318 

def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
) -> list[Example]:
    # process example text span tag and dialect span tag
    """Extract example text and dialect tags from a zh-x <dl> tag.

    Creates one Example per zh-Hant/zh-Hans span, recurses into collapsed
    "vsHide" spans, and attaches dialect link names (small-font spans) as
    raw tags.  Returns the extracted examples; raw tags are translated only
    at the top level of the recursion (when ``dl_tag`` is the real <dl>).
    """
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            new_example = example.model_copy(deep=True)
            new_example.text = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example.text,
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # The first hidden block continues the last visible example (if
            # any); later ones start again from the shared base example.
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # Small-font span holds dialect links; tag the most recent
            # example, or the base example when none exists yet.
            for link_node in span_tag.find_child(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1].raw_tags.append(raw_tag)
                    else:
                        example.raw_tags.append(raw_tag)

    # Translate raw tags once, at the outermost (non-recursive) call.
    if dl_tag.tag == "dl":
        for data in results:
            translate_raw_tags(data)
    return results

363 

364 

def extract_template_ux(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:ux
    """Extract a usage example from "ux"-family templates.

    HTML nodes in the expanded template are routed by CSS class to the
    matching Example field; "qualifier-content" nodes become raw tags.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    # (class substring, Example attribute, bold-offset field), checked in
    # this order — first match wins.
    class_to_field = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )
    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        class_names = html_node.attrs.get("class", "")
        for marker, attr, offset_field in class_to_field:
            if marker in class_names:
                value = clean_node(wxr, None, html_node)
                setattr(example_data, attr, value)
                calculate_bold_offsets(
                    wxr, html_node, value, example_data, offset_field
                )
                break
        else:
            if "qualifier-content" in class_names:
                # Qualifiers are "、"-separated labels.
                example_data.raw_tags.extend(
                    clean_node(wxr, None, html_node).split("、")
                )
    translate_raw_tags(example_data)

415 

416 

def extract_template_Q(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:Q
    """Extract a quotation from the "Q" template.

    The citation reference and transliteration are read from the expanded
    HTML ("wiktQuote" div); the quoted text, translation, and literal
    meaning are read from the template parameters.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for div_tag in expanded_node.find_html(
        "div", attr_name="class", attr_value="wiktQuote"
    ):
        # Children before the first <dl> form the citation reference; the
        # <dl> itself holds the transliteration.
        ref_nodes = []
        for child in div_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "dl":
                for i_tag in child.find_html_recursively(
                    "i", attr_name="class", attr_value="e-transliteration"
                ):
                    example_data.roman = clean_node(wxr, None, i_tag)
                    calculate_bold_offsets(
                        wxr,
                        i_tag,
                        example_data.roman,
                        example_data,
                        "bold_roman_offsets",
                    )
                    break  # only the first transliteration
                break  # stop collecting ref nodes at the first <dl>
            ref_nodes.append(child)
        ref_text = clean_node(wxr, None, ref_nodes)
        if len(ref_text) > 0:
            example_data.ref = ref_text
        # Template parameters mapped to Example fields; "t" and "trans" are
        # aliases for the translation.
        for t_arg, field in (
            ("quote", "text"),
            ("t", "translation"),
            ("trans", "translation"),
            ("lit", "literal_meaning"),
        ):
            t_arg_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(
                    node.template_parameters.get(t_arg, "")
                ),
                expand_all=True,
            )
            value = clean_node(wxr, None, t_arg_node)
            if len(value) > 0:
                setattr(example_data, field, value)
                calculate_bold_offsets(
                    wxr,
                    t_arg_node,
                    value,
                    example_data,
                    # e.g. "translation" -> "bold_translation_offsets"
                    "bold_" + field.split("_")[0] + "_offsets",
                )