Coverage for src / wiktextract / extractor / zh / linkage.py: 91%

256 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-23 01:12 +0000

1import re 

2 

3from wikitextprocessor import ( 

4 HTMLNode, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from ..ruby import extract_ruby 

14from .models import Form, Linkage, WordEntry 

15from .tags import translate_raw_tags 

16 

17 

def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Extract one linkage section (synonyms, antonyms, alt forms, ...).

    Walks the section's direct template and list children, collecting
    ``Linkage`` objects, then attaches them to the last ``WordEntry`` in
    ``page_data`` under the attribute named by ``linkage_type``.

    ``sense`` text set by a ``{{s}}``/``{{sense}}`` template carries over
    to the linkage items that follow it.
    """
    sense = ""
    linkage_list = []
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if node.kind == NodeKind.LIST:
            for item_node in node.find_child(NodeKind.LIST_ITEM):
                # list items may update the running sense for later items
                sense, new_linkage_list = process_linkage_list_item(
                    wxr, item_node, sense
                )
                linkage_list.extend(new_linkage_list)
        elif isinstance(node, TemplateNode):
            if node.template_name in ["s", "sense"]:
                sense = clean_node(wxr, None, node).strip("(): ")
            elif node.template_name == "zh-dial":
                linkage_list.extend(extract_zh_dial_template(wxr, node, sense))
            elif re.fullmatch(
                r"(?:col|der|rel)\d", node.template_name, re.I
            ) or node.template_name.endswith("-saurus"):
                # column templates: col3, der2, rel4, ... and *-saurus
                linkage_list.extend(
                    process_linkage_col_template(wxr, node, sense)
                )
            elif node.template_name == "ja-r/multi":
                linkage_list.extend(
                    extract_ja_r_multi_template(wxr, node, sense)
                )

    if linkage_type == "alt_forms":
        # alternative forms are stored as Form objects, not Linkage
        forms = [
            Form(
                form=l_data.word,
                sense=l_data.sense,
                tags=l_data.tags + ["alternative"],
                raw_tags=l_data.raw_tags,
                roman=l_data.roman,
                ruby=l_data.ruby,
                attestations=l_data.attestations,
            )
            for l_data in linkage_list
        ]
        page_data[-1].forms.extend(forms)
    else:
        getattr(page_data[-1], linkage_type).extend(linkage_list)
        # also copy to earlier entries of the same language that share
        # sounds/etymology and sit at the same POS level (i.e. sibling
        # POS sections of the same word)
        for data in page_data[:-1]:
            if (
                data.lang_code == page_data[-1].lang_code
                and data.sounds == page_data[-1].sounds
                and data.etymology_texts == page_data[-1].etymology_texts
                and data.pos_level == page_data[-1].pos_level == level_node.kind
            ):
                getattr(data, linkage_type).extend(linkage_list)

73 

74 

def process_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> tuple[str, list[Linkage]]:
    """Extract linkages from one wiki list item.

    Returns the (possibly updated) running ``sense`` string and the list
    of ``Linkage`` objects found in this item. Qualifier templates seen
    before a link template apply to the linkage(s) that template yields;
    trailing qualifiers are attached to the last collected linkage.
    """
    raw_tags = []
    linkage_list = []
    for item_child in list_item.children:
        if isinstance(item_child, TemplateNode):
            if item_child.template_name in ["s", "sense"]:
                sense = clean_node(wxr, None, item_child).strip("(): ")
            elif item_child.template_name in ["qualifier", "qual"]:
                raw_tags.append(clean_node(wxr, None, item_child).strip("()"))
            elif item_child.template_name == "zh-l":
                linkage_list.extend(
                    process_zh_l_template(wxr, item_child, sense, raw_tags)
                )
                raw_tags.clear()
            elif item_child.template_name == "ja-r":
                linkage_list.append(
                    process_ja_r_template(wxr, item_child, sense, raw_tags)
                )
                raw_tags.clear()
            elif item_child.template_name.lower() in [
                "l",
                "link",
                "alter",
                "alt",
            ]:
                linkage_list.extend(
                    process_l_template(wxr, item_child, sense, raw_tags)
                )
                raw_tags.clear()
            elif (
                item_child.template_name.lower() in ["defdate", "datedef"]
                and len(linkage_list) > 0
            ):
                # local import avoids a circular import with .gloss
                from .gloss import extract_defdate_template

                extract_defdate_template(wxr, linkage_list[-1], item_child)
        elif (
            isinstance(item_child, WikiNode)
            and item_child.kind == NodeKind.LINK
        ):
            # plain wiki link: the link text is the linkage word
            word = clean_node(wxr, None, item_child)
            if len(word) > 0:
                linkage_data = Linkage(
                    word=word, sense=sense, raw_tags=raw_tags
                )
                translate_raw_tags(linkage_data)
                linkage_list.append(linkage_data)
                raw_tags.clear()
        elif (
            isinstance(item_child, WikiNode)
            and item_child.kind == NodeKind.LIST
        ):
            # nested list: recurse; the child's sense updates are local
            for child_list_item in item_child.find_child(NodeKind.LIST_ITEM):
                _, new_list = process_linkage_list_item(
                    wxr, child_list_item, sense
                )
                linkage_list.extend(new_list)

    # qualifiers that appeared after the last link apply to it
    if len(raw_tags) > 0 and len(linkage_list) > 0:
        linkage_list[-1].raw_tags.extend(raw_tags)
        translate_raw_tags(linkage_list[-1])
    return sense, linkage_list

139 

140 

def extract_zh_dial_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract dialect synonyms from an expanded ``{{zh-dial}}`` table.

    Two passes over each table: the first collects footnote symbols from
    the row headed "註解" ("notes"); the second walks the data rows,
    tagging each linked word with its language-group (row header) and
    region (cell link) labels, and resolving footnote symbols rendered
    in small text back to their note strings.
    """
    # local import avoids a circular import with .pronunciation
    from .pronunciation import split_zh_pron_raw_tag

    linkage_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        # first pass: build the footnote-symbol -> note text map
        is_note_row = False
        note_tags = {}
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    # "註解" = "notes"; cells after this header hold notes
                    is_note_row = clean_node(wxr, None, cell_node) == "註解"
                elif is_note_row:
                    for note_str in clean_node(wxr, None, cell_node).split(";"):
                        if "-" in note_str:
                            note_symbol, note = note_str.split("-", maxsplit=1)
                            note_symbol = note_symbol.strip()
                            note = note.strip()
                            if note_symbol != "" and note != "":
                                note_tags[note_symbol] = note
        # second pass: collect the dialect words themselves
        lang_tags = []
        region_tags = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                lang_tags = split_zh_pron_raw_tag(
                    clean_node(wxr, None, header_node)
                )
            if lang_tags == ["註解"]:  # skip last note row
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                for link_node in cell_node.find_child(NodeKind.LINK):
                    region_tags = split_zh_pron_raw_tag(
                        clean_node(wxr, None, link_node)
                    )
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        # a dialect word (skip the page's own title)
                        l_data = Linkage(word=span_text, sense=sense)
                        if len(lang_tags) > 0:
                            l_data.raw_tags.extend(lang_tags)
                        if len(region_tags) > 0:
                            l_data.raw_tags.extend(region_tags)
                        translate_raw_tags(l_data)
                        linkage_list.append(l_data)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and len(linkage_list) > 0
                    ):
                        # small-print footnote symbols attach to the word
                        # just collected; unknown symbols kept verbatim
                        for note_symbol in span_text.split(","):
                            note_symbol = note_symbol.strip()
                            raw_tag = note_symbol
                            if note_symbol in note_tags:
                                raw_tag = note_tags[note_symbol]
                            if raw_tag != "":
                                linkage_list[-1].raw_tags.append(raw_tag)
                                translate_raw_tags(linkage_list[-1])

    return linkage_list

212 

213 

def process_zh_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> list[Linkage]:
    """Extract linkages from a ``{{zh-l}}`` template.

    Returns one ``Linkage`` per Chinese span (typically the traditional
    and simplified forms), sharing the same romanization and sense.
    Slash separators and empty spans are skipped.
    """
    # https://zh.wiktionary.org/wiki/Template:Zh-l
    # None sentinel instead of a mutable default: the list is stored into
    # each Linkage and may be mutated later by translate_raw_tags().
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    roman = ""
    linkage_list = []
    # second template argument, when present, overrides the section sense
    new_sense = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if new_sense != "":
        sense = new_sense
    for i_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="Latn"
    ):
        roman = clean_node(wxr, None, i_tag)
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        linkage_data = Linkage(
            sense=sense,
            raw_tags=raw_tags,
            roman=roman,
            word=clean_node(wxr, None, span_tag),
        )
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "zh-Hant":
            linkage_data.tags.append("Traditional-Chinese")
        elif lang_attr == "zh-Hans":
            linkage_data.tags.append("Simplified-Chinese")
        if linkage_data.word not in ["/", ""]:
            translate_raw_tags(linkage_data)
            linkage_list.append(linkage_data)
    return linkage_list

251 

252 

def process_ja_r_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> Linkage:
    """Extract a linkage from a ``{{ja-r}}`` (Japanese ruby link) template.

    Expands the template and delegates parsing of the rendered HTML to
    ``process_expanded_ja_r_node``.
    """
    # https://zh.wiktionary.org/wiki/Template:Ja-r
    # None sentinel instead of a mutable default argument.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    return process_expanded_ja_r_node(wxr, expanded_node, sense, raw_tags)

264 

265 

def process_expanded_ja_r_node(
    wxr: WiktextractContext,
    expanded_node: WikiNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> Linkage:
    """Build a ``Linkage`` from the expanded HTML of a ``{{ja-r}}`` node.

    Reads the word (with ruby annotations) from the ``lang`` span, the
    romanization from the ``tr`` span, and an inline gloss, if present,
    from a ``mention-gloss`` span.
    """
    # None sentinel: the old `raw_tags: list[str] = []` default was stored
    # directly on the Linkage and then mutated by translate_raw_tags(),
    # so state could leak between calls.
    if raw_tags is None:
        raw_tags = []
    linkage_data = Linkage(sense=sense, raw_tags=raw_tags)
    for span_node in expanded_node.find_html("span"):
        span_class = span_node.attrs.get("class", "")
        if "lang" in span_node.attrs:
            ruby_data, no_ruby_nodes = extract_ruby(wxr, span_node)
            linkage_data.word = clean_node(wxr, None, no_ruby_nodes)
            linkage_data.ruby = ruby_data
        elif "tr" in span_class:
            linkage_data.roman = clean_node(wxr, None, span_node)
        elif "mention-gloss" == span_class:
            linkage_data.sense = clean_node(wxr, None, span_node)

    translate_raw_tags(linkage_data)
    return linkage_data

286 

287 

def process_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> list[Linkage]:
    """Extract linkages from ``{{l}}``/``{{link}}``/``{{alter}}``/``{{alt}}``.

    Returns the collected ``Linkage`` objects.

    Fixes from review: the return annotation previously said ``None``
    although the function returns ``linkage_list``; the mutable default
    argument is replaced with a ``None`` sentinel.
    """
    # https://zh.wiktionary.org/wiki/Template:l
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    linkage_list = []
    # the template's first argument names the target language; spans with
    # that lang attribute hold the linked words themselves
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_class = span_tag.attrs.get("class", "").split()
        if span_lang == lang_code:
            linkage_data = Linkage(
                sense=sense,
                raw_tags=raw_tags,
                word=clean_node(wxr, None, span_tag),
            )
            if len(linkage_data.word) > 0:
                translate_raw_tags(linkage_data)
                linkage_list.append(linkage_data)
        elif span_lang.endswith("-Latn") and len(linkage_list) > 0:
            # romanization span applies to the word just collected
            linkage_list[-1].roman = clean_node(wxr, None, span_tag)
        elif "mention-gloss" in span_class and len(linkage_list) > 0:
            linkage_list[-1].sense = clean_node(wxr, None, span_tag)
        elif "ib-content" in span_class and len(linkage_list) > 0:
            # inline qualifier label
            raw_tag = clean_node(wxr, None, span_tag)
            if raw_tag != "":
                linkage_list[-1].raw_tags.append(raw_tag)
                translate_raw_tags(linkage_list[-1])

    return linkage_list

323 

324 

def process_linkage_col_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract linkages from a column-layout template (``col3`` etc.).

    Expands the template, then walks each rendered ``<li>`` item:
    language-tagged spans become linkage words, a ``*-Latn`` span
    supplies the item's romanization, and qualifier spans supply raw
    tags shared by every word in that item.
    """
    # https://zh.wiktionary.org/wiki/Template:Col3
    results = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for li_tag in expanded.find_html_recursively("li"):
        item_entries = []
        item_roman = ""
        item_raw_tags = []
        for span_tag in li_tag.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang.endswith("-Latn"):
                item_roman = clean_node(wxr, None, span_tag)
            elif "qualifier-content" in span_tag.attrs.get("class", ""):
                # qualifier text may list several tags joined by 或/、
                qualifier_text = clean_node(wxr, None, span_tag)
                item_raw_tags.extend(
                    piece.strip()
                    for piece in re.split(r"或|、", qualifier_text)
                    if piece.strip() != ""
                )
            elif span_lang != "":
                entry = Linkage(
                    word=clean_node(wxr, None, span_tag), sense=sense
                )
                script_class = span_tag.attrs.get("class", "")
                if script_class == "Hant":
                    entry.tags.append("Traditional-Chinese")
                elif script_class == "Hans":
                    entry.tags.append("Simplified-Chinese")
                if entry.word != "":
                    item_entries.append(entry)

        # romanization and qualifiers apply to every word of the item
        for entry in item_entries:
            entry.raw_tags.extend(item_raw_tags)
            entry.roman = item_roman
            translate_raw_tags(entry)
        results.extend(item_entries)

    return results

366 

367 

def process_linkage_templates_in_gloss(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    sense: str,
) -> None:
    """Extract inline gloss-level linkage templates (e.g. ``{{synonyms}}``).

    Appends the extracted ``Linkage`` objects directly to the attribute
    of ``word_entry`` named by ``linkage_type``. The parse is
    order-sensitive: romanization/gloss spans update the items collected
    so far, and a "、" text separator flushes the current batch.
    """
    # https://en.wiktionary.org/wiki/Template:synonyms
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # first template argument is the language code of the linked words
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    l_list = []
    raw_tags = []
    for top_span_tag in expanded_node.find_html("span"):
        for node in top_span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "span":
                span_lang = node.attrs.get("lang", "")
                span_class = node.attrs.get("class", "")
                if span_lang == lang_code:
                    l_data = Linkage(
                        word=clean_node(wxr, None, node),
                        sense=sense,
                        raw_tags=raw_tags,
                    )
                    if span_class == "Hant":
                        l_data.tags.append("Traditional-Chinese")
                    elif span_class == "Hans":
                        l_data.tags.append("Simplified-Chinese")
                    if l_data.word != "":
                        l_list.append(l_data)
                elif span_lang == f"{lang_code}-Latn" or "tr" in span_class:
                    # romanization applies to all words collected so far
                    roman = clean_node(wxr, None, node)
                    for d in l_list:
                        d.roman = roman
                elif span_class == "mention-gloss":
                    # inline gloss overrides the sense for collected words
                    sense = clean_node(wxr, None, node)
                    for d in l_list:
                        d.sense = sense
                elif "qualifier-content" in span_class:
                    raw_tag_str = clean_node(wxr, None, node)
                    for raw_tag in raw_tag_str.split(","):
                        raw_tag = raw_tag.strip()
                        if raw_tag != "":
                            raw_tags.append(raw_tag)
            elif isinstance(node, str) and node.strip() == "、":
                # "、" separates word groups: flush the current batch
                getattr(word_entry, linkage_type).extend(l_list)
                l_list.clear()

    getattr(word_entry, linkage_type).extend(l_list)
    for data in getattr(word_entry, linkage_type):
        translate_raw_tags(data)

420 

421 

def extract_ja_r_multi_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract linkages from a ``{{ja-r/multi}}`` template.

    The expanded template renders a wiki list; each list item is parsed
    like a single ``{{ja-r}}`` expansion.

    Fix from review: the return annotation previously said ``Linkage``
    although the function builds and returns a list of them.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    linkage_list = []
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            linkage_list.append(
                process_expanded_ja_r_node(wxr, list_item, sense, [])
            )

    return linkage_list