Coverage for src/wiktextract/extractor/zh/linkage.py: 91%

227 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-11 10:26 +0000

1import re 

2from collections import defaultdict 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..ruby import extract_ruby 

15from .models import Form, Linkage, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect linkage data (synonyms, antonyms, …) from a section.

    Results are attached to the last entry in ``page_data``; when
    ``linkage_type`` is ``"alt_forms"`` they are converted to ``Form``
    objects instead.  Earlier page entries that share language code,
    sounds, etymology text and POS level also receive the data.
    """
    current_sense = ""
    collected: list[Linkage] = []
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                # The list item may update the sense that applies to
                # following items.
                current_sense, found = process_linkage_list_item(
                    wxr, list_item, current_sense
                )
                collected.extend(found)
            continue
        if not isinstance(child, TemplateNode):
            continue
        t_name = child.template_name
        if t_name in ("s", "sense"):
            current_sense = clean_node(wxr, None, child).strip("(): ")
        elif t_name == "zh-dial":
            collected.extend(
                extract_zh_dial_template(wxr, child, current_sense)
            )
        elif t_name.endswith("-saurus") or re.fullmatch(
            r"(?:col|der|rel)\d", t_name, re.I
        ):
            collected.extend(
                process_linkage_col_template(wxr, child, current_sense)
            )
        elif t_name == "ja-r/multi":
            collected.extend(
                extract_ja_r_multi_template(wxr, child, current_sense)
            )

    if linkage_type == "alt_forms":
        # Alternative forms become Form objects tagged "alternative".
        page_data[-1].forms.extend(
            Form(
                form=l_data.word,
                sense=l_data.sense,
                tags=l_data.tags + ["alternative"],
                raw_tags=l_data.raw_tags,
                roman=l_data.roman,
                ruby=l_data.ruby,
            )
            for l_data in collected
        )
    else:
        last_entry = page_data[-1]
        getattr(last_entry, linkage_type).extend(collected)
        # Propagate to sibling entries describing the same word/POS.
        for entry in page_data[:-1]:
            if (
                entry.lang_code == last_entry.lang_code
                and entry.sounds == last_entry.sounds
                and entry.etymology_text == last_entry.etymology_text
                and entry.pos_level == last_entry.pos_level == level_node.kind
            ):
                getattr(entry, linkage_type).extend(collected)

73 

74 

def process_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> tuple[str, list[Linkage]]:
    """Parse a single linkage list item.

    Returns the (possibly updated) sense string together with the
    linkage data found in the item, including nested child lists.
    """
    pending_raw_tags: list[str] = []
    results: list[Linkage] = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name in ("s", "sense"):
                sense = clean_node(wxr, None, child).strip("(): ")
            elif t_name in ("qualifier", "qual"):
                # Qualifiers apply to the next word(s) in the item.
                pending_raw_tags.append(
                    clean_node(wxr, None, child).strip("()")
                )
            elif t_name == "zh-l":
                results.extend(
                    process_zh_l_template(wxr, child, sense, pending_raw_tags)
                )
                pending_raw_tags.clear()
            elif t_name == "ja-r":
                results.append(
                    process_ja_r_template(wxr, child, sense, pending_raw_tags)
                )
                pending_raw_tags.clear()
            elif t_name in ("l", "link", "alter"):
                results.extend(
                    process_l_template(wxr, child, sense, pending_raw_tags)
                )
                pending_raw_tags.clear()
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            # Bare wiki link: the link text is the linked word itself.
            word = clean_node(wxr, None, child)
            if word != "":
                l_data = Linkage(
                    word=word, sense=sense, raw_tags=pending_raw_tags
                )
                translate_raw_tags(l_data)
                results.append(l_data)
                pending_raw_tags.clear()
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Recurse into nested lists with the current sense.
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                _, nested = process_linkage_list_item(wxr, sub_item, sense)
                results.extend(nested)

    return sense, results

124 

125 

def extract_zh_dial_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract dialectal synonyms from an expanded ``zh-dial`` table.

    Each data row pairs a language-group header with cells that hold
    dialect words; every word collects the header text and the region
    link text as raw tags, plus any small-font note attached to it.
    """
    linkage_list = []
    # word -> set of language/region raw tags collected for it
    dial_data = defaultdict(set)
    # word -> set of note texts rendered at font-size:60%
    tag_data = defaultdict(set)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        lang_tag = ""
        region_tag = ""
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row (header cells only, no data)
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                lang_tag = clean_node(wxr, None, header_node)
            if lang_tag == "註解":  # skip last note row ("notes" header)
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                # The cell's link names the dialect region.
                for link_node in cell_node.find_child(NodeKind.LINK):
                    region_tag = clean_node(wxr, None, link_node)
                word = ""
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        # A dialect word; remember it so a following
                        # note span can be attached to it.
                        word = span_text
                        if lang_tag != "":
                            dial_data[span_text].add(lang_tag)
                        if region_tag != "":
                            dial_data[span_text].add(region_tag)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and word != ""
                    ):
                        # Small-font annotation for the preceding word.
                        tag_data[word].add(span_text)

    for term, lang_tags in dial_data.items():
        linkage_data = Linkage(word=term, sense=sense, raw_tags=list(lang_tags))
        linkage_data.raw_tags.extend(list(tag_data.get(term, {})))
        translate_raw_tags(linkage_data)
        linkage_list.append(linkage_data)
    return linkage_list

174 

175 

def process_zh_l_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> list[Linkage]:
    """Extract linkage data from a ``zh-l`` template.

    https://zh.wiktionary.org/wiki/Template:Zh-l

    The expanded template contains traditional/simplified word spans
    (``lang="zh-Hant"``/``"zh-Hans"``) and an optional romanization
    span (``class="Latn"``); one ``Linkage`` is produced per word span.
    """
    # Avoid the shared-mutable-default pitfall: the list may end up
    # referenced by the returned Linkage objects.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    roman = ""
    linkage_list = []
    for latn_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="Latn"
    ):
        roman = clean_node(wxr, None, latn_tag)
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        linkage_data = Linkage(
            sense=sense,
            raw_tags=raw_tags,
            roman=roman,
            word=clean_node(wxr, None, span_tag),
        )
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "zh-Hant":
            linkage_data.tags.append("Traditional Chinese")
        elif lang_attr == "zh-Hans":
            linkage_data.tags.append("Simplified Chinese")
        # "/" is the separator between traditional and simplified forms.
        if len(linkage_data.word) > 0 and linkage_data.word != "/":
            translate_raw_tags(linkage_data)
            linkage_list.append(linkage_data)
    return linkage_list

210 

211 

def process_ja_r_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> Linkage:
    """Extract one linkage item from a ``ja-r`` template.

    https://zh.wiktionary.org/wiki/Template:Ja-r
    """
    # Replace the former mutable default argument with the None idiom.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    return process_expanded_ja_r_node(wxr, expanded_node, sense, raw_tags)

223 

224 

def process_expanded_ja_r_node(
    wxr: WiktextractContext,
    expanded_node: WikiNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> Linkage:
    """Build a ``Linkage`` from an already-expanded ``ja-r`` node.

    Reads the word (with ruby) from the ``lang`` span, romanization
    from the ``tr`` span and an optional gloss from ``mention-gloss``.
    """
    # Replace the former mutable default argument with the None idiom.
    if raw_tags is None:
        raw_tags = []
    linkage_data = Linkage(sense=sense, raw_tags=raw_tags)
    for span_node in expanded_node.find_html("span"):
        span_class = span_node.attrs.get("class", "")
        if "lang" in span_node.attrs:
            # Separate ruby (furigana) annotations from the base text.
            ruby_data, no_ruby_nodes = extract_ruby(wxr, span_node)
            linkage_data.word = clean_node(wxr, None, no_ruby_nodes)
            linkage_data.ruby = ruby_data
        elif "tr" in span_class:
            linkage_data.roman = clean_node(wxr, None, span_node)
        elif "mention-gloss" == span_class:
            # An explicit gloss overrides the inherited sense.
            linkage_data.sense = clean_node(wxr, None, span_node)

    translate_raw_tags(linkage_data)
    return linkage_data

245 

246 

def process_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> list[Linkage]:
    """Extract linkage data from an ``l``/``link``/``alter`` template.

    https://zh.wiktionary.org/wiki/Template:l

    Word spans are matched by the template's language-code argument;
    a following ``*-Latn`` span supplies romanization and a
    ``mention-gloss`` span overrides the sense of the last word.
    """
    # Fixes: return annotation was "-> None" although the function
    # returns a list; raw_tags used a mutable default argument.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    linkage_list = []
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_class = span_tag.attrs.get("class", "")
        if span_lang == lang_code:
            linkage_data = Linkage(
                sense=sense,
                raw_tags=raw_tags,
                word=clean_node(wxr, None, span_tag),
            )
            if len(linkage_data.word) > 0:
                translate_raw_tags(linkage_data)
                linkage_list.append(linkage_data)
        elif span_lang.endswith("-Latn") and len(linkage_list) > 0:
            linkage_list[-1].roman = clean_node(wxr, None, span_tag)
        elif "mention-gloss" == span_class and len(linkage_list) > 0:
            linkage_list[-1].sense = clean_node(wxr, None, span_tag)

    return linkage_list

277 

278 

def process_linkage_col_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract linkage data from column templates like ``col3``.

    https://zh.wiktionary.org/wiki/Template:Col3

    Each ``li`` element may hold several word spans plus shared
    romanization and qualifier spans, which are applied to every word
    found in that item.
    """
    results: list[Linkage] = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for li_tag in expanded.find_html_recursively("li"):
        item_words: list[Linkage] = []
        item_roman = ""
        item_raw_tags: list[str] = []
        for span_tag in li_tag.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            span_class = span_tag.attrs.get("class", "")
            if span_lang.endswith("-Latn"):
                item_roman = clean_node(wxr, None, span_tag)
            elif "qualifier-content" in span_class:
                # Qualifier text may list several tags joined by 或/、.
                qualifier_text = clean_node(wxr, None, span_tag)
                item_raw_tags.extend(
                    part.strip()
                    for part in re.split(r"或|、", qualifier_text)
                    if part.strip() != ""
                )
            elif span_lang != "":
                word_data = Linkage(
                    word=clean_node(wxr, None, span_tag), sense=sense
                )
                if span_class == "Hant":
                    word_data.tags.append("Traditional Chinese")
                elif span_class == "Hans":
                    word_data.tags.append("Simplified Chinese")
                if word_data.word != "":
                    item_words.append(word_data)

        # Shared romanization/qualifiers apply to all words of the item.
        for word_data in item_words:
            word_data.raw_tags.extend(item_raw_tags)
            word_data.roman = item_roman
            translate_raw_tags(word_data)
        results.extend(item_words)

    return results

320 

321 

def process_linkage_templates_in_gloss(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    sense: str,
) -> None:
    """Extract linkage from inline gloss templates like ``synonyms``.

    https://en.wiktionary.org/wiki/Template:synonyms

    Words accumulate in ``l_list``; romanization/gloss spans that
    follow are written back onto the already-collected words.  A bare
    "、" text node separates word groups, flushing the current group to
    ``word_entry`` so later roman/sense spans cannot touch it.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # First positional template argument is the language code.
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    l_list = []
    raw_tags = []
    for top_span_tag in expanded_node.find_html("span"):
        for node in top_span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "span":
                span_lang = node.attrs.get("lang", "")
                span_class = node.attrs.get("class", "")
                if span_lang == lang_code:
                    # A word span in the target language.
                    l_data = Linkage(
                        word=clean_node(wxr, None, node),
                        sense=sense,
                        raw_tags=raw_tags,
                    )
                    if span_class == "Hant":
                        l_data.tags.append("Traditional Chinese")
                    elif span_class == "Hans":
                        l_data.tags.append("Simplified Chinese")
                    if l_data.word != "":
                        l_list.append(l_data)
                elif span_lang == f"{lang_code}-Latn" or "tr" in span_class:
                    # Romanization applies to all words collected so far.
                    roman = clean_node(wxr, None, node)
                    for d in l_list:
                        d.roman = roman
                elif span_class == "mention-gloss":
                    # Explicit gloss overrides the sense retroactively
                    # and for words collected afterwards.
                    sense = clean_node(wxr, None, node)
                    for d in l_list:
                        d.sense = sense
                elif "qualifier-content" in span_class:
                    raw_tag_str = clean_node(wxr, None, node)
                    for raw_tag in raw_tag_str.split(","):
                        raw_tag = raw_tag.strip()
                        if raw_tag != "":
                            raw_tags.append(raw_tag)
            elif isinstance(node, str) and node.strip() == "、":
                # Group separator: flush collected words to the entry.
                getattr(word_entry, linkage_type).extend(l_list)
                l_list.clear()

    # Flush the final group, then normalize raw tags on everything.
    getattr(word_entry, linkage_type).extend(l_list)
    for data in getattr(word_entry, linkage_type):
        translate_raw_tags(data)

374 

375 

def extract_ja_r_multi_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract linkage items from a ``ja-r/multi`` template.

    The expanded template renders as a wiki list; each list item is
    parsed like a single expanded ``ja-r`` template.

    Fix: the return annotation was ``-> Linkage`` although the function
    returns a list of them.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    linkage_list = []
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            linkage_list.append(
                process_expanded_ja_r_node(wxr, list_item, sense, [])
            )

    return linkage_list