Coverage for src/wiktextract/extractor/zh/linkage.py: 87%

169 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

import itertools
from collections import defaultdict
from typing import Optional

from wikitextprocessor.parser import (
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Linkage, WordEntry
from .tags import translate_raw_tags

16 

17 

def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect linkage data (synonyms, antonyms, …) from a section node.

    Walks the lists and templates directly under ``level_node``,
    accumulates ``Linkage`` objects, then appends them to the
    ``WordEntry`` attribute named by ``linkage_type``.
    """
    sense = ""
    collected: list[Linkage] = []
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                # The sense text threads through: a sense set by one list
                # item applies to the items that follow it.
                sense, found = process_linkage_list_item(
                    wxr, list_item, sense
                )
                collected.extend(found)
            continue
        if not isinstance(child, TemplateNode):
            continue
        t_name = child.template_name
        if t_name in ["s", "sense"]:
            sense = clean_node(wxr, None, child).strip("(): ")
        elif t_name == "zh-dial":
            collected.extend(extract_zh_dial_template(wxr, child, sense))
        elif t_name.endswith("-saurus"):
            collected.extend(
                extract_saurus_template(
                    wxr, page_data, child, linkage_type, sense
                )
            )
        elif t_name.startswith("col"):
            collected.extend(
                process_linkage_col_template(wxr, child, sense)
            )

    if level_node.kind == NodeKind.LEVEL3:
        # A level-3 linkage section applies to every entry that shares the
        # language of the most recent entry.
        for entry in page_data:
            if entry.lang_code == page_data[-1].lang_code:
                getattr(entry, linkage_type).extend(collected)
    elif len(page_data) > 0:
        # Deeper sections attach only to the entry being built.
        getattr(page_data[-1], linkage_type).extend(collected)

57 

58 

def process_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> tuple[str, list[Linkage]]:
    """Parse one linkage list item.

    Returns the (possibly updated) sense string along with the
    ``Linkage`` objects found in the item. Qualifier templates queue up
    raw tags that are attached to the next linkage word and then reset.
    """
    pending_raw_tags: list[str] = []
    results: list[Linkage] = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in ["s", "sense"]:
                sense = clean_node(wxr, None, child).strip("(): ")
            elif name in ["qualifier", "qual"]:
                pending_raw_tags.append(
                    clean_node(wxr, None, child).strip("()")
                )
            elif name == "zh-l":
                results.extend(
                    process_zh_l_template(wxr, child, sense, pending_raw_tags)
                )
                pending_raw_tags.clear()
            elif name == "ja-r":
                results.append(
                    process_ja_r_template(wxr, child, sense, pending_raw_tags)
                )
                pending_raw_tags.clear()
            elif name in ["l", "link", "alter"]:
                results.extend(
                    process_l_template(wxr, child, sense, pending_raw_tags)
                )
                pending_raw_tags.clear()
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            # A bare wiki link is a linkage word with no template markup.
            word = clean_node(wxr, None, child)
            if len(word) > 0:
                link_data = Linkage(
                    word=word, sense=sense, raw_tags=pending_raw_tags
                )
                translate_raw_tags(link_data)
                results.append(link_data)
                pending_raw_tags.clear()
    return sense, results

98 

99 

def extract_saurus_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    node: TemplateNode,
    linkage_type: str,
    sense: str,
) -> list[Linkage]:
    """
    Extract data from template names end with "-saurus", like "zh-syn-saurus"
    and "zh-ant-saurus". These templates get data from thesaurus pages, search
    the thesaurus database to avoid parse these pages again.

    https://zh.wiktionary.org/wiki/Template:Syn-saurus
    """
    from wiktextract.thesaurus import search_thesaurus

    # The obsolete zh-specific templates take the thesaurus page title as
    # the first argument; the current templates take it as the second.
    if node.template_name in ("zh-syn-saurus", "zh-ant-saurus"):
        thesaurus_page_title = node.template_parameters.get(1)
    else:
        thesaurus_page_title = node.template_parameters.get(2)

    results: list[Linkage] = []
    lang_code = page_data[-1].lang_code
    pos = page_data[-1].pos
    for term_data in search_thesaurus(
        wxr.thesaurus_db_conn,
        thesaurus_page_title,
        lang_code,
        pos,
        linkage_type,
    ):
        # Don't list the page's own headword among its linkages.
        if term_data.term == wxr.wtp.title:
            continue
        results.append(
            Linkage(
                word=term_data.term,
                roman=term_data.roman,
                tags=term_data.tags,
                raw_tags=term_data.raw_tags,
                sense=sense,
            )
        )
    return results

143 

144 

def extract_zh_dial_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Parse the expanded "zh-dial" dialect-synonym table.

    The template expands to a table mapping dialect groups/regions to
    word forms. Each Chinese-language span becomes a Linkage word; the
    header cell (dialect group) and the region link of its cell are
    attached as raw tags, plus any small-font annotation spans.
    """
    linkage_list = []
    # word -> set of dialect/region labels the word appeared under
    dial_data = defaultdict(set)
    # word -> set of small-font annotation texts shown next to the word
    tag_data = defaultdict(set)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        lang_tag = ""
        region_tag = ""
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            # Row header cell names the dialect group for this row.
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                lang_tag = clean_node(wxr, None, header_node)
            if lang_tag == "註解":  # skip last note row
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                # Wiki link in the cell names the region; the last link
                # seen applies to the spans that follow it.
                for link_node in cell_node.find_child(NodeKind.LINK):
                    region_tag = clean_node(wxr, None, link_node)
                word = ""
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        # A Chinese span (other than the page title itself)
                        # is a dialect word; record its labels.
                        word = span_text
                        if lang_tag != "":
                            dial_data[span_text].add(lang_tag)
                        if region_tag != "":
                            dial_data[span_text].add(region_tag)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and word != ""
                    ):
                        # Small-font span annotates the word just seen.
                        tag_data[word].add(span_text)

    for term, lang_tags in dial_data.items():
        linkage_data = Linkage(word=term, sense=sense, raw_tags=list(lang_tags))
        linkage_data.raw_tags.extend(list(tag_data.get(term, {})))
        translate_raw_tags(linkage_data)
        linkage_list.append(linkage_data)
    return linkage_list

193 

194 

def process_zh_l_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: Optional[list[str]] = None,
) -> list[Linkage]:
    """Extract linkage words from a "zh-l" template.

    The expanded template contains one span per script form (traditional
    and simplified Chinese) plus a Latin span with the romanization.

    https://zh.wiktionary.org/wiki/Template:Zh-l
    """
    # Fix: avoid a mutable default argument — a shared list would leak
    # state between calls.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    roman = ""
    linkage_list = []
    # The Latn-class span holds the romanization shared by all forms.
    for i_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="Latn"
    ):
        roman = clean_node(wxr, None, i_tag)
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        linkage_data = Linkage(
            sense=sense,
            raw_tags=raw_tags,
            roman=roman,
            word=clean_node(wxr, None, span_tag),
        )
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "zh-Hant":
            linkage_data.tags.append("Traditional Chinese")
        elif lang_attr == "zh-Hans":
            linkage_data.tags.append("Simplified Chinese")
        # "/" is the separator rendered between the two script forms.
        if len(linkage_data.word) > 0 and linkage_data.word != "/":
            translate_raw_tags(linkage_data)
            linkage_list.append(linkage_data)
    return linkage_list

229 

230 

def process_ja_r_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: Optional[list[str]] = None,
) -> Linkage:
    """Extract a Japanese linkage word from a "ja-r" template.

    The expanded template contains a lang span (word with ruby), a "tr"
    span (romanization), and optionally a "mention-gloss" span.

    https://zh.wiktionary.org/wiki/Template:Ja-r
    """
    # Fix: avoid a mutable default argument — a shared list would leak
    # state between calls.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    linkage_data = Linkage(sense=sense, raw_tags=raw_tags)
    for span_node in expanded_node.find_html("span"):
        span_class = span_node.attrs.get("class", "")
        if "lang" in span_node.attrs:
            # Separate ruby (furigana) annotations from the word text.
            ruby_data, no_ruby_nodes = extract_ruby(wxr, span_node)
            linkage_data.word = clean_node(wxr, None, no_ruby_nodes)
            linkage_data.ruby = ruby_data
        elif "tr" in span_class:
            linkage_data.roman = clean_node(wxr, None, span_node)
        elif "mention-gloss" == span_class:
            linkage_data.sense = clean_node(wxr, None, span_node)

    translate_raw_tags(linkage_data)
    return linkage_data

255 

256 

def process_l_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: Optional[list[str]] = None,
) -> list[Linkage]:
    """Extract linkage words from "l", "link" or "alter" templates.

    https://zh.wiktionary.org/wiki/Template:l
    """
    # Fix 1: the return annotation was "-> None" although the function
    # returns a list of Linkage objects.
    # Fix 2: avoid a mutable default argument — a shared list would leak
    # state between calls.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    linkage_list = []
    for span_tag in expanded_node.find_html("span", attr_name="lang"):
        linkage_data = Linkage(
            sense=sense, raw_tags=raw_tags, word=clean_node(wxr, None, span_tag)
        )
        if len(linkage_data.word) > 0:
            translate_raw_tags(linkage_data)
            linkage_list.append(linkage_data)
    return linkage_list

276 

277 

def process_linkage_col_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Convert a "col*" column-list template into Linkage objects.

    Reuses the thesaurus extractor's column parser and re-wraps its
    results, attaching the current sense.
    """
    from .thesaurus import process_col_template

    results: list[Linkage] = []
    for col_item in process_col_template(
        wxr, "", "", "", "", "", template_node
    ):
        new_linkage = Linkage(
            word=col_item.term,
            roman=col_item.roman,
            tags=col_item.tags,
            raw_tags=col_item.raw_tags,
            sense=sense,
        )
        if len(new_linkage.word) > 0:
            translate_raw_tags(new_linkage)
            results.append(new_linkage)
    return results

296 

297 

def process_linkage_templates_in_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: TemplateNode,
    linkage_type: str,
    sense: str,
) -> None:
    """Extract linkage words from a template used inside a gloss line.

    Word arguments start at positional parameter 2 and run until the
    first missing index; each non-empty, non-Thesaurus word is appended
    to the current entry's ``linkage_type`` list with the given sense.
    """
    word_index = 2
    while word_index in template_node.template_parameters:
        word = clean_node(
            wxr, None, template_node.template_parameters[word_index]
        )
        word_index += 1
        if len(word) == 0:
            continue
        # Thesaurus page references are handled elsewhere, not as words.
        if word.startswith(wxr.wtp.NAMESPACE_DATA["Thesaurus"]["name"] + ":"):
            continue
        getattr(page_data[-1], linkage_type).append(
            Linkage(word=word, sense=sense)
        )