Coverage for src/wiktextract/extractor/zh/thesaurus.py: 92%

124 statements  


import re

from mediawiki_langcodes import name_to_code
from wikitextprocessor import Page
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...thesaurus import ThesaurusTerm
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from .section_titles import LINKAGE_TITLES, POS_TITLES
from .tags import translate_raw_tags

SENSE_SUBTITLE_PREFIX = "詞義:"  # "word sense:"
IGNORED_SUBTITLES = frozenset(
    [
        "參見",  # see also
        "参见",
        "延伸閱讀",  # further reading
        "延伸阅读",
    ]
)


def parse_section(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    level_node: WikiNode,
) -> list[ThesaurusTerm]:
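    """Recursively collect thesaurus terms from one section of the page.

    Child headings are dispatched by level: a level-3 heading names a part
    of speech (looked up in POS_TITLES), a level-4 heading names a sense
    (with the "詞義:" prefix stripped), and a level-5 heading names a
    linkage type (looked up in LINKAGE_TITLES), whose lists and templates
    are then processed. Subtitles in IGNORED_SUBTITLES are skipped.
    """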

    data = []
    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        next_level_title = clean_node(wxr, None, next_level_node.largs)
        if next_level_title in IGNORED_SUBTITLES:  # coverage: never true in tests
            continue
        elif next_level_node.kind == NodeKind.LEVEL3:
            local_pos_name = next_level_title
            english_pos = POS_TITLES.get(local_pos_name, {}).get("pos")
            if english_pos is None:  # coverage: never true in tests
                logger.warning(
                    f"Unrecognized POS subtitle: {local_pos_name} in page "
                    f"Thesaurus:{entry_word}"
                )
                english_pos = local_pos_name
            data.extend(
                parse_section(
                    wxr,
                    entry_word,
                    lang_code,
                    english_pos,
                    "",
                    "",
                    next_level_node,
                )
            )
        elif next_level_node.kind == NodeKind.LEVEL4:
            sense_text = next_level_title.removeprefix(SENSE_SUBTITLE_PREFIX)
            data.extend(
                parse_section(
                    wxr,
                    entry_word,
                    lang_code,
                    pos,
                    sense_text,
                    "",
                    next_level_node,
                )
            )
        elif next_level_node.kind == NodeKind.LEVEL5:  # coverage: always true in tests
            local_linkage_name = next_level_title
            english_linkage = LINKAGE_TITLES.get(local_linkage_name)
            if english_linkage is None:  # coverage: never true in tests
                logger.warning(
                    f"Unrecognized linkage subtitle: {local_linkage_name} "
                    f"in page Thesaurus:{entry_word}"
                )
                english_linkage = local_linkage_name
            for node in next_level_node.find_child(
                NodeKind.LIST | NodeKind.TEMPLATE
            ):
                if isinstance(node, TemplateNode):
                    data.extend(
                        process_linkage_template(
                            wxr,
                            entry_word,
                            lang_code,
                            pos,
                            sense,
                            english_linkage,
                            node,
                        )
                    )
                elif node.kind == NodeKind.LIST:  # coverage: always true in tests
                    data.extend(
                        process_list_node(
                            wxr,
                            entry_word,
                            lang_code,
                            pos,
                            sense,
                            english_linkage,
                            node,
                        )
                    )

    return data


def process_linkage_template(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    template_node: TemplateNode,
) -> list[ThesaurusTerm]:
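    """Dispatch a linkage template to the matching handler.

    "col1", "col2", ... templates go to process_col_template(); the
    obsolete "zh-der", "zh-syn-list" and "zh-ant-list" templates go to
    process_obsolete_zh_der_template(). Other templates yield no terms.
    """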

    if re.fullmatch(r"col\d", template_node.template_name.strip(), re.I):
        return process_col_template(
            wxr, entry_word, lang_code, pos, sense, linkage_type, template_node
        )
    elif template_node.template_name.lower() in (
        "zh-der",
        "zh-syn-list",
        "zh-ant-list",
    ):
        return process_obsolete_zh_der_template(
            wxr, entry_word, lang_code, pos, sense, linkage_type, template_node
        )

    return []


def process_list_node(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    list_node: WikiNode,
) -> list[ThesaurusTerm]:
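    """Extract terms from a plain wiki list under a linkage heading.

    Qualifier templates ("qual", "i", "qf", "qualifier") contribute raw
    tags that are attached to every term found in the same list item,
    while "ja-r" templates produce the terms themselves.
    """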

    term_list = []
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        current_data = []
        raw_tags = []
        for list_child_template in list_item_node.find_child(NodeKind.TEMPLATE):
            if list_child_template.template_name.lower() in (
                "qual",
                "i",
                "qf",
                "qualifier",
            ):
                for (
                    param_value
                ) in list_child_template.template_parameters.values():
                    raw_tags.append(clean_node(wxr, None, param_value))
            elif list_child_template.template_name == "ja-r":  # coverage: always true in tests
                current_data.append(
                    process_thesaurus_ja_r_template(
                        wxr,
                        entry_word,
                        lang_code,
                        pos,
                        sense,
                        linkage_type,
                        list_child_template,
                    )
                )

        for data in current_data:
            data.raw_tags.extend(raw_tags)
            translate_raw_tags(data)
        term_list.extend(current_data)

    return term_list


def process_col_template(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    template_node: TemplateNode,
) -> list[ThesaurusTerm]:
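    """Extract terms from an expanded "colN" column template.

    The template is expanded to HTML and each <li> element is scanned for
    <span> tags: a "lang" attribute ending in "-Latn" carries the
    romanization, a "qualifier-content" class carries raw tags, and any
    other language-tagged span carries a term; "Hant" and "Hans" classes
    mark traditional and simplified Chinese forms.
    """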

    # https://zh.wiktionary.org/wiki/Template:Col3
    term_list = []
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for ui_tag in expanded_template.find_html_recursively("li"):
        current_data = []
        roman = ""
        raw_tags = []
        for span_tag in ui_tag.find_html("span"):
            if span_tag.attrs.get("lang", "").endswith("-Latn"):
                roman = clean_node(wxr, None, span_tag)
            elif "qualifier-content" in span_tag.attrs.get("class", ""):
                raw_tags.append(clean_node(wxr, None, span_tag))
            elif span_tag.attrs.get("lang", "") != "":
                term_text = clean_node(wxr, None, span_tag)
                term_data = ThesaurusTerm(
                    entry_word,
                    lang_code,
                    pos,
                    linkage_type,
                    term_text,
                    sense=sense,
                )
                class_names = span_tag.attrs.get("class", "")
                if class_names == "Hant":
                    term_data.tags.append("Traditional Chinese")
                elif class_names == "Hans":
                    term_data.tags.append("Simplified Chinese")
                current_data.append(term_data)

        for data in current_data:
            data.raw_tags.extend(raw_tags)
            data.roman = roman
            translate_raw_tags(data)
        term_list.extend(current_data)

    return term_list


def process_obsolete_zh_der_template(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    template_node: TemplateNode,
) -> list[ThesaurusTerm]:
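    """Extract terms from the obsolete "zh-der" family of templates.

    The expanded HTML is scanned list item by list item: a span with a
    "Latn" class carries the romanization shared by the item's terms, any
    other language-tagged span carries a term, and bare "/" separators are
    skipped.
    """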

    # https://zh.wiktionary.org/wiki/Template:Zh-der
    term_list = []
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for list_item_node in expanded_template.find_child_recursively(
        NodeKind.LIST_ITEM
    ):
        current_data = []
        roman = ""
        for span_tag in list_item_node.find_html_recursively("span"):
            if "Latn" in span_tag.attrs.get("class", ""):
                roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") != "":  # coverage: always true in tests
                term_text = clean_node(wxr, None, span_tag)
                if term_text == "/":  # coverage: never true in tests
                    continue
                current_data.append(
                    ThesaurusTerm(
                        entry_word,
                        lang_code,
                        pos,
                        linkage_type,
                        term_text,
                        sense=sense,
                    )
                )
        for data in current_data:
            data.roman = roman
        term_list.extend(current_data)

    return term_list


def process_thesaurus_ja_r_template(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    template_node: TemplateNode,
) -> ThesaurusTerm:
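    """Convert a "ja-r" template into a single ThesaurusTerm.

    Delegates to process_ja_r_template() from the linkage module and
    reuses the word and romanization it extracts.
    """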

    from .linkage import process_ja_r_template

    linkage_data = process_ja_r_template(wxr, template_node, "")
    return ThesaurusTerm(
        entry_word,
        lang_code,
        pos,
        linkage_type,
        linkage_data.word,
        sense=sense,
        roman=linkage_data.roman,
    )


def extract_thesaurus_page(
    wxr: WiktextractContext, page: Page
) -> list[ThesaurusTerm]:
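    """Extract all thesaurus terms from one "Thesaurus:" namespace page.

    The entry word is the page title with the namespace prefix stripped;
    each level-2 heading names a language, which is resolved to a language
    code via name_to_code() before its sections are parsed.
    """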

    entry = page.title[page.title.find(":") + 1 :]
    wxr.wtp.start_page(page.title)
    root = wxr.wtp.parse(page.body)
    data = []
    for level2_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, level2_node.largs)
        lang_code = name_to_code(lang_name, "zh")
        if lang_code == "":  # coverage: never true in tests
            logger.warning(
                f"Unrecognized language: {lang_name} in page Thesaurus:{entry}"
            )
        data.extend(
            parse_section(wxr, entry, lang_code, "", "", "", level2_node)
        )
    return data
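
# Minimal usage sketch (not part of the module). It assumes a configured
# WiktextractContext "wxr" whose Wtp already has dump data loaded, and an
# existing thesaurus page; the page title below is purely illustrative:
#
#     page = wxr.wtp.get_page("Thesaurus:天空")
#     if page is not None and page.body is not None:
#         for term in extract_thesaurus_page(wxr, page):
#             print(term)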