Coverage for src/wiktextract/extractor/zh/thesaurus.py: 92%

124 statements  


import re

from mediawiki_langcodes import name_to_code
from wikitextprocessor import Page
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...thesaurus import ThesaurusTerm
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from .section_titles import LINKAGE_TITLES, POS_TITLES
from .tags import translate_raw_tags

SENSE_SUBTITLE_PREFIX = "詞義:"  # "word sense:"
IGNORED_SUBTITLES = frozenset(
    [
        "參見",  # see also
        "参见",
        "延伸閱讀",  # further reading
        "延伸阅读",
    ]
)


def parse_section(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    level_node: WikiNode,
) -> list[ThesaurusTerm]:
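    """Recursively collect thesaurus terms from one section of the page.

    Child headings are dispatched by level: a level-3 heading names a part
    of speech (looked up in POS_TITLES), a level-4 heading names a sense
    (with the "詞義:" prefix stripped), and a level-5 heading names a
    linkage type (looked up in LINKAGE_TITLES), whose lists and templates
    are then processed. Subtitles in IGNORED_SUBTITLES are skipped.
    """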

    data = []
    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        next_level_title = clean_node(wxr, None, next_level_node.largs)
        if next_level_title in IGNORED_SUBTITLES:  # coverage: never true in tests
            continue
        elif next_level_node.kind == NodeKind.LEVEL3:
            local_pos_name = next_level_title
            english_pos = POS_TITLES.get(local_pos_name, {}).get("pos")
            if english_pos is None:  # coverage: never true in tests
                logger.warning(
                    f"Unrecognized POS subtitle: {local_pos_name} in page "
                    f"Thesaurus:{entry_word}"
                )
                english_pos = local_pos_name
            data.extend(
                parse_section(
                    wxr,
                    entry_word,
                    lang_code,
                    english_pos,
                    "",
                    "",
                    next_level_node,
                )
            )
        elif next_level_node.kind == NodeKind.LEVEL4:
            sense_text = next_level_title.removeprefix(SENSE_SUBTITLE_PREFIX)
            data.extend(
                parse_section(
                    wxr,
                    entry_word,
                    lang_code,
                    pos,
                    sense_text,
                    "",
                    next_level_node,
                )
            )
        elif next_level_node.kind == NodeKind.LEVEL5:  # coverage: always true in tests
            local_linkage_name = next_level_title
            english_linkage = LINKAGE_TITLES.get(local_linkage_name)
            if english_linkage is None:  # coverage: never true in tests
                logger.warning(
                    f"Unrecognized linkage subtitle: {local_linkage_name} "
                    f"in page Thesaurus:{entry_word}"
                )
                english_linkage = local_linkage_name
            for node in next_level_node.find_child(
                NodeKind.LIST | NodeKind.TEMPLATE
            ):
                if isinstance(node, TemplateNode):
                    data.extend(
                        process_linkage_template(
                            wxr,
                            entry_word,
                            lang_code,
                            pos,
                            sense,
                            english_linkage,
                            node,
                        )
                    )
                elif node.kind == NodeKind.LIST:  # coverage: always true in tests
                    data.extend(
                        process_list_node(
                            wxr,
                            entry_word,
                            lang_code,
                            pos,
                            sense,
                            english_linkage,
                            node,
                        )
                    )

    return data


def process_linkage_template(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    template_node: TemplateNode,
) -> list[ThesaurusTerm]:
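    """Dispatch a linkage template to the matching handler.

    "col1", "col2", ... templates go to process_col_template(); the
    obsolete "zh-der", "zh-syn-list" and "zh-ant-list" templates go to
    process_obsolete_zh_der_template(). Other templates yield no terms.
    """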

    if re.fullmatch(r"col\d", template_node.template_name.strip(), re.I):
        return process_col_template(
            wxr, entry_word, lang_code, pos, sense, linkage_type, template_node
        )
    elif template_node.template_name.lower() in (
        "zh-der",
        "zh-syn-list",
        "zh-ant-list",
    ):
        return process_obsolete_zh_der_template(
            wxr, entry_word, lang_code, pos, sense, linkage_type, template_node
        )

    return []


def process_list_node(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    list_node: WikiNode,
) -> list[ThesaurusTerm]:
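    """Extract terms from a plain wiki list under a linkage heading.

    Qualifier templates ("qual", "i", "qf", "qualifier") contribute raw
    tags that are attached to every term found in the same list item,
    while "ja-r" templates produce the terms themselves.
    """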

    term_list = []
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        current_data = []
        raw_tags = []
        for list_child_template in list_item_node.find_child(NodeKind.TEMPLATE):
            if list_child_template.template_name.lower() in (
                "qual",
                "i",
                "qf",
                "qualifier",
            ):
                for (
                    param_value
                ) in list_child_template.template_parameters.values():
                    raw_tags.append(clean_node(wxr, None, param_value))
            elif list_child_template.template_name == "ja-r":  # coverage: always true in tests
                current_data.append(
                    process_thesaurus_ja_r_template(
                        wxr,
                        entry_word,
                        lang_code,
                        pos,
                        sense,
                        linkage_type,
                        list_child_template,
                    )
                )

        for data in current_data:
            data.raw_tags.extend(raw_tags)
            translate_raw_tags(data)
        term_list.extend(current_data)

    return term_list


def process_col_template(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    template_node: TemplateNode,
) -> list[ThesaurusTerm]:
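    """Extract terms from an expanded "colN" column template.

    The template is expanded to HTML and each <li> element is scanned for
    <span> tags: a "lang" attribute ending in "-Latn" carries the
    romanization, a "qualifier-content" class carries raw tags, and any
    other language-tagged span carries a term; "Hant" and "Hans" classes
    mark traditional and simplified Chinese forms.
    """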

    # https://zh.wiktionary.org/wiki/Template:Col3
    term_list = []
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for ui_tag in expanded_template.find_html_recursively("li"):
        current_data = []
        roman = ""
        raw_tags = []
        for span_tag in ui_tag.find_html("span"):
            if span_tag.attrs.get("lang", "").endswith("-Latn"):
                roman = clean_node(wxr, None, span_tag)
            elif "qualifier-content" in span_tag.attrs.get("class", ""):
                raw_tags.append(clean_node(wxr, None, span_tag))
            elif span_tag.attrs.get("lang", "") != "":
                term_text = clean_node(wxr, None, span_tag)
                term_data = ThesaurusTerm(
                    entry_word,
                    lang_code,
                    pos,
                    linkage_type,
                    term_text,
                    sense=sense,
                )
                class_names = span_tag.attrs.get("class", "")
                if class_names == "Hant":
                    term_data.tags.append("Traditional Chinese")
                elif class_names == "Hans":
                    term_data.tags.append("Simplified Chinese")
                current_data.append(term_data)

        for data in current_data:
            data.raw_tags.extend(raw_tags)
            data.roman = roman
            translate_raw_tags(data)
        term_list.extend(current_data)

    return term_list


def process_obsolete_zh_der_template(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    template_node: TemplateNode,
) -> list[ThesaurusTerm]:
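    """Extract terms from the obsolete "zh-der" family of templates.

    The expanded HTML is scanned list item by list item: a span with a
    "Latn" class carries the romanization shared by the item's terms, any
    other language-tagged span carries a term, and bare "/" separators are
    skipped.
    """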

    # https://zh.wiktionary.org/wiki/Template:Zh-der
    term_list = []
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for list_item_node in expanded_template.find_child_recursively(
        NodeKind.LIST_ITEM
    ):
        current_data = []
        roman = ""
        for span_tag in list_item_node.find_html_recursively("span"):
            if "Latn" in span_tag.attrs.get("class", ""):
                roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") != "":  # coverage: always true in tests
                term_text = clean_node(wxr, None, span_tag)
                if term_text == "/":  # coverage: never true in tests
                    continue
                current_data.append(
                    ThesaurusTerm(
                        entry_word,
                        lang_code,
                        pos,
                        linkage_type,
                        term_text,
                        sense=sense,
                    )
                )
        for data in current_data:
            data.roman = roman
        term_list.extend(current_data)

    return term_list


def process_thesaurus_ja_r_template(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    template_node: TemplateNode,
) -> ThesaurusTerm:
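    """Convert a "ja-r" template into a single ThesaurusTerm.

    Delegates to process_ja_r_template() from the linkage module and
    reuses the word and romanization it extracts.
    """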

    from .linkage import process_ja_r_template

    linkage_data = process_ja_r_template(wxr, template_node, "")
    return ThesaurusTerm(
        entry_word,
        lang_code,
        pos,
        linkage_type,
        linkage_data.word,
        sense=sense,
        roman=linkage_data.roman,
    )


def extract_thesaurus_page(
    wxr: WiktextractContext, page: Page
) -> list[ThesaurusTerm]:
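    """Extract all thesaurus terms from one "Thesaurus:" namespace page.

    The entry word is the page title with the namespace prefix stripped;
    each level-2 heading names a language, which is resolved to a language
    code via name_to_code() before its sections are parsed.
    """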

    entry = page.title[page.title.find(":") + 1 :]
    wxr.wtp.start_page(page.title)
    root = wxr.wtp.parse(page.body)
    data = []
    for level2_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, level2_node.largs)
        lang_code = name_to_code(lang_name, "zh")
        if lang_code == "":  # coverage: never true in tests
            logger.warning(
                f"Unrecognized language: {lang_name} in page Thesaurus:{entry}"
            )
        data.extend(
            parse_section(wxr, entry, lang_code, "", "", "", level2_node)
        )
    return data
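
# Minimal usage sketch (not part of the module). It assumes a configured
# WiktextractContext "wxr" whose Wtp already has dump data loaded, and an
# existing thesaurus page; the page title below is purely illustrative:
#
#     page = wxr.wtp.get_page("Thesaurus:天空")
#     if page is not None and page.body is not None:
#         for term in extract_thesaurus_page(wxr, page):
#             print(term)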