Coverage for src/wiktextract/extractor/zh/gloss.py: 95%

176 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-17 08:19 +0000

1import re 

2 

3from wikitextprocessor import NodeKind, WikiNode 

4from wikitextprocessor.parser import TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from .example import extract_example_list_item 

10from .models import AltForm, Classifier, Linkage, Sense, WordEntry 

11from .tags import translate_raw_tags 

12 

# https://zh.wiktionary.org/wiki/Template:Label
LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"])

# Form-of templates whose names don't end in " of"/" form"/"-form" but
# should still be routed to process_form_of_template().
# https://zh.wiktionary.org/wiki/Category:之形式模板
FORM_OF_TEMPLATES = frozenset(
    [
        "alt case",
        # fix: "alt form" and "altform" were fused into the single bogus
        # entry "alt formaltform", which can never match a template name
        "alt form",
        "altform",
        "alt sp",
        "construed with",
        "honor alt case",
        "missp",
        "obs sp",
        "rare sp",
        "rfform",
        "short for",
        "stand sp",
        "sup sp",
    ]
)
# NOTE: the misspelled name ("TEMPALTES") is kept because other functions
# in this module reference it.
ABBR_TEMPALTES = frozenset(
    [
        "之縮寫",
        "abbreviation of",
        "abbr of",
        "abbrev of",
        "zh-short",
        "zh-abbrev",
        "中文简称",
    ]
)
ZH_ALT_OF_TEMPLATES = frozenset(
    ["zh-altname", "zh-alt-name", "中文別名", "中文别名"]
)

47 

48 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_gloss_data: Sense,
) -> None:
    """Extract senses from a wiki definition list.

    Each list item becomes one ``Sense`` cloned from ``parent_gloss_data``
    (so tags/glosses of a parent gloss carry over to nested glosses).
    Leading templates are converted to tags, classifiers, form-of data,
    etc.; the remaining nodes become the gloss text. Nested ``#`` lists
    recurse as sub-glosses; other nested lists are example lists.
    """
    lang_code = page_data[-1].lang_code
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = []  # nodes that make up the gloss text itself
        raw_tags = []  # tag strings gathered from label templates
        gloss_data = parent_gloss_data.model_copy(deep=True)
        for node in list_item_node.children:
            if isinstance(node, TemplateNode):
                if node.template_name == "rfdef":
                    # "request for definition" placeholder; no content
                    continue
                raw_tag = clean_node(wxr, gloss_data, node)
                if node.template_name.lower() in LABEL_TEMPLATES:
                    # label text like "(a, b 或 c)" splits into separate tags
                    for r_tag in re.split(r",|或", raw_tag.strip("()")):
                        r_tag = r_tag.strip()
                        if r_tag != "":
                            raw_tags.append(r_tag)
                elif raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    # angle-bracketed expansion is also treated as a label
                    raw_tags.append(raw_tag.strip("〈〉"))
                elif (
                    node.template_name
                    in FORM_OF_TEMPLATES | ABBR_TEMPALTES | ZH_ALT_OF_TEMPLATES
                    or node.template_name.endswith((" of", " form", "-form"))
                ) and process_form_of_template(
                    wxr, node, gloss_data, page_data
                ):
                    # True means the template fully handled the gloss;
                    # False falls through so raw_tag stays unused here
                    pass
                elif node.template_name == "zh-mw":
                    process_zh_mw_template(wxr, node, gloss_data)
                elif node.template_name.lower() in ["zh-obsolete", "†", "zh-o"]:
                    if "obsolete" not in gloss_data.tags:
                        gloss_data.tags.append("obsolete")
                elif node.template_name.lower() in ["defdate", "datedef"]:
                    extract_defdate_template(wxr, gloss_data, node)
                else:
                    # unrecognized template: keep it as part of the gloss text
                    gloss_nodes.append(node)
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                # child lists are handled below (nested glosses / examples)
                continue
            else:
                gloss_nodes.append(node)

        if lang_code == "ja":
            # expand templates first so ruby (furigana) can be separated out
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True
            )
            ruby_data, nodes_without_ruby = extract_ruby(
                wxr, expanded_node.children
            )
            gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
        else:
            ruby_data = []
            gloss_text = clean_node(wxr, gloss_data, gloss_nodes)

        gloss_data.raw_tags.extend(raw_tags)
        if len(gloss_text) > 0:
            gloss_data.glosses.append(gloss_text)
            if len(ruby_data) > 0:
                gloss_data.ruby = ruby_data

        translate_raw_tags(gloss_data)
        # glosses may also have been added by a form-of template even when
        # gloss_text is empty
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        if list_item_node.contain_node(NodeKind.LIST):
            for next_list in list_item_node.find_child(NodeKind.LIST):
                if next_list.sarg.endswith("#"):  # nested gloss
                    extract_gloss(wxr, page_data, next_list, gloss_data)
                else:
                    for e_list_item in next_list.find_child(
                        NodeKind.LIST_ITEM
                    ):
                        extract_example_list_item(
                            wxr, gloss_data, e_list_item, page_data[-1]
                        )

125 

126 

def process_form_of_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: Sense,
    page_data: list[WordEntry],
) -> bool:
    # Return `True` if template expands to list or don't want add gloss again
    # in `extract_gloss()`
    # https://en.wiktionary.org/wiki/Category:Form-of_templates
    # https://en.wiktionary.org/wiki/Category:Form-of_templates_by_language
    # "alt ..."/"alternative ..." template names mark alternative forms
    is_alt_of = (
        re.search(r"^alt|alt[\s-]|alternative", t_node.template_name.lower())
        or t_node.template_name.lower() in ZH_ALT_OF_TEMPLATES
    )
    is_abbr = t_node.template_name.lower() in ABBR_TEMPALTES
    if is_alt_of:
        sense.tags.append("alt-of")
    elif is_abbr:
        sense.tags.extend(["alt-of", "abbreviation"])
    else:
        sense.tags.append("form-of")
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # erhua templates produce their own gloss; nothing left to extract here
    if t_node.template_name.endswith("-erhua form of"):
        process_erhua_form_of_template(wxr, expanded_template, sense)
        return True
    elif (
        t_node.template_name.lower()
        in {"zh-short", "zh-abbrev", "中文简称"} | ZH_ALT_OF_TEMPLATES
    ):
        # abbreviation/alt-name templates only add alt_of entries; the
        # caller still extracts the gloss text (hence False)
        extract_zh_abbr_template(wxr, expanded_template, sense)
        return False

    # target word(s) are usually in <i> tags; fall back to the first link
    form_of_words = []
    for i_tag in expanded_template.find_html_recursively("i"):
        form_of_words = process_form_of_template_child(wxr, i_tag)

    if len(form_of_words) == 0:
        for link_node in expanded_template.find_child_recursively(
            NodeKind.LINK
        ):
            form_of_words = process_form_of_template_child(wxr, link_node)
            break
    for form_of_word in form_of_words:
        form_of = AltForm(word=form_of_word)
        if is_alt_of or is_abbr:
            sense.alt_of.append(form_of)
        else:
            sense.form_of.append(form_of)

    # some templates expand to a list of sub-senses: create one Sense per
    # list item, each prefixed with the shared (non-list) gloss text
    if expanded_template.contain_node(NodeKind.LIST):
        shared_gloss = clean_node(
            wxr,
            None,
            list(
                expanded_template.invert_find_child(
                    NodeKind.LIST, include_empty_str=True
                )
            ),
        )
        for list_item_node in expanded_template.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            new_sense = sense.model_copy(deep=True)
            new_sense.glosses.append(shared_gloss)
            new_sense.glosses.append(
                clean_node(wxr, None, list_item_node.children)
            )
            page_data[-1].senses.append(new_sense)
        return True

    return False

200 

201 

def process_form_of_template_child(
    wxr: WiktextractContext, node: WikiNode
) -> list[str]:
    """Split a rendered form-of target into its individual words.

    A template may reference several targets joined by "和" ("and");
    each non-empty piece is stripped and returned as its own word.
    """
    rendered = clean_node(wxr, None, node)
    stripped_parts = (part.strip() for part in rendered.split("和"))
    return [part for part in stripped_parts if part != ""]

212 

213 

def process_erhua_form_of_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
) -> None:
    # https://zh.wiktionary.org/wiki/Template:Cmn-erhua_form_of
    # The template renders the base word twice: the first lang="zh" span
    # is the traditional script form, any later one the simplified form.
    for index, span_node in enumerate(
        expanded_node.find_html("span", attr_name="lang", attr_value="zh")
    ):
        span_text = clean_node(wxr, None, span_node)
        form = AltForm(word=span_text)
        if index == 0:
            form.tags.append("Traditional-Chinese")
        else:
            form.tags.append("Simplified-Chinese")
        if len(form.word) > 0:
            sense.form_of.append(form)
    gloss_text = clean_node(wxr, sense, expanded_node)
    # drop a leading "(官話)" (Mandarin) marker; it becomes a tag instead
    if gloss_text.startswith("(官話)"):
        gloss_text = gloss_text.removeprefix("(官話)").strip()
        sense.tags.append("Mandarin")
    sense.tags.append("Erhua")
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)

236 

237 

def process_zh_mw_template(
    wxr: WiktextractContext, node: TemplateNode, sense: Sense
) -> None:
    # Chinese inline classifier (measure word) template
    # https://zh.wiktionary.org/wiki/Template:分類詞
    # The rendered output alternates word spans with "/" separator spans;
    # a "/" joins traditional/simplified variants of the SAME classifier.
    # Classifiers are therefore buffered and only flushed when a new word
    # arrives that was NOT preceded by "/".
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    classifiers = []  # variant buffer for the classifier being collected
    last_word = ""  # previous span text; "/" marks a variant pair
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                # new classifier (not a "/"-joined variant): flush buffer
                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            # a usage note in the span tooltip applies to the buffered
            # classifiers it follows
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)

272 

273 

def extract_zh_abbr_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
):
    """Collect alt-of forms from an expanded zh-short/zh-abbrev template.

    The last <i> tag supplies the romanization shared by all forms; each
    <span> yields one ``AltForm``, tagged by script when the span's CSS
    class identifies it, skipping empty and "/" separator spans.
    """
    # https://zh.wiktionary.org/wiki/Template:Zh-short
    romanization = ""
    for italic_tag in expanded_node.find_html("i"):
        romanization = clean_node(wxr, None, italic_tag)
    script_tags = {"Hant": "Traditional-Chinese", "Hans": "Simplified-Chinese"}
    for span_node in expanded_node.find_html("span"):
        form = AltForm(
            word=clean_node(wxr, None, span_node), roman=romanization
        )
        css_class = span_node.attrs.get("class", "")
        if css_class in script_tags:
            form.tags.append(script_tags[css_class])
        if form.word not in ("", "/"):
            sense.alt_of.append(form)

290 

291 

def extract_defdate_template(
    wxr: WiktextractContext, sense: Sense | Linkage, t_node: TemplateNode
):
    """Record a first-attestation date (defdate/datedef template).

    The expanded template text, minus surrounding parentheses, becomes
    the attestation date; any <ref> tags inside it are attached as
    references. Nothing is recorded when the date text is empty.
    """
    # local import avoids widening the module-level model imports
    from .models import AttestationData, ReferenceData

    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    date_text = clean_node(wxr, None, expanded).strip("() ")
    if date_text == "":
        return
    attestation = AttestationData(date=date_text)
    for ref_tag in expanded.find_html_recursively("ref"):
        ref_body = clean_node(wxr, None, ref_tag.children)
        if ref_body != "":
            attestation.references.append(
                ReferenceData(
                    text=ref_body, refn=ref_tag.attrs.get("name", "")
                )
            )
    sense.attestations.append(attestation)