Coverage for src/wiktextract/extractor/zh/gloss.py: 95%

176 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import re 

2 

3from wikitextprocessor import NodeKind, WikiNode 

4from wikitextprocessor.parser import TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from .example import extract_example_list_item 

10from .models import AltForm, Classifier, Linkage, Sense, WordEntry 

11from .tags import translate_raw_tags 

12 

# https://zh.wiktionary.org/wiki/Template:Label
LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"])

# Form-of template names that don't match the generic
# " of" / " form" / "-form" suffix checks in `extract_gloss()`.
# https://zh.wiktionary.org/wiki/Category:之形式模板
FORM_OF_TEMPLATES = frozenset(
    [
        "alt case",
        # "alt form" and "altform" are two distinct template aliases;
        # "altform" in particular has no " form"/"-form" suffix and must
        # be listed explicitly to be recognized.
        "alt form",
        "altform",
        "alt sp",
        "construed with",
        "honor alt case",
        "missp",
        "obs sp",
        "rare sp",
        "rfform",
        "short for",
        "stand sp",
        "sup sp",
    ]
)
# Abbreviation templates; matching senses get "alt-of" + "abbreviation"
# tags.  NOTE(review): the name keeps the historical misspelling
# ("TEMPALTES") because other code in this module references it; renaming
# would break those call sites.
ABBR_TEMPALTES = frozenset(
    [
        "之縮寫",
        "abbreviation of",
        "abbr of",
        "abbrev of",
        "zh-short",
        "zh-abbrev",
        "中文简称",
    ]
)
# "Alternative name" templates specific to the Chinese Wiktionary.
ZH_ALT_OF_TEMPLATES = frozenset(
    ["zh-altname", "zh-alt-name", "中文別名", "中文别名"]
)

47 

48 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_gloss_data: Sense,
) -> None:
    """Extract senses from a wikitext definition list.

    Each list item becomes a new ``Sense`` seeded from a deep copy of
    ``parent_gloss_data`` (so nested glosses inherit parent tags/glosses)
    and is appended to ``page_data[-1].senses``.  Child lists are handled
    recursively: ``#``-lists as nested glosses, others as examples.
    """
    lang_code = page_data[-1].lang_code
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = []  # nodes that make up the gloss text itself
        raw_tags = []  # raw tag strings collected from label templates
        gloss_data = parent_gloss_data.model_copy(deep=True)
        for node in list_item_node.children:
            if isinstance(node, TemplateNode):
                if node.template_name == "rfdef":
                    # "request for definition" placeholder, not a gloss
                    continue
                raw_tag = clean_node(wxr, gloss_data, node)
                if node.template_name.lower() in LABEL_TEMPLATES:
                    # label output looks like "(tag1,tag2或tag3)";
                    # split on "," and "或" ("or")
                    for r_tag in re.split(r",|或", raw_tag.strip("()")):
                        r_tag = r_tag.strip()
                        if r_tag != "":
                            raw_tags.append(r_tag)
                elif raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    # angle-bracketed text is also treated as a raw tag
                    raw_tags.append(raw_tag.strip("〈〉"))
                elif (
                    node.template_name
                    in FORM_OF_TEMPLATES | ABBR_TEMPALTES | ZH_ALT_OF_TEMPLATES
                    or node.template_name.endswith((" of", " form", "-form"))
                ) and process_form_of_template(
                    wxr, node, gloss_data, page_data
                ):
                    # helper fully consumed the form-of template (it may
                    # have appended senses itself); nothing more to do
                    pass
                elif node.template_name == "zh-mw":
                    # inline Chinese classifier (measure word) template
                    process_zh_mw_template(wxr, node, gloss_data)
                elif node.template_name.lower() in ["zh-obsolete", "†", "zh-o"]:
                    if "obsolete" not in gloss_data.tags:
                        gloss_data.tags.append("obsolete")
                elif node.template_name.lower() in ["defdate", "datedef"]:
                    extract_defdate_template(wxr, gloss_data, node)
                else:
                    # unrecognized template: keep it as gloss content
                    gloss_nodes.append(node)
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                # child lists are processed after the gloss text, below
                continue
            else:
                gloss_nodes.append(node)

        if lang_code == "ja":
            # Japanese glosses may carry ruby (furigana); expand the
            # wikitext and strip the ruby out of the plain gloss text
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True
            )
            ruby_data, nodes_without_ruby = extract_ruby(
                wxr, expanded_node.children
            )
            gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
        else:
            ruby_data = []
            gloss_text = clean_node(wxr, gloss_data, gloss_nodes)

        gloss_data.raw_tags.extend(raw_tags)
        if len(gloss_text) > 0:
            gloss_data.glosses.append(gloss_text)
            if len(ruby_data) > 0:
                gloss_data.ruby = ruby_data

        translate_raw_tags(gloss_data)
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        if list_item_node.contain_node(NodeKind.LIST):
            for next_list in list_item_node.find_child(NodeKind.LIST):
                if next_list.sarg.endswith("#"):  # nested gloss
                    extract_gloss(wxr, page_data, next_list, gloss_data)
                else:
                    # non-"#" child lists hold example sentences
                    for e_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                        extract_example_list_item(
                            wxr, gloss_data, e_list_item, page_data[-1]
                        )

125 

126 

def process_form_of_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: Sense,
    page_data: list[WordEntry],
) -> bool:
    """Handle a form-of / alt-of / abbreviation template on a gloss line.

    Tags the sense ("alt-of", "abbreviation" or "form-of"), fills
    ``sense.alt_of`` / ``sense.form_of`` with the referenced words, and —
    when the template expands to a list — appends one new sense per list
    item to ``page_data[-1]``.

    Return `True` if template expands to list or don't want add gloss again
    in `extract_gloss()`
    https://en.wiktionary.org/wiki/Category:Form-of_templates
    https://en.wiktionary.org/wiki/Category:Form-of_templates_by_language
    """
    # "alt..." template names and the zh alt-name templates mark
    # alternative forms rather than inflected forms
    is_alt_of = (
        re.search(r"^alt|alt[\s-]|alternative", t_node.template_name.lower())
        or t_node.template_name.lower() in ZH_ALT_OF_TEMPLATES
    )
    is_abbr = t_node.template_name.lower() in ABBR_TEMPALTES
    if is_alt_of:
        sense.tags.append("alt-of")
    elif is_abbr:
        sense.tags.extend(["alt-of", "abbreviation"])
    else:
        sense.tags.append("form-of")
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    if t_node.template_name.endswith("-erhua form of"):
        # erhua templates carry their own gloss; caller must not add one
        process_erhua_form_of_template(wxr, expanded_template, sense)
        return True
    elif (
        t_node.template_name.lower()
        in {"zh-short", "zh-abbrev", "中文简称"} | ZH_ALT_OF_TEMPLATES
    ):
        # zh abbreviation/alt-name templates: words extracted here, but
        # the gloss text is still built by the caller (return False)
        extract_zh_abbr_template(wxr, expanded_template, sense)
        return False

    # Prefer words in <i> tags; fall back to the first wiki link.
    form_of_words = []
    for i_tag in expanded_template.find_html_recursively("i"):
        form_of_words = process_form_of_template_child(wxr, i_tag)

    if len(form_of_words) == 0:
        for link_node in expanded_template.find_child_recursively(
            NodeKind.LINK
        ):
            form_of_words = process_form_of_template_child(wxr, link_node)
            break
    for form_of_word in form_of_words:
        form_of = AltForm(word=form_of_word)
        if is_alt_of or is_abbr:
            sense.alt_of.append(form_of)
        else:
            sense.form_of.append(form_of)

    if expanded_template.contain_node(NodeKind.LIST):
        # Template expands to a gloss list: the text before the list is
        # shared by every item; each item becomes its own sense.
        shared_gloss = clean_node(
            wxr, None, list(expanded_template.invert_find_child(NodeKind.LIST))
        )
        for list_item_node in expanded_template.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            new_sense = sense.model_copy(deep=True)
            new_sense.glosses.append(shared_gloss)
            new_sense.glosses.append(
                clean_node(wxr, None, list_item_node.children)
            )
            page_data[-1].senses.append(new_sense)
        return True

    return False

194 

195 

def process_form_of_template_child(
    wxr: WiktextractContext, node: WikiNode
) -> list[str]:
    """Clean *node* to text and split it on "和" ("and") into words.

    Empty pieces (after stripping whitespace) are dropped.
    """
    text = clean_node(wxr, None, node)
    pieces = (part.strip() for part in text.split("和"))
    return [word for word in pieces if word != ""]

206 

207 

def process_erhua_form_of_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
) -> None:
    """Extract an expanded erhua form-of template into *sense*.

    The first zh span is the traditional form, any later one the
    simplified form; the cleaned template text becomes the gloss.
    https://zh.wiktionary.org/wiki/Template:Cmn-erhua_form_of
    """
    zh_spans = expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    )
    for idx, span in enumerate(zh_spans):
        form = AltForm(word=clean_node(wxr, None, span))
        form.tags.append(
            "Traditional-Chinese" if idx == 0 else "Simplified-Chinese"
        )
        if len(form.word) > 0:
            sense.form_of.append(form)
    gloss_text = clean_node(wxr, sense, expanded_node)
    if gloss_text.startswith("(官話)"):
        # drop the leading "(Mandarin)" marker and record it as a tag
        gloss_text = gloss_text.removeprefix("(官話)").strip()
        sense.tags.append("Mandarin")
    sense.tags.append("Erhua")
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)

230 

231 

def process_zh_mw_template(
    wxr: WiktextractContext, node: TemplateNode, sense: Sense
) -> None:
    """Extract Chinese classifiers (measure words) from a `zh-mw` template.

    Chinese inline classifier template
    https://zh.wiktionary.org/wiki/Template:分類詞

    Words separated by "/" are traditional/simplified variants of the
    same classifier, so they are buffered together and flushed as a
    group; a "title" span applies its raw tag to the buffered group.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    classifiers = []  # buffer of variants not yet attached to the sense
    last_word = ""  # previous span's text; "/" means "same classifier"
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                # a new word not preceded by "/" starts a new classifier:
                # flush the previous buffered group first
                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            # tooltip text (e.g. dialect names) applies to the current
            # buffered classifier group
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)

266 

267 

def extract_zh_abbr_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
):
    """Collect alt-of words from an expanded zh abbreviation template.

    The <i> tag (if any) supplies the romanization shared by all spans;
    each span becomes an ``AltForm``, skipping empty and "/" separators.
    https://zh.wiktionary.org/wiki/Template:Zh-short
    """
    roman = ""
    for italic_tag in expanded_node.find_html("i"):
        roman = clean_node(wxr, None, italic_tag)
    for span_tag in expanded_node.find_html("span"):
        word = clean_node(wxr, None, span_tag)
        alt_form = AltForm(word=word, roman=roman)
        css_class = span_tag.attrs.get("class", "")
        if css_class == "Hant":
            alt_form.tags.append("Traditional-Chinese")
        elif css_class == "Hans":
            alt_form.tags.append("Simplified-Chinese")
        if alt_form.word not in ["", "/"]:
            sense.alt_of.append(alt_form)

284 

285 

def extract_defdate_template(
    wxr: WiktextractContext, sense: Sense | Linkage, t_node: TemplateNode
):
    """Extract an attestation date (and its <ref> citations) from a
    `defdate`/`datedef` template into ``sense.attestations``."""
    from .models import AttestationData, ReferenceData

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    date = clean_node(wxr, None, expanded_node).strip("() ")
    if date == "":
        return
    attestation = AttestationData(date=date)
    for ref_tag in expanded_node.find_html_recursively("ref"):
        ref_text = clean_node(wxr, None, ref_tag.children)
        if ref_text != "":
            attestation.references.append(
                ReferenceData(
                    text=ref_text, refn=ref_tag.attrs.get("name", "")
                )
            )
    sense.attestations.append(attestation)