Coverage for src/wiktextract/extractor/zh/gloss.py: 94%

138 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2 

3from wikitextprocessor import NodeKind, WikiNode 

4from wikitextprocessor.parser import TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from .example import extract_example_list_item 

10from .models import AltForm, Classifier, Sense, WordEntry 

11from .tags import translate_raw_tags 

12 

# Template names that wrap a gloss in context labels, e.g. "(方言)".
# https://zh.wiktionary.org/wiki/Template:Label
LABEL_TEMPLATES = frozenset({"label", "lb", "lbl"})

15 

16 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_gloss_data: Sense,
) -> None:
    """Walk a gloss list and append one ``Sense`` per definition line.

    Every list item starts from a deep copy of ``parent_gloss_data`` so
    that nested sub-glosses inherit the tags and gloss texts of their
    ancestors.  Child lists whose ``sarg`` ends with ``#`` are recursed
    into as nested glosses; any other child list is treated as examples.
    Only leaf senses (no nested gloss list) are stored in ``page_data``.
    """
    lang_code = page_data[-1].lang_code
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        sense = parent_gloss_data.model_copy(deep=True)
        plain_nodes = []  # children forming the gloss text itself
        pending_tags = []  # raw tag strings gathered from label templates
        for child in list_item_node.children:
            if isinstance(child, TemplateNode):
                if child.template_name == "rfdef":
                    # "request for definition" placeholder — no content
                    continue
                # Expanding via clean_node also records any categories the
                # template emits into the sense.
                expanded = clean_node(wxr, sense, child)
                if child.template_name in LABEL_TEMPLATES:
                    # "(tag1, tag2 或 tag3)" → individual raw tags
                    for piece in re.split(r",|或", expanded.strip("()")):
                        piece = piece.strip()
                        if piece != "":
                            pending_tags.append(piece)
                elif expanded.startswith("〈") and expanded.endswith("〉"):
                    pending_tags.append(expanded.strip("〈〉"))
                elif (
                    child.template_name in FORM_OF_TEMPLATES
                    or child.template_name.endswith((" of", " form", "-form"))
                ) and process_form_of_template(wxr, child, sense, page_data):
                    pass  # helper consumed the template entirely
                elif child.template_name == "zh-mw":
                    process_zh_mw_template(wxr, child, sense)
                else:
                    plain_nodes.append(child)
            elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                continue  # nested glosses / examples handled below
            else:
                plain_nodes.append(child)

        if lang_code == "ja":
            # Japanese glosses may carry ruby annotations: expand the
            # wikitext first, then split ruby out of the displayed text.
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(plain_nodes), expand_all=True
            )
            ruby_data, no_ruby_nodes = extract_ruby(
                wxr, expanded_node.children
            )
            gloss_text = clean_node(wxr, sense, no_ruby_nodes)
        else:
            ruby_data = []
            gloss_text = clean_node(wxr, sense, plain_nodes)

        sense.raw_tags.extend(pending_tags)
        if len(gloss_text) > 0:
            sense.glosses.append(gloss_text)
        if len(ruby_data) > 0:
            sense.ruby = ruby_data

        has_nested_gloss = False
        if list_item_node.contain_node(NodeKind.LIST):
            for child_list in list_item_node.find_child(NodeKind.LIST):
                if child_list.sarg.endswith("#"):  # nested gloss list
                    has_nested_gloss = True
                    extract_gloss(wxr, page_data, child_list, sense)
                else:  # example list
                    for example_item in child_list.find_child(
                        NodeKind.LIST_ITEM
                    ):
                        extract_example_list_item(
                            wxr, sense, example_item, page_data[-1]
                        )

        # Store leaf senses only; parents of nested glosses are implicit
        # in their children's copied data.
        if not has_nested_gloss and len(sense.glosses) > 0:
            translate_raw_tags(sense)
            page_data[-1].senses.append(sense)

89 

90 

def process_form_of_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: Sense,
    page_data: list[WordEntry],
) -> bool:
    """Handle a "form of" template found inside a gloss line.

    Tags the sense as ``alt-of`` or ``form-of`` and records the target
    word(s).  Returns ``True`` when the template was fully consumed (an
    erhua template, or one that expands to a list) so that
    ``extract_gloss()`` must not add its text as a gloss again.

    https://en.wiktionary.org/wiki/Category:Form-of_templates
    https://en.wiktionary.org/wiki/Category:Form-of_templates_by_language
    """
    lower_name = template_node.template_name.lower()
    is_alt_of = re.search(r"^alt|alt[\s-]|alternative", lower_name)
    sense.tags.append("alt-of" if is_alt_of else "form-of")
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    if template_node.template_name.endswith("-erhua form of"):
        process_erhua_form_of_template(wxr, expanded_template, sense)
        return True

    # The target word usually sits in an <i> tag; fall back to the first
    # wiki link when no <i> tag is present.
    form_of_words = []
    for italic_tag in expanded_template.find_html_recursively("i"):
        form_of_words = process_form_of_template_child(wxr, italic_tag)

    if len(form_of_words) == 0:
        for link_node in expanded_template.find_child_recursively(
            NodeKind.LINK
        ):
            form_of_words = process_form_of_template_child(wxr, link_node)
            break

    target_list = sense.alt_of if is_alt_of else sense.form_of
    for word in form_of_words:
        target_list.append(AltForm(word=word))

    if expanded_template.contain_node(NodeKind.LIST):
        # Template expanded to a list: emit one sense per list item, each
        # prefixed with the gloss text appearing outside the list.
        shared_gloss = clean_node(
            wxr, None, list(expanded_template.invert_find_child(NodeKind.LIST))
        )
        for item in expanded_template.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            new_sense = sense.model_copy(deep=True)
            new_sense.glosses.append(shared_gloss)
            new_sense.glosses.append(clean_node(wxr, None, item.children))
            page_data[-1].senses.append(new_sense)
        return True

    return False

145 

146 

def process_form_of_template_child(
    wxr: WiktextractContext, node: WikiNode
) -> list[str]:
    """Split a form-of target node on "和" ("and") into word strings."""
    text = clean_node(wxr, None, node)
    return [part.strip() for part in text.split("和") if part.strip() != ""]

157 

158 

def process_erhua_form_of_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
) -> None:
    """Process an expanded erhua form-of template.

    https://zh.wiktionary.org/wiki/Template:Cmn-erhua_form_of
    The first ``lang="zh"`` span holds the traditional form, the second
    the simplified form; the remaining text becomes the gloss.
    """
    for index, span_node in enumerate(
        expanded_node.find_html("span", attr_name="lang", attr_value="zh")
    ):
        word = clean_node(wxr, None, span_node)
        if len(word) > 0:
            form = AltForm(word=word)
            form.tags.append(
                "Traditional Chinese" if index == 0 else "Simplified Chinese"
            )
            sense.form_of.append(form)
    gloss_text = clean_node(wxr, sense, expanded_node)
    if gloss_text.startswith("(官話)"):
        # Leading dialect marker becomes a tag rather than gloss text.
        gloss_text = gloss_text.removeprefix("(官話)").strip()
        sense.tags.append("Mandarin")
    sense.tags.append("Erhua")
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)

181 

182 

# https://zh.wiktionary.org/wiki/Category:之形式模板
# Form-of template names that don't match the generic
# " of" / " form" / "-form" suffix test in extract_gloss().
# Each template alias must be its own entry: membership is tested against
# a single template name, so comma-joined strings such as
# "alt case, altcaps" could never match anything.
FORM_OF_TEMPLATES = {
    "alt case",
    "altcaps",
    "alt form",
    "altform",
    "alt sp",
    "construed with",
    "honor alt case",
    "missp",
    "obs sp",
    "rare sp",
    "rfform",
    "short for",
    "stand sp",
    "sup sp",
}

198 

199 

def process_zh_mw_template(
    wxr: WiktextractContext, node: TemplateNode, sense: Sense
) -> None:
    """Extract Chinese measure words from an inline classifier template.

    https://zh.wiktionary.org/wiki/Template:分類詞
    Words separated by "/" are traditional/simplified variants of the
    same classifier and share the dialect tag that follows them; a word
    not preceded by "/" starts a new group, flushing the previous one.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    pending = []  # classifiers awaiting their trailing dialect tag
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ("Hani", "Hant", "Hans"):
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified Chinese")

                # New group begins unless this word follows a "/".
                if len(pending) > 0 and last_word != "/":
                    sense.classifiers.extend(pending)
                    pending.clear()
                pending.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            # Dialect abbreviation span: its tooltip carries the raw tag,
            # which applies to every classifier in the current group.
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in pending:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(pending)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)