Coverage for src/wiktextract/extractor/zh/gloss.py: 95%

135 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2 

3from wikitextprocessor import NodeKind, WikiNode 

4from wikitextprocessor.parser import TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from .example import extract_example_list_item 

10from .models import AltForm, Classifier, Sense, WordEntry 

11from .tags import translate_raw_tags 

12 

# https://zh.wiktionary.org/wiki/Template:Label
# Names (and aliases) of the label template, whose output is split into
# raw tags instead of being kept as gloss text.
LABEL_TEMPLATES = frozenset(("label", "lbl", "lb"))

15 

16 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_gloss_data: Sense,
) -> None:
    """Extract senses from a wikitext definition list.

    Each list item of ``list_node`` becomes one ``Sense`` (built on a deep
    copy of ``parent_gloss_data``) appended to ``page_data[-1].senses``.
    Nested "#"-lists are recursed into as sub-glosses; other nested lists
    are treated as example lists.
    """
    lang_code = page_data[-1].lang_code
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = []
        raw_tags = []
        gloss_data = parent_gloss_data.model_copy(deep=True)
        for node in list_item_node.children:
            if isinstance(node, TemplateNode):
                # "rfdef" (presumably "request for definition") marks a
                # missing gloss — skip it; TODO confirm against template doc.
                if node.template_name == "rfdef":
                    continue
                # clean_node also records categories etc. into gloss_data.
                raw_tag = clean_node(wxr, gloss_data, node)
                if node.template_name in LABEL_TEMPLATES:
                    # Label output renders as "(tag1,tag2)"; split into
                    # individual raw tags.
                    raw_tags.extend(raw_tag.strip("()").split(","))
                elif raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    # Angle-bracketed template output is also a raw tag.
                    raw_tags.append(raw_tag.strip("〈〉"))
                elif (
                    node.template_name in FORM_OF_TEMPLATES
                    or node.template_name.endswith((" of", " form", "-form"))
                ) and process_form_of_template(
                    wxr, node, gloss_data, page_data
                ):
                    # Helper returned True: it already recorded everything
                    # needed, so don't keep the node as gloss text.
                    pass
                elif node.template_name == "zh-mw":
                    # Chinese inline classifier (measure word) template.
                    process_zh_mw_template(wxr, node, gloss_data)
                else:
                    gloss_nodes.append(node)
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                # Child lists (sub-glosses / examples) are handled below.
                continue
            else:
                gloss_nodes.append(node)

        if lang_code == "ja":
            # Japanese glosses may contain ruby (furigana); expand templates
            # and strip the ruby out of the gloss text.
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True
            )
            ruby_data, nodes_without_ruby = extract_ruby(
                wxr, expanded_node.children
            )
            gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
        else:
            ruby_data = []
            gloss_text = clean_node(wxr, gloss_data, gloss_nodes)

        gloss_data.raw_tags.extend(raw_tags)
        if len(gloss_text) > 0:
            gloss_data.glosses.append(gloss_text)
        if len(ruby_data) > 0:
            gloss_data.ruby = ruby_data

        has_nested_gloss = False
        if list_item_node.contain_node(NodeKind.LIST):
            for next_list in list_item_node.find_child(NodeKind.LIST):
                if next_list.sarg.endswith("#"):  # nested gloss
                    has_nested_gloss = True
                    extract_gloss(wxr, page_data, next_list, gloss_data)
                else:
                    # Non-"#" child lists hold usage examples.
                    for e_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                        extract_example_list_item(
                            wxr, gloss_data, e_list_item, page_data
                        )

        # Only leaf glosses are appended; a parent of nested glosses is
        # recorded through its (extended) children instead.
        if not has_nested_gloss and len(gloss_data.glosses) > 0:
            translate_raw_tags(gloss_data)
            page_data[-1].senses.append(gloss_data)

86 

87 

def process_form_of_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: Sense,
    page_data: list[WordEntry],
) -> bool:
    """Handle a form-of / alt-of template found on a gloss line.

    Records the target word(s) in ``sense.alt_of`` or ``sense.form_of`` and
    tags the sense "alt-of" or "form-of" accordingly.

    Return ``True`` if the template expands to a list (each list item is
    already added as its own sense here) or is otherwise fully handled, so
    ``extract_gloss()`` should not add the gloss again.
    """
    # https://en.wiktionary.org/wiki/Category:Form-of_templates
    # https://en.wiktionary.org/wiki/Category:Form-of_templates_by_language
    # Template names containing "alt"/"alternative" describe alternative
    # forms; everything else is treated as an inflected form.
    is_alt_of = re.search(
        r"^alt|alt[\s-]|alternative", template_node.template_name.lower()
    )
    sense.tags.append("alt-of" if is_alt_of else "form-of")
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    # Erhua templates have a dedicated layout; handled separately.
    if template_node.template_name.endswith("-erhua form of"):
        process_erhua_form_of_template(wxr, expanded_template, sense)
        return True

    # Prefer the target words rendered inside <i> tags; note that each <i>
    # tag found overwrites the previous result (last one wins).
    form_of_words = []
    for i_tag in expanded_template.find_html_recursively("i"):
        form_of_words = process_form_of_template_child(wxr, i_tag)

    # Fallback: use the first wiki link in the expansion.
    if len(form_of_words) == 0:
        for link_node in expanded_template.find_child_recursively(
            NodeKind.LINK
        ):
            form_of_words = process_form_of_template_child(wxr, link_node)
            break
    for form_of_word in form_of_words:
        form_of = AltForm(word=form_of_word)
        if is_alt_of:
            sense.alt_of.append(form_of)
        else:
            sense.form_of.append(form_of)

    # Some templates expand to a list of sub-definitions: emit one sense per
    # list item, each prefixed with the gloss text shared outside the list.
    if expanded_template.contain_node(NodeKind.LIST):
        shared_gloss = clean_node(
            wxr, None, list(expanded_template.invert_find_child(NodeKind.LIST))
        )
        for list_item_node in expanded_template.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            new_sense = sense.model_copy(deep=True)
            new_sense.glosses.append(shared_gloss)
            new_sense.glosses.append(
                clean_node(wxr, None, list_item_node.children)
            )
            page_data[-1].senses.append(new_sense)
        return True

    return False

142 

143 

def process_form_of_template_child(
    wxr: WiktextractContext, node: WikiNode
) -> list[str]:
    """Split the cleaned text of ``node`` on "和" ("and") into target words."""
    text = clean_node(wxr, None, node)
    stripped = (part.strip() for part in text.split("和"))
    return [word for word in stripped if word != ""]

154 

155 

def process_erhua_form_of_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
) -> None:
    """Extract base forms and gloss text from an expanded erhua template.

    https://zh.wiktionary.org/wiki/Template:Cmn-erhua_form_of
    """
    for idx, zh_span in enumerate(
        expanded_node.find_html("span", attr_name="lang", attr_value="zh")
    ):
        alt_form = AltForm(word=clean_node(wxr, None, zh_span))
        # The template renders the traditional-script form first, then the
        # simplified one.
        alt_form.tags.append(
            "Traditional Chinese" if idx == 0 else "Simplified Chinese"
        )
        if alt_form.word != "":
            sense.form_of.append(alt_form)
    gloss_text = clean_node(wxr, sense, expanded_node)
    if gloss_text.startswith("(官話)"):
        # Leading "(官話)" marks Mandarin; move it into the tags.
        gloss_text = gloss_text.removeprefix("(官話)").strip()
        sense.tags.append("Mandarin")
    sense.tags.append("Erhua")
    if gloss_text != "":
        sense.glosses.append(gloss_text)

178 

179 

# https://zh.wiktionary.org/wiki/Category:之形式模板
# English Wiktionary form-of template names recognized in extract_gloss()
# in addition to the generic " of"/" form"/"-form" suffix check.
# FIX: alias pairs were previously fused into single strings taken verbatim
# from the category listing ("alt case, altcaps", "alt form, altform"),
# which could never equal a real template name, so those templates were
# never matched by the `template_name in FORM_OF_TEMPLATES` test.  Each
# alias is now a separate entry.
FORM_OF_TEMPLATES = {
    "alt case",
    "altcaps",
    "alt form",
    "altform",
    "alt sp",
    "construed with",
    "honor alt case",
    "missp",
    "obs sp",
    "rare sp",
    "rfform",
    "short for",
    "stand sp",
    "sup sp",
}

195 

196 

def process_zh_mw_template(
    wxr: WiktextractContext, node: TemplateNode, sense: Sense
) -> None:
    """Extract classifier (measure word) data from a "zh-mw" template.

    Chinese inline classifier template:
    https://zh.wiktionary.org/wiki/Template:分類詞

    In the expanded HTML, <span> tags with class "Hani"/"Hant"/"Hans" hold
    classifier words; a "/" span separates the traditional and simplified
    variants of the same classifier, which are kept together as one group.
    A span with a "title" attribute carries a raw tag (tooltip text) that
    applies to the classifier group collected so far.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    classifiers = []  # current group of classifier variants
    last_word = ""  # previous Hani/Hant/Hans span text; "/" joins variants
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                # "Hani" (script-neutral) gets no script tag.
                if span_class == "Hant":
                    classifier.tags.append("Traditional Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified Chinese")

                # A word not preceded by "/" starts a new group: flush the
                # previous group to the sense first.
                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            # Tooltip text is a raw tag for the current classifier group.
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    # Flush the final group, then normalize all raw tags.
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)
230 translate_raw_tags(classifier)