Coverage for src/wiktextract/extractor/zh/gloss.py: 95%

133 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import re 

2 

3from wikitextprocessor import NodeKind, WikiNode 

4from wikitextprocessor.parser import TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from .example import extract_example_list_item 

10from .models import AltForm, Classifier, Sense, WordEntry 

11from .tags import translate_raw_tags 

12 

# https://zh.wiktionary.org/wiki/Template:Label
# Template names whose expansion "(tag1,tag2)" is split into raw tags
# in extract_gloss().
LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"])

15 

16 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_gloss_data: Sense,
) -> None:
    """Extract gloss senses from a wikitext list node into ``page_data``.

    Each list item becomes a candidate ``Sense`` seeded from a deep copy
    of ``parent_gloss_data``, so nested gloss lists inherit the tags and
    glosses of their parent item. Only leaf items (those without a nested
    "#" sublist) are appended to ``page_data[-1].senses``.
    """
    lang_code = page_data[-1].lang_code
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = []
        raw_tags = []
        # Deep copy so mutations for this item don't leak into siblings.
        gloss_data = parent_gloss_data.model_copy(deep=True)
        for node in list_item_node.children:
            if isinstance(node, TemplateNode):
                # clean_node is called with gloss_data for every template,
                # so category side effects are recorded even when the
                # expanded text is discarded or re-cleaned below.
                raw_tag = clean_node(wxr, gloss_data, node)
                if node.template_name in LABEL_TEMPLATES:
                    # "{{lb|...}}" expands to "(tag1,tag2)".
                    raw_tags.extend(raw_tag.strip("()").split(","))
                elif raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    # Some label-like templates expand to 〈...〉 brackets.
                    raw_tags.append(raw_tag.strip("〈〉"))
                elif (
                    node.template_name in FORM_OF_TEMPLATES
                    or node.template_name.endswith((" of", " form", "-form"))
                ) and process_form_of_template(
                    wxr, node, gloss_data, page_data
                ):
                    # Form-of template was fully handled (senses already
                    # added); don't keep it as part of the gloss text.
                    pass
                elif node.template_name == "zh-mw":
                    # Inline Chinese classifier template, stored separately.
                    process_zh_mw_template(wxr, node, gloss_data)
                else:
                    gloss_nodes.append(node)
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                # Child lists (nested glosses / examples) handled below.
                continue
            else:
                gloss_nodes.append(node)

        if lang_code == "ja":
            # Japanese glosses may contain ruby (furigana): expand the
            # templates first, then strip the ruby out of the gloss text.
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True
            )
            ruby_data, nodes_without_ruby = extract_ruby(
                wxr, expanded_node.children
            )
            gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
        else:
            ruby_data = []
            gloss_text = clean_node(wxr, gloss_data, gloss_nodes)

        gloss_data.raw_tags.extend(raw_tags)
        if len(gloss_text) > 0:
            gloss_data.glosses.append(gloss_text)
        if len(ruby_data) > 0:
            gloss_data.ruby = ruby_data

        has_nested_gloss = False
        if list_item_node.contain_node(NodeKind.LIST):
            for next_list in list_item_node.find_child(NodeKind.LIST):
                if next_list.sarg.endswith("#"):  # nested gloss
                    has_nested_gloss = True
                    extract_gloss(wxr, page_data, next_list, gloss_data)
                else:
                    # Non-"#" sublists hold example sentences.
                    for e_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                        extract_example_list_item(
                            wxr, gloss_data, e_list_item, page_data
                        )

        # Parents of nested glosses are represented only through the
        # copies passed down recursively, never appended themselves.
        if not has_nested_gloss and len(gloss_data.glosses) > 0:
            translate_raw_tags(gloss_data)
            page_data[-1].senses.append(gloss_data)

84 

85 

def process_form_of_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: Sense,
    page_data: list[WordEntry],
) -> bool:
    # Return `True` if the template expands to a list, or if the gloss text
    # should not be added again in `extract_gloss()`.
    # https://en.wiktionary.org/wiki/Category:Form-of_templates
    # https://en.wiktionary.org/wiki/Category:Form-of_templates_by_language
    #
    # A template whose name starts with "alt"/"alternative" (or contains
    # "alt " / "alt-") marks an alternative form; everything else is a
    # plain form-of.
    is_alt_of = re.search(
        r"^alt|alt[\s-]|alternative", template_node.template_name.lower()
    )
    sense.tags.append("alt-of" if is_alt_of else "form-of")
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    if template_node.template_name.endswith("-erhua form of"):
        process_erhua_form_of_template(wxr, expanded_template, sense)
        return True

    form_of_words = []
    # NOTE(review): each <i> tag overwrites form_of_words, so only the words
    # from the last <i> in the expansion are kept — confirm intended.
    for i_tag in expanded_template.find_html_recursively("i"):
        form_of_words = process_form_of_template_child(wxr, i_tag)

    if len(form_of_words) == 0:
        # Fall back to the first wiki link in the expansion.
        for link_node in expanded_template.find_child_recursively(
            NodeKind.LINK
        ):
            form_of_words = process_form_of_template_child(wxr, link_node)
            break
    for form_of_word in form_of_words:
        form_of = AltForm(word=form_of_word)
        if is_alt_of:
            sense.alt_of.append(form_of)
        else:
            sense.form_of.append(form_of)

    if expanded_template.contain_node(NodeKind.LIST):
        # The template expanded to a list of senses; the text outside the
        # list is a gloss shared by every list item.
        shared_gloss = clean_node(
            wxr, None, list(expanded_template.invert_find_child(NodeKind.LIST))
        )
        for list_item_node in expanded_template.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            new_sense = sense.model_copy(deep=True)
            new_sense.glosses.append(shared_gloss)
            new_sense.glosses.append(
                clean_node(wxr, None, list_item_node.children)
            )
            page_data[-1].senses.append(new_sense)
        return True

    return False

140 

141 

def process_form_of_template_child(
    wxr: WiktextractContext, node: WikiNode
) -> list[str]:
    """Split the cleaned text of *node* on "和" ("and") into target words.

    Fragments that are empty after stripping whitespace are dropped.
    """
    text = clean_node(wxr, None, node)
    return [
        word
        for word in (fragment.strip() for fragment in text.split("和"))
        if word != ""
    ]

152 

153 

def process_erhua_form_of_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
) -> None:
    """Handle the "-erhua form of" template expansion for a sense.

    https://zh.wiktionary.org/wiki/Template:Cmn-erhua_form_of
    """
    # The first zh <span> holds the traditional form, any later one the
    # simplified form.
    for index, span_node in enumerate(
        expanded_node.find_html("span", attr_name="lang", attr_value="zh")
    ):
        span_text = clean_node(wxr, None, span_node)
        form = AltForm(word=span_text)
        if index == 0:
            form.tags.append("Traditional Chinese")
        else:
            form.tags.append("Simplified Chinese")
        if len(form.word) > 0:
            sense.form_of.append(form)
    gloss_text = clean_node(wxr, sense, expanded_node)
    # The expansion starts with a full-width "(官話)" label; convert it to
    # a "Mandarin" tag instead of keeping it in the gloss text.
    if gloss_text.startswith("(官話)"):
        gloss_text = gloss_text.removeprefix("(官話)").strip()
        sense.tags.append("Mandarin")
    sense.tags.append("Erhua")
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)

176 

177 

# https://zh.wiktionary.org/wiki/Category:之形式模板
# Form-of template names matched against `node.template_name` in
# `extract_gloss()`. Aliases must be listed as SEPARATE entries: the
# original comma-joined items ("alt case, altcaps", "alt form, altform")
# could never equal a single template name, so those templates were
# silently skipped. frozenset for consistency with LABEL_TEMPLATES.
FORM_OF_TEMPLATES = frozenset(
    [
        "alt case",
        "altcaps",
        "alt form",
        "altform",
        "alt sp",
        "construed with",
        "honor alt case",
        "missp",
        "obs sp",
        "rare sp",
        "rfform",
        "short for",
        "stand sp",
        "sup sp",
    ]
)

193 

194 

def process_zh_mw_template(
    wxr: WiktextractContext, node: TemplateNode, sense: Sense
) -> None:
    # Chinese inline classifier (measure word) template
    # https://zh.wiktionary.org/wiki/Template:分類詞
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Classifiers are buffered in `classifiers` so a following "title"
    # span can attach its raw tag to every classifier of the current
    # group. A "/" between two words marks them as traditional/simplified
    # spellings of the same classifier, keeping them in one group.
    classifiers = []
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified Chinese")

                # A new word not preceded by "/" starts a new group;
                # flush the buffered one (it carried no title span).
                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            # Label span (tooltip text), e.g. a dialect name; applies to
            # every classifier buffered so far.
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    # Flush the final group, then normalize all collected raw tags.
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)
228 translate_raw_tags(classifier)