Coverage for src/wiktextract/extractor/zh/gloss.py: 95%

176 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import re 

2 

3from wikitextprocessor import NodeKind, WikiNode 

4from wikitextprocessor.parser import TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from .example import extract_example_list_item 

10from .models import AltForm, Classifier, Linkage, Sense, WordEntry 

11from .tags import translate_raw_tags 

12 

# https://zh.wiktionary.org/wiki/Template:Label
LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"])

# Form-of template names that don't match the generic
# " of" / " form" / "-form" suffix checks in `extract_gloss()`.
# https://zh.wiktionary.org/wiki/Category:之形式模板
FORM_OF_TEMPLATES = frozenset(
    [
        "alt case",
        # "alt form" and "altform" are two distinct template aliases;
        # "altform" in particular has no " form"/"-form" suffix and must
        # be listed explicitly to be recognized.
        "alt form",
        "altform",
        "alt sp",
        "construed with",
        "honor alt case",
        "missp",
        "obs sp",
        "rare sp",
        "rfform",
        "short for",
        "stand sp",
        "sup sp",
    ]
)
# Abbreviation templates; matching senses get "alt-of" + "abbreviation"
# tags.  NOTE(review): the name keeps the historical misspelling
# ("TEMPALTES") because other code in this module references it; renaming
# would break those call sites.
ABBR_TEMPALTES = frozenset(
    [
        "之縮寫",
        "abbreviation of",
        "abbr of",
        "abbrev of",
        "zh-short",
        "zh-abbrev",
        "中文简称",
    ]
)
# "Alternative name" templates specific to the Chinese Wiktionary.
ZH_ALT_OF_TEMPLATES = frozenset(
    ["zh-altname", "zh-alt-name", "中文別名", "中文别名"]
)

47 

48 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_gloss_data: Sense,
) -> None:
    """Extract senses from a wikitext definition list.

    Each list item becomes a new ``Sense`` seeded from a deep copy of
    ``parent_gloss_data`` (so nested glosses inherit parent tags/glosses)
    and is appended to ``page_data[-1].senses``.  Child lists are handled
    recursively: ``#``-lists as nested glosses, others as examples.
    """
    lang_code = page_data[-1].lang_code
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = []  # nodes that make up the gloss text itself
        raw_tags = []  # raw tag strings collected from label templates
        gloss_data = parent_gloss_data.model_copy(deep=True)
        for node in list_item_node.children:
            if isinstance(node, TemplateNode):
                if node.template_name == "rfdef":
                    # "request for definition" placeholder, not a gloss
                    continue
                raw_tag = clean_node(wxr, gloss_data, node)
                if node.template_name.lower() in LABEL_TEMPLATES:
                    # label output looks like "(tag1,tag2或tag3)";
                    # split on "," and "或" ("or")
                    for r_tag in re.split(r",|或", raw_tag.strip("()")):
                        r_tag = r_tag.strip()
                        if r_tag != "":
                            raw_tags.append(r_tag)
                elif raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    # angle-bracketed text is also treated as a raw tag
                    raw_tags.append(raw_tag.strip("〈〉"))
                elif (
                    node.template_name
                    in FORM_OF_TEMPLATES | ABBR_TEMPALTES | ZH_ALT_OF_TEMPLATES
                    or node.template_name.endswith((" of", " form", "-form"))
                ) and process_form_of_template(
                    wxr, node, gloss_data, page_data
                ):
                    # helper fully consumed the form-of template (it may
                    # have appended senses itself); nothing more to do
                    pass
                elif node.template_name == "zh-mw":
                    # inline Chinese classifier (measure word) template
                    process_zh_mw_template(wxr, node, gloss_data)
                elif node.template_name.lower() in ["zh-obsolete", "†", "zh-o"]:
                    if "obsolete" not in gloss_data.tags:
                        gloss_data.tags.append("obsolete")
                elif node.template_name.lower() in ["defdate", "datedef"]:
                    extract_defdate_template(wxr, gloss_data, node)
                else:
                    # unrecognized template: keep it as gloss content
                    gloss_nodes.append(node)
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                # child lists are processed after the gloss text, below
                continue
            else:
                gloss_nodes.append(node)

        if lang_code == "ja":
            # Japanese glosses may carry ruby (furigana); expand the
            # wikitext and strip the ruby out of the plain gloss text
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True
            )
            ruby_data, nodes_without_ruby = extract_ruby(
                wxr, expanded_node.children
            )
            gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
        else:
            ruby_data = []
            gloss_text = clean_node(wxr, gloss_data, gloss_nodes)

        gloss_data.raw_tags.extend(raw_tags)
        if len(gloss_text) > 0:
            gloss_data.glosses.append(gloss_text)
            if len(ruby_data) > 0:
                gloss_data.ruby = ruby_data

        translate_raw_tags(gloss_data)
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        if list_item_node.contain_node(NodeKind.LIST):
            for next_list in list_item_node.find_child(NodeKind.LIST):
                if next_list.sarg.endswith("#"):  # nested gloss
                    extract_gloss(wxr, page_data, next_list, gloss_data)
                else:
                    # non-"#" child lists hold example sentences
                    for e_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                        extract_example_list_item(
                            wxr, gloss_data, e_list_item, page_data[-1]
                        )

125 

126 

def process_form_of_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: Sense,
    page_data: list[WordEntry],
) -> bool:
    """Handle a form-of / alt-of / abbreviation template on a gloss line.

    Tags the sense ("alt-of", "abbreviation" or "form-of"), fills
    ``sense.alt_of`` / ``sense.form_of`` with the referenced words, and —
    when the template expands to a list — appends one new sense per list
    item to ``page_data[-1]``.

    Return `True` if template expands to list or don't want add gloss again
    in `extract_gloss()`
    https://en.wiktionary.org/wiki/Category:Form-of_templates
    https://en.wiktionary.org/wiki/Category:Form-of_templates_by_language
    """
    # "alt..." template names and the zh alt-name templates mark
    # alternative forms rather than inflected forms
    is_alt_of = (
        re.search(r"^alt|alt[\s-]|alternative", t_node.template_name.lower())
        or t_node.template_name.lower() in ZH_ALT_OF_TEMPLATES
    )
    is_abbr = t_node.template_name.lower() in ABBR_TEMPALTES
    if is_alt_of:
        sense.tags.append("alt-of")
    elif is_abbr:
        sense.tags.extend(["alt-of", "abbreviation"])
    else:
        sense.tags.append("form-of")
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    if t_node.template_name.endswith("-erhua form of"):
        # erhua templates carry their own gloss; caller must not add one
        process_erhua_form_of_template(wxr, expanded_template, sense)
        return True
    elif (
        t_node.template_name.lower()
        in {"zh-short", "zh-abbrev", "中文简称"} | ZH_ALT_OF_TEMPLATES
    ):
        # zh abbreviation/alt-name templates: words extracted here, but
        # the gloss text is still built by the caller (return False)
        extract_zh_abbr_template(wxr, expanded_template, sense)
        return False

    # Prefer words in <i> tags; fall back to the first wiki link.
    form_of_words = []
    for i_tag in expanded_template.find_html_recursively("i"):
        form_of_words = process_form_of_template_child(wxr, i_tag)

    if len(form_of_words) == 0:
        for link_node in expanded_template.find_child_recursively(
            NodeKind.LINK
        ):
            form_of_words = process_form_of_template_child(wxr, link_node)
            break
    for form_of_word in form_of_words:
        form_of = AltForm(word=form_of_word)
        if is_alt_of or is_abbr:
            sense.alt_of.append(form_of)
        else:
            sense.form_of.append(form_of)

    if expanded_template.contain_node(NodeKind.LIST):
        # Template expands to a gloss list: the text before the list is
        # shared by every item; each item becomes its own sense.
        shared_gloss = clean_node(
            wxr, None, list(expanded_template.invert_find_child(NodeKind.LIST))
        )
        for list_item_node in expanded_template.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            new_sense = sense.model_copy(deep=True)
            new_sense.glosses.append(shared_gloss)
            new_sense.glosses.append(
                clean_node(wxr, None, list_item_node.children)
            )
            page_data[-1].senses.append(new_sense)
        return True

    return False

194 

195 

def process_form_of_template_child(
    wxr: WiktextractContext, node: WikiNode
) -> list[str]:
    """Clean *node* to text and split it on "和" ("and") into words.

    Empty pieces (after stripping whitespace) are dropped.
    """
    text = clean_node(wxr, None, node)
    pieces = (part.strip() for part in text.split("和"))
    return [word for word in pieces if word != ""]

206 

207 

def process_erhua_form_of_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
) -> None:
    """Extract an expanded erhua form-of template into *sense*.

    The first zh span is the traditional form, any later one the
    simplified form; the cleaned template text becomes the gloss.
    https://zh.wiktionary.org/wiki/Template:Cmn-erhua_form_of
    """
    zh_spans = expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    )
    for idx, span in enumerate(zh_spans):
        form = AltForm(word=clean_node(wxr, None, span))
        form.tags.append(
            "Traditional-Chinese" if idx == 0 else "Simplified-Chinese"
        )
        if len(form.word) > 0:
            sense.form_of.append(form)
    gloss_text = clean_node(wxr, sense, expanded_node)
    if gloss_text.startswith("(官話)"):
        # drop the leading "(Mandarin)" marker and record it as a tag
        gloss_text = gloss_text.removeprefix("(官話)").strip()
        sense.tags.append("Mandarin")
    sense.tags.append("Erhua")
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)

230 

231 

def process_zh_mw_template(
    wxr: WiktextractContext, node: TemplateNode, sense: Sense
) -> None:
    """Extract Chinese classifiers (measure words) from a `zh-mw` template.

    Chinese inline classifier template
    https://zh.wiktionary.org/wiki/Template:分類詞

    Words separated by "/" are traditional/simplified variants of the
    same classifier, so they are buffered together and flushed as a
    group; a "title" span applies its raw tag to the buffered group.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    classifiers = []  # buffer of variants not yet attached to the sense
    last_word = ""  # previous span's text; "/" means "same classifier"
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                # a new word not preceded by "/" starts a new classifier:
                # flush the previous buffered group first
                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            # tooltip text (e.g. dialect names) applies to the current
            # buffered classifier group
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)

266 

267 

def extract_zh_abbr_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
):
    """Collect alt-of words from an expanded zh abbreviation template.

    The <i> tag (if any) supplies the romanization shared by all spans;
    each span becomes an ``AltForm``, skipping empty and "/" separators.
    https://zh.wiktionary.org/wiki/Template:Zh-short
    """
    roman = ""
    for italic_tag in expanded_node.find_html("i"):
        roman = clean_node(wxr, None, italic_tag)
    for span_tag in expanded_node.find_html("span"):
        word = clean_node(wxr, None, span_tag)
        alt_form = AltForm(word=word, roman=roman)
        css_class = span_tag.attrs.get("class", "")
        if css_class == "Hant":
            alt_form.tags.append("Traditional-Chinese")
        elif css_class == "Hans":
            alt_form.tags.append("Simplified-Chinese")
        if alt_form.word not in ["", "/"]:
            sense.alt_of.append(alt_form)

284 

285 

def extract_defdate_template(
    wxr: WiktextractContext, sense: Sense | Linkage, t_node: TemplateNode
):
    """Extract an attestation date (and its <ref> citations) from a
    `defdate`/`datedef` template into ``sense.attestations``."""
    from .models import AttestationData, ReferenceData

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    date = clean_node(wxr, None, expanded_node).strip("() ")
    if date == "":
        return
    attestation = AttestationData(date=date)
    for ref_tag in expanded_node.find_html_recursively("ref"):
        ref_text = clean_node(wxr, None, ref_tag.children)
        if ref_text != "":
            attestation.references.append(
                ReferenceData(
                    text=ref_text, refn=ref_tag.attrs.get("name", "")
                )
            )
    sense.attestations.append(attestation)