Coverage for src / wiktextract / extractor / en / example.py: 59%

189 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-11 03:38 +0000

1from copy import deepcopy 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...tags import valid_tags 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from ..share import calculate_bold_offsets 

10from .type_utils import ExampleData, SenseData 

11 

12 

def extract_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sense_data: SenseData,
    parent_data: ExampleData,
) -> list[ExampleData]:
    """Extract example sentences/quotations from one gloss list item.

    Dispatches on the name of each template that is a direct child of
    *list_item*: Chinese ("zh-x" family), Japanese ("ja-usex" family),
    quotation templates ("quote-*", "RQ:*", "quote") and generic usage
    example templates ("ux" family).  Quotation templates may carry the
    quoted passage in a nested list, which is processed recursively with
    the quote's data as the parent.

    Returns a list of ``ExampleData`` dicts; *parent_data* supplies
    inherited fields such as tags.
    """
    examples = []
    for template_node in list_item.find_child(NodeKind.TEMPLATE):
        if template_node.template_name in ("zh-x", "zh-usex", "zh-q", "zh-co"):
            examples.extend(
                extract_template_zh_x(
                    wxr,
                    template_node,
                    sense_data,
                    parent_data,
                )
            )
        elif template_node.template_name in ("ja-usex", "ja-x", "ja-ux"):
            examples.append(
                extract_template_ja_usex(
                    wxr,
                    template_node,
                    sense_data,
                    parent_data,
                )
            )
        elif (
            template_node.template_name.startswith(("quote-", "RQ:"))
            or template_node.template_name == "quote"
        ):
            q_example = extract_quote_templates(wxr, template_node, sense_data)
            if list_item.contain_node(NodeKind.LIST):
                # The quoted passage lives in a child list; seed the tag
                # fields once (hoisted out of the loop — it is invariant)
                # so nested items can safely extend them.
                for key in ("tags", "raw_tags"):
                    if key not in q_example:
                        q_example[key] = []
                for next_list_item in list_item.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    examples.extend(
                        extract_example_list_item(
                            wxr, next_list_item, sense_data, q_example
                        )
                    )
            else:
                examples.append(q_example)
        elif template_node.template_name in [
            # Fix: the original list contained "uxa" twice.
            "ux",
            "usex",
            "uxi",
            "uxa",
            "ko-usex",
            "koex",
            "ko-x",
            "th-usex",
            "th-x",
            "th-xi",
            "collocation",
            "co",
            "coi",
        ]:
            copy_of_parent_data = deepcopy(parent_data)
            if template_node.template_name in ("collocation", "co", "coi"):
                copy_of_parent_data["tags"].append("collocation")
            examples.append(
                extract_ux_template(
                    wxr,
                    template_node,
                    sense_data,
                    copy_of_parent_data,
                )
            )

    return examples

87 

88 

def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, sense_data: SenseData
) -> ExampleData:
    """Extract one quotation from a "quote-*"/"RQ:*"/"quote" template.

    The template is expanded to HTML and the relevant elements are
    selected by CSS class: "cited-source" (reference), "e-quotation"
    (quoted text, possibly containing ruby), "e-translation", and an
    ``<i class="e-transliteration">`` for the romanization.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Collect any categories added by the expansion into the sense.
    clean_node(wxr, sense_data, expanded_node)
    example_data = ExampleData(
        text="", ref="", english="", roman="", type="quote"
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "cited-source" == span_class:
            example_data["ref"] = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
            if len(ruby_data) > 0:
                example_data["ruby"] = ruby_data
            example_data["text"] = clean_node(wxr, None, node_without_ruby)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["text"],
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in span_class:
            example_data["translation"] = clean_node(
                wxr, None, span_tag
            )  # DEPRECATED for "translation"
            example_data["english"] = example_data[
                "translation"
            ]  # DEPRECATED for "translation"
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["translation"],
                example_data,
                "bold_translation_offsets",
            )
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data["roman"] = clean_node(wxr, None, i_tag)
        # Fix: offsets must be computed from i_tag, the node this text
        # came from.  The original passed span_tag, a leftover from the
        # loop above (and a NameError when that loop never ran).
        calculate_bold_offsets(
            wxr,
            i_tag,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
        break  # only the first transliteration is used
    clean_example_empty_data(example_data)
    return example_data

143 

144 

def extract_template_ja_usex(
    wxr: WiktextractContext,
    node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Extract a Japanese usage example from a ja-usex/ja-x/ja-ux template.

    https://en.wiktionary.org/wiki/Template:ja-usex
    Fills *example_data* in place and returns it.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded)
    # The Japanese text sits in a span with class="Jpan"; peel off the
    # ruby annotations before cleaning the text.
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, no_ruby = extract_ruby(wxr, jpan_span)
        example_data["text"] = clean_node(wxr, None, no_ruby)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(no_ruby)),
            example_data["text"],
            example_data,
            "bold_text_offsets",
        )
        example_data["ruby"] = ruby_data
    # Romanization is rendered in a span with class="tr".
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data["roman"] = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr,
            tr_span,
            example_data["roman"],
            example_data,
            "bold_roman_offsets",
        )
    # Third positional template parameter: English translation.
    translation_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(3, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, translation_root)
    example_data["english"] = example_data[
        "translation"
    ]  # DEPRECATED for "translation"
    calculate_bold_offsets(
        wxr,
        translation_root,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )
    # "lit=" template parameter: literal meaning.
    literal_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get("lit", "")),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, literal_root)
    calculate_bold_offsets(
        wxr,
        literal_root,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    clean_example_empty_data(example_data)
    return example_data

209 

210 

def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense_data: SenseData | None,
    parent_example: ExampleData,
) -> list[ExampleData]:
    """Extract examples from a Chinese example template (zh-x family).

    https://en.wiktionary.org/wiki/Template:zh-x

    One template usually yields two examples (Traditional- and
    Simplified-Chinese script).  Sourced quotes render a ``<dl>`` tag
    carrying the reference, romanization and dialect tags; plain
    examples render everything on a single line.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)
    has_dl_tag = False
    results = []
    example_data = deepcopy(parent_example)
    # Second positional parameter: English translation.
    tr_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
        expand_all=True,
    )
    example_data["translation"] = clean_node(wxr, None, tr_arg)
    example_data["english"] = example_data["translation"]
    calculate_bold_offsets(
        wxr,
        tr_arg,
        example_data["translation"],
        example_data,
        "bold_translation_offsets",
    )
    # "lit=" parameter: literal meaning.
    lit_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            template_node.template_parameters.get("lit", "")
        ),
        expand_all=True,
    )
    example_data["literal_meaning"] = clean_node(wxr, None, lit_arg)
    # Fix: bold offsets for the literal meaning must be computed from
    # lit_arg; the original mistakenly reused tr_arg here (compare the
    # ja-usex extractor, which uses its lit argument).
    calculate_bold_offsets(
        wxr,
        lit_arg,
        example_data["literal_meaning"],
        example_data,
        "bold_literal_offsets",
    )
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("From:"):
                example_data["ref"] = dd_text.removeprefix("From:")
            elif not dd_text.startswith("(literally,"):
                # Latin-script span inside <dd> is the romanization.
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data["roman"] = clean_node(wxr, None, span_tag)
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_data["roman"],
                        example_data,
                        "bold_roman_offsets",
                    )
                # Bracketed span text like "[MSC]" marks dialect tags.
                for span_tag in dd_tag.find_html_recursively("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text.startswith("[") and span_text.endswith(
                        "]"
                    ):
                        example_data["raw_tags"].append(
                            span_text.strip("[]")
                        )
                        break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data["roman"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example_data["roman"],
                example_data,
                "bold_roman_offsets",
            )
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data["raw_tags"].append(span_text.strip("[]"))
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    # One ExampleData per script form.
                    new_example = deepcopy(example_data)
                    new_example["text"] = example_text
                    calculate_bold_offsets(
                        wxr,
                        span_tag,
                        example_text,
                        new_example,
                        "bold_text_offsets",
                    )
                    new_example["tags"].append(
                        "Traditional-Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified-Chinese"
                    )
                    clean_example_empty_data(new_example)
                    results.append(new_example)
    return results

321 

322 

def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: ExampleData
) -> list[ExampleData]:
    # process example text span tag and dialect span tag
    #
    # Walks the direct <span> children of *dl_tag*.  A span with
    # lang="zh-Hant"/"zh-Hans" becomes a new ExampleData copied from
    # *example*; a "vsHide" span (rendered when the template has
    # "collapsed=y") is recursed into; a "font-size:x-small" span holds
    # dialect links that become raw tags.  Returns the collected examples.
    results = []
    # Tracks whether the next "vsHide" span is the first one seen; the
    # first hidden block extends the most recent example, later ones
    # fall back to *example*.
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            new_example = deepcopy(example)
            new_example["text"] = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                new_example["text"],
                new_example,
                "bold_text_offsets",
            )
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # Recurse into the hidden block; base the nested examples on
            # the last collected example when this is the first hidden
            # block and there is one, otherwise on *example*.
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # Small-print span: dialect/variety links become raw tags on
            # the most recent example (or on *example* if none yet).
            for link_node in span_tag.find_child_recursively(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1]["raw_tags"].append(raw_tag)
                    else:
                        example["raw_tags"].append(raw_tag)

    # Only normalize at the top of the recursion (the actual <dl> tag),
    # not inside nested "vsHide" spans.
    if dl_tag.tag == "dl":
        for data in results:
            clean_example_empty_data(data)
    return results

367 

368 

# Maps raw tag strings emitted by the zh-x template family to the
# canonical tag names used by this extractor (see clean_example_empty_data).
ZH_X_TAGS = {
    "trad.": "Traditional-Chinese",
    "simp.": "Simplified-Chinese",
    "Taiwanese Mandarin": "Taiwanese-Mandarin",
    "MSC": "Standard-Chinese",
    "Literary Chinese": "Literary-Chinese",
    "Classical Chinese": "Classical-Chinese",
    "Guangzhou Cantonese": "Guangzhou-Cantonese",
}

378 

379 

def clean_example_empty_data(data: ExampleData) -> None:
    """Normalize *data* in place: convert known raw tags to canonical
    tags, set the example "type", and drop empty fields."""
    leftovers = []
    for raw_tag in data.get("raw_tags", []):
        if raw_tag in ZH_X_TAGS:
            # zh-x specific alias -> canonical tag name
            data["tags"].append(ZH_X_TAGS[raw_tag])
        elif raw_tag in valid_tags:
            data["tags"].append(raw_tag)
        else:
            leftovers.append(raw_tag)
    data["raw_tags"] = leftovers
    # An example with a source reference is a quotation.
    data["type"] = "quote" if len(data.get("ref", "")) > 0 else "example"
    # Remove keys whose values are empty (snapshot the keys first so the
    # dict is not mutated while being iterated).
    for key in [k for k, v in data.items() if len(v) == 0]:
        del data[key]

399 

400 

def extract_ux_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    """Extract a usage example from a ux-family template.

    Expands the template and reads its HTML output by CSS class
    ("e-example", "e-transliteration", "e-translation", "e-literally",
    "qualifier-content").  Fills *example_data* in place and returns it.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded)

    def grab(field: str, offset_field: str, source_node) -> None:
        # Clean source_node into example_data[field] and record the
        # bold offsets for that field.
        example_data[field] = clean_node(wxr, None, source_node)
        calculate_bold_offsets(
            wxr,
            source_node,
            example_data[field],
            example_data,
            offset_field,
        )

    for html_node in expanded.find_child_recursively(NodeKind.HTML):
        css = html_node.attrs.get("class", "")
        if "e-example" in css:
            grab("text", "bold_text_offsets", html_node)
        elif "e-transliteration" in css:
            grab("roman", "bold_roman_offsets", html_node)
        elif "e-translation" in css:
            grab("translation", "bold_translation_offsets", html_node)
            example_data["english"] = example_data[
                "translation"
            ]  # DEPRECATED for "translation"
        elif "e-literally" in css:
            grab("literal_meaning", "bold_literal_offsets", html_node)
        elif "qualifier-content" in css:
            raw_tag = clean_node(wxr, None, html_node)
            if raw_tag != "":
                example_data["raw_tags"].append(raw_tag)

    clean_example_empty_data(example_data)
    return example_data