Coverage for src/wiktextract/extractor/en/example.py: 8%

141 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from copy import deepcopy 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...tags import valid_tags 

7from ...wxr_context import WiktextractContext 

8from ..ruby import extract_ruby 

9from .type_utils import ExampleData, SenseData 

10 

11 

def extract_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sense_data: SenseData,
    parent_data: ExampleData,
) -> list[ExampleData]:
    """Extract example/quotation data from one sense list item.

    Dispatches on the name of each template found as a direct child of
    *list_item*; quote templates with nested sub-lists are processed
    recursively, the quote data acting as parent for the nested items.
    """
    examples: list[ExampleData] = []
    for t_node in list_item.find_child(NodeKind.TEMPLATE):
        t_name = t_node.template_name
        if t_name in ("zh-x", "zh-q"):
            examples.extend(
                extract_template_zh_x(wxr, t_node, sense_data, parent_data)
            )
        elif t_name in ("ja-usex", "ja-x"):
            examples.append(
                extract_template_ja_usex(wxr, t_node, sense_data, parent_data)
            )
        elif t_name == "quote" or t_name.startswith(("quote-", "RQ:")):
            q_example = extract_quote_templates(wxr, t_node, sense_data)
            if not list_item.contain_node(NodeKind.LIST):
                examples.append(q_example)
            else:
                # The quoted passage lives in nested list items; reuse the
                # quote data (with tag lists ensured) as their parent.
                for nested_item in list_item.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    for key in ("tags", "raw_tags"):
                        if key not in q_example:
                            q_example[key] = []
                    examples.extend(
                        extract_example_list_item(
                            wxr, nested_item, sense_data, q_example
                        )
                    )

    return examples

59 

60 

def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, sense_data: SenseData
) -> ExampleData:
    """Expand a quote/RQ template and collect its quotation fields.

    Pulls the cited source, quotation text, translation and
    transliteration out of the expanded HTML and returns them as a new
    ExampleData with ``type`` preset to "quote".
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    # Let clean_node record categories etc. on the sense.
    clean_node(wxr, sense_data, expanded_node)
    ref = text = translation = roman = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        css_class = span_tag.attrs.get("class", "")
        if css_class == "cited-source":
            ref = clean_node(wxr, None, span_tag)
        elif "e-quotation" in css_class:
            text = clean_node(wxr, None, span_tag)
        elif "e-translation" in css_class:
            translation = clean_node(wxr, None, span_tag)
    # Only the first transliteration <i> tag is used.
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        roman = clean_node(wxr, None, i_tag)
        break
    example_data = ExampleData(
        text=text, ref=ref, english=translation, roman=roman, type="quote"
    )
    clean_example_empty_data(example_data)
    return example_data

90 

91 

def extract_template_ja_usex(
    wxr: WiktextractContext,
    node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
    # https://en.wiktionary.org/wiki/Template:ja-usex
    """Fill *example_data* in place from an expanded {{ja-usex}} template.

    Extracts the Japanese text (with ruby stripped out separately), the
    romanization, the translation (3rd positional arg) and the literal
    meaning ("lit" arg); returns the same dict after cleanup.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)
    # Japanese example text, with ruby annotations pulled out separately.
    for jpan_span in expanded_node.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, without_ruby = extract_ruby(wxr, jpan_span)
        example_data["text"] = clean_node(wxr, None, without_ruby)
        example_data["ruby"] = ruby_data
    # Romanization span(s).
    for tr_span in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data["roman"] = clean_node(wxr, None, tr_span)
    example_data["english"] = clean_node(
        wxr, None, node.template_parameters.get(3, "")
    )
    example_data["literal_meaning"] = clean_node(
        wxr, None, node.template_parameters.get("lit", "")
    )
    clean_example_empty_data(example_data)
    return example_data

121 

122 

def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense_data: SenseData,
    parent_example: ExampleData,
) -> list[ExampleData]:
    # https://en.wiktionary.org/wiki/Template:zh-x
    """Extract examples from an expanded {{zh-x}}/{{zh-q}} template.

    Returns one ExampleData per rendered script form (traditional /
    simplified Chinese). *parent_example* supplies inherited fields and
    is only deep-copied, never mutated.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    # Record categories added by the template expansion on the sense.
    clean_node(wxr, sense_data, expanded_node)
    has_dl_tag = False
    results = []
    # Sourced, multi-line layout: each example is wrapped in a <dl> tag.
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        example_data = deepcopy(parent_example)
        # Second positional template argument is the English translation.
        example_data["english"] = clean_node(
            wxr, None, template_node.template_parameters.get(2, "")
        )
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("From:"):
                # Source/reference line.
                example_data["ref"] = dd_text.removeprefix("From:")
            else:
                # Romanization line: only the first lang="Latn" span is
                # used (note the break at the end of its body).
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data["roman"] = clean_node(wxr, None, span_tag)
                    # Bracketed spans like "[Pinyin]" carry raw tags.
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data["raw_tags"].append(
                                span_text.strip("[]")
                            )
                    break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        example_data = deepcopy(parent_example)
        # First lang="Latn" span is the romanization.
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data["roman"] = clean_node(wxr, None, span_tag)
            break
        # Bracketed spans carry raw tags (e.g. script/dialect labels).
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data["raw_tags"].append(span_text.strip("[]"))
        example_data["english"] = clean_node(
            wxr, None, template_node.template_parameters.get(2, "")
        )
        example_data["literal_meaning"] = clean_node(
            wxr, None, template_node.template_parameters.get("lit", "")
        )
        # Emit one example per Chinese script span with non-empty text.
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = deepcopy(example_data)
                    new_example["text"] = example_text
                    new_example["tags"].append(
                        "Traditional Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified Chinese"
                    )
                    clean_example_empty_data(new_example)
                    results.append(new_example)
    return results

195 

196 

def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: ExampleData
) -> list[ExampleData]:
    # process example text span tag and dialect span tag
    """Collect example-text and dialect spans from a {{zh-x}} <dl> tag.

    Called with the <dl> element and recursively with nested "vsHide"
    spans (collapsed layout). Returns one ExampleData per Chinese-script
    span found; raw dialect tags are attached to the most recent result.
    """
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            # A Chinese-script span starts a new example entry.
            new_example = deepcopy(example)
            new_example["text"] = clean_node(wxr, None, span_tag)
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # The first hidden block extends the last example found so
            # far; later ones fall back to the inherited parent data.
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            # Small-print span holds dialect links used as raw tags.
            for link_node in span_tag.find_child_recursively(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1]["raw_tags"].append(raw_tag)
                    else:
                        example["raw_tags"].append(raw_tag)

    # Only the outermost (<dl>) call performs the final cleanup, so data
    # from recursive vsHide calls is cleaned exactly once.
    if dl_tag.tag == "dl":
        for data in results:
            clean_example_empty_data(data)
    return results

234 

235 

# Raw labels emitted by {{zh-x}} mapped to the canonical tag names used
# in the extracted data; consumed by clean_example_empty_data().
ZH_X_TAGS = {
    "trad.": "Traditional Chinese",
    "simp.": "Simplified Chinese",
}

240 

241 

def clean_example_empty_data(data: ExampleData) -> None:
    # remove empty data and convert raw tags
    """Normalize an example dict in place.

    Converts recognized raw tags into canonical tags (via ZH_X_TAGS and
    the global valid_tags set), sets ``type`` to "quote" when a source
    reference is present (otherwise "example"), and deletes keys whose
    values are empty.
    """
    raw_tags = data.get("raw_tags", [])
    new_raw_tags = []
    for raw_tag in raw_tags:
        if raw_tag in ZH_X_TAGS:
            # setdefault: callers are not required to pre-seed "tags";
            # the old data["tags"] raised KeyError when it was missing.
            data.setdefault("tags", []).append(ZH_X_TAGS[raw_tag])
        elif raw_tag in valid_tags:
            data.setdefault("tags", []).append(raw_tag)
        else:
            new_raw_tags.append(raw_tag)
    data["raw_tags"] = new_raw_tags
    if len(data.get("ref", "")) > 0:
        data["type"] = "quote"
    else:
        data["type"] = "example"
    # Iterate over a copy because keys are deleted during the scan.
    for key, value in data.copy().items():
        if len(value) == 0:
            del data[key]