Coverage for src/wiktextract/extractor/zh/example.py: 97%

166 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from typing import Optional 

2 

3from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..ruby import extract_ruby 

8from .linkage import process_linkage_templates_in_gloss 

9from .models import Example, Sense, WordEntry 

10from .tags import translate_raw_tags 

11 

# Maps the name of a linkage template found in an example list to the
# linkage kind handed to `process_linkage_templates_in_gloss`. Every kind
# is reachable through both its short alias and its full name.
LINKAGE_TEMPLATES = {
    template_name: linkage_kind
    for short_alias, linkage_kind in (
        ("syn", "synonyms"),
        ("ant", "antonyms"),
        ("hyper", "hypernyms"),
        ("hypo", "hyponyms"),
    )
    for template_name in (short_alias, linkage_kind)
}

22 

23 

def extract_example_list_item(
    wxr: WiktextractContext,
    sense_data: Sense,
    list_item: WikiNode,
    page_data: list[WordEntry],
    parent_example: Optional[Example] = None,
) -> None:
    """
    Extract example data from one example list item into `sense_data`.

    When `parent_example` is given, it is filled in place and not appended
    to `sense_data.examples` by this call; otherwise a new `Example` is
    created and appended at the end if its `text` is not empty.
    """
    example_data = parent_example or Example()
    is_plain_text_list = list_item.contain_node(NodeKind.LIST) and any(
        not isinstance(node, TemplateNode)
        for node in list_item.invert_find_child(NodeKind.LIST)
    )
    if is_plain_text_list:
        # plain text in the nested list, not using any template
        # https://zh.wiktionary.org/wiki/%, the second example
        extract_plain_text_example_list(wxr, list_item, example_data)
    else:
        # parse example templates
        for template in list_item.find_child(NodeKind.TEMPLATE):
            name = template.template_name
            # example templates are cleaned a second time with `sense_data`
            # so that their category links get added to the sense
            add_cat_link = True
            if name == "quote" or name.startswith(("quote-", "RQ:")):
                extract_quote_templates(wxr, template, example_data)
            elif name in ("ja-x", "ja-usex"):
                extract_template_ja_usex(wxr, template, example_data)
            elif name in ("zh-x", "zh-usex", "zh-q"):
                # this extractor returns finished examples instead of
                # filling `example_data`
                zh_examples = extract_template_zh_x(
                    wxr, template, example_data
                )
                sense_data.examples.extend(zh_examples)
            elif name in ("ux", "eg", "usex", "uxi", "coi"):
                extract_template_ux(wxr, template, example_data)
            elif name == "Q":
                extract_template_Q(wxr, template, example_data)
            else:
                add_cat_link = False
                if name in LINKAGE_TEMPLATES:
                    if len(sense_data.glosses) > 0:
                        first_gloss = sense_data.glosses[0]
                    else:
                        first_gloss = ""
                    process_linkage_templates_in_gloss(
                        wxr,
                        page_data,
                        template,
                        LINKAGE_TEMPLATES[name],
                        first_gloss,
                    )
                else:
                    # unknown template: use its cleaned text as the example
                    example_data.text = clean_node(wxr, None, template)
            if add_cat_link:
                clean_node(wxr, sense_data, template)  # add cat link

        # NOTE(review): the recursive search presumably also yields items
        # nested more than one level deep, which the recursive call would
        # then visit again - confirm against wikitextprocessor.
        for nested_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, sense_data, nested_item, page_data, example_data
            )

    if len(example_data.text) > 0 and parent_example is None:
        sense_data.examples.append(example_data)

85 

86 

def extract_plain_text_example_list(
    wxr: WiktextractContext, list_item: WikiNode, example_data: Example
) -> None:
    """
    Fill `example_data` from a list item whose nested list is plain text.

    For each nested list, the nodes of `list_item` placed before it become
    `ref` and the children of the nested list's first child become `text`.
    Later nested lists overwrite the values set by earlier ones.
    """
    for position, sub_list in list_item.find_child(
        NodeKind.LIST, with_index=True
    ):
        nodes_before_list = list_item.children[:position]
        example_data.ref = clean_node(wxr, None, nodes_before_list)
        first_item = sub_list.children[0]
        example_data.text = clean_node(wxr, None, first_item.children)

97 

98 

def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `quote-*` and "RQ:*" templates.

    The template is expanded and `ref`, `text`, `translation` and `roman`
    are read from HTML tags of the expanded output, selected by class.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        # the source span must match exactly, the others by substring
        if css_class == "cited-source":
            example_data.ref = clean_node(wxr, None, span)
        elif "e-quotation" in css_class:
            example_data.text = clean_node(wxr, None, span)
        elif "e-translation" in css_class:
            example_data.translation = clean_node(wxr, None, span)

    # only the first transliteration tag is used
    transliterations = expanded.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    )
    first_transliteration = next(iter(transliterations), None)
    if first_transliteration is not None:
        example_data.roman = clean_node(wxr, None, first_transliteration)

121 

122 

def extract_template_ja_usex(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `ja-x` and `ja-usex` templates.

    Text and ruby come from "Jpan" spans of the expanded template, the
    romanization from "tr" spans, the translation from the third
    positional argument and the literal meaning from the `lit` argument.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    jpan_spans = expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    )
    for jpan_span in jpan_spans:
        ruby, without_ruby = extract_ruby(wxr, jpan_span)
        example_data.text = clean_node(wxr, None, without_ruby)
        example_data.ruby = ruby

    tr_spans = expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    )
    for tr_span in tr_spans:
        example_data.roman = clean_node(wxr, None, tr_span)

    template_args = node.template_parameters
    example_data.translation = clean_node(
        wxr, None, template_args.get(3, "")
    )
    example_data.literal_meaning = clean_node(
        wxr, None, template_args.get("lit", "")
    )

145 

146 

def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    parent_example: Example,
) -> list[Example]:
    """
    Process `zh-x`, `zh-usex` and `zh-q` templates.

    Returns a list of new examples built from deep copies of
    `parent_example`; `parent_example` itself is not modified. Expanded
    output containing `<dl>` tags is handled per `<dl>` tag, other output
    is handled as a single line example.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    results = []
    for dl_tag in expanded.find_html_recursively("dl"):
        dl_example = parent_example.model_copy(deep=True)
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("出自:"):
                dl_example.ref = dd_text.removeprefix("出自:")
                continue

            found_roman = False
            for latn_span in dd_tag.find_html_recursively(
                "span", attr_name="lang", attr_value="Latn"
            ):
                dl_example.roman = clean_node(wxr, None, latn_span)
                found_roman = True
                # spans whose text is wrapped in "[]" are raw tags
                for any_span in dd_tag.find_html_recursively("span"):
                    bracket_text = clean_node(wxr, None, any_span)
                    if bracket_text.startswith(
                        "["
                    ) and bracket_text.endswith("]"):
                        dl_example.raw_tags.append(bracket_text.strip("[]"))
                # only the first "Latn" span is used
                break
            if not found_roman:
                # a `<dd>` without romanization is the translation
                dl_example.translation = dd_text
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, dl_example))

    if has_dl_tag:
        return results

    # no source, single line example
    line_example = parent_example.model_copy(deep=True)
    for latn_span in expanded.find_html(
        "span", attr_name="lang", attr_value="Latn"
    ):
        line_example.roman = clean_node(wxr, None, latn_span)
        break
    for any_span in expanded.find_html("span"):
        bracket_text = clean_node(wxr, None, any_span)
        if bracket_text.startswith("[") and bracket_text.endswith("]"):
            line_example.raw_tags.append(bracket_text.strip("[]"))
    template_args = template_node.template_parameters
    line_example.translation = clean_node(
        wxr, None, template_args.get(2, "")
    )
    line_example.literal_meaning = clean_node(
        wxr, None, template_args.get("lit", "")
    )
    # one result per non-empty Traditional/Simplified Chinese text span
    for text_span in expanded.find_html("span"):
        lang_code = text_span.attrs.get("lang", "")
        if lang_code not in ("zh-Hant", "zh-Hans"):
            continue
        text = clean_node(wxr, None, text_span)
        if len(text) == 0:
            continue
        script_example = line_example.model_copy(deep=True)
        script_example.text = text
        if lang_code == "zh-Hant":
            script_example.tags.append("Traditional Chinese")
        else:
            script_example.tags.append("Simplified Chinese")
        translate_raw_tags(script_example)
        results.append(script_example)
    return results

215 

216 

def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
) -> list[Example]:
    """
    Process example text span tags and dialect span tags of a `zh-x` tag.

    `dl_tag` is either a `<dl>` tag or, in recursive calls, a "vsHide"
    span. Each "zh-Hant"/"zh-Hans" span yields a deep copy of `example`
    with its own text. Raw tags are translated only in the call that
    received the `<dl>` tag.
    """
    examples = []
    first_hidden_span = True
    for span in dl_tag.find_html("span"):
        if span.attrs.get("lang", "") in ("zh-Hant", "zh-Hans"):
            text_example = example.model_copy(deep=True)
            text_example.text = clean_node(wxr, None, span)
            examples.append(text_example)
        elif "vsHide" in span.attrs.get("class", ""):
            # template has arg "collapsed=y"
            # the first hidden span continues from the latest example
            if first_hidden_span and len(examples) > 0:
                base_example = examples[-1]
            else:
                base_example = example
            examples.extend(
                extract_zh_x_dl_span_tag(wxr, span, base_example)
            )
            first_hidden_span = False
        elif "font-size:x-small" in span.attrs.get("style", ""):
            for link in span.find_child(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link)
                if len(raw_tag) == 0:
                    continue
                # tag the latest example, or the shared one if none yet
                tagged = examples[-1] if len(examples) > 0 else example
                tagged.raw_tags.append(raw_tag)

    if dl_tag.tag == "dl":
        for finished_example in examples:
            translate_raw_tags(finished_example)
    return examples

254 

255 

def extract_template_ux(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `ux`-style usage example templates.

    https://zh.wiktionary.org/wiki/Template:ux

    Fields are read from `<i>` and `<span>` tags of the expanded template,
    selected by class, then the collected raw tags are translated.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    for italic in expanded.find_html_recursively("i"):
        italic_class = italic.attrs.get("class", "")
        if "e-example" in italic_class:
            example_data.text = clean_node(wxr, None, italic)
        elif "e-transliteration" in italic_class:
            example_data.roman = clean_node(wxr, None, italic)

    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        if "e-translation" in css_class:
            example_data.translation = clean_node(wxr, None, span)
        elif "e-literally" in css_class:
            example_data.literal_meaning = clean_node(wxr, None, span)
        elif "qualifier-content" in css_class:
            # one qualifier span may list several tags separated by "、"
            qualifier_text = clean_node(wxr, None, span)
            example_data.raw_tags.extend(qualifier_text.split("、"))

    translate_raw_tags(example_data)

280 

281 

def extract_template_Q(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process the `Q` quotation template.

    https://zh.wiktionary.org/wiki/Template:Q

    For each "wiktQuote" div of the expanded template, the children placed
    before the first `<dl>` tag become `ref` and the romanization is read
    from that `<dl>` tag. Text, translation and literal meaning are copied
    from the template arguments when they are not empty.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    # template argument name -> `Example` field; `trans` is checked after
    # `t`, so it wins when both are given
    arg_fields = {
        "quote": "text",
        "t": "translation",
        "trans": "translation",
        "lit": "literal_meaning",
    }
    for quote_div in expanded.find_html(
        "div", attr_name="class", attr_value="wiktQuote"
    ):
        nodes_before_dl = []
        for div_child in quote_div.children:
            if isinstance(div_child, HTMLNode) and div_child.tag == "dl":
                for italic in div_child.find_html_recursively(
                    "i", attr_name="class", attr_value="e-transliteration"
                ):
                    example_data.roman = clean_node(wxr, None, italic)
                # nothing after the first `<dl>` tag is inspected
                break
            nodes_before_dl.append(div_child)

        ref_text = clean_node(wxr, None, nodes_before_dl)
        if ref_text:
            example_data.ref = ref_text

        for arg_name, field_name in arg_fields.items():
            arg_text = clean_node(
                wxr, None, node.template_parameters.get(arg_name, "")
            )
            if arg_text:
                setattr(example_data, field_name, arg_text)