Coverage for src/wiktextract/extractor/zh/example.py: 97%
209 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from ..ruby import extract_ruby
6from ..share import calculate_bold_offsets
7from .linkage import process_linkage_templates_in_gloss
8from .models import Example, Form, Sense, WordEntry
9from .tags import translate_raw_tags
11LINKAGE_TEMPLATES = {
12 "syn": "synonyms",
13 "synonyms": "synonyms",
14 "ant": "antonyms",
15 "antonyms": "antonyms",
16 "antonym": "antonyms",
17 "hyper": "hypernyms",
18 "hypernyms": "hypernyms",
19 "hypo": "hyponyms",
20 "hyponyms": "hyponyms",
21 "cot": "coordinate_terms",
22 "coo": "coordinate_terms",
23 "coord": "coordinate_terms",
24 "coordinate terms": "coordinate_terms",
25}
28def extract_example_list_item(
29 wxr: WiktextractContext,
30 sense_data: Sense,
31 list_item: WikiNode,
32 word_entry: WordEntry,
33 parent_example: Example | None = None,
34) -> None:
35 example_data = parent_example or Example()
36 if list_item.contain_node(NodeKind.LIST) and not all(
37 isinstance(n, TemplateNode)
38 for n in list_item.invert_find_child(NodeKind.LIST)
39 ):
40 # plain text in the nested list, not using any template
41 # https://zh.wiktionary.org/wiki/%, the second example
42 extract_plain_text_example_list(wxr, list_item, example_data)
43 else:
44 # parse example templates
45 for child in list_item.find_child(NodeKind.TEMPLATE):
46 template_name = child.template_name
47 if (
48 template_name.startswith(("quote-", "RQ:"))
49 or template_name == "quote"
50 ):
51 extract_quote_templates(wxr, child, example_data)
52 clean_node(wxr, sense_data, child) # add cat link
53 elif template_name in ["ja-x", "ja-usex"]:
54 extract_template_ja_usex(wxr, child, example_data)
55 clean_node(wxr, sense_data, child) # add cat link
56 elif template_name in ["zh-x", "zh-usex", "zh-q", "zh-co"]:
57 sense_data.examples.extend(
58 extract_template_zh_x(wxr, child, example_data)
59 )
60 clean_node(wxr, sense_data, child) # add cat link
61 elif template_name in [
62 "ux",
63 "eg",
64 "usex",
65 "uxi",
66 "collocation",
67 "co",
68 "coi",
69 "ko-usex",
70 "ko-x",
71 "koex",
72 "th-usex",
73 "th-x",
74 "th-xi",
75 ]:
76 extract_template_ux(wxr, child, example_data)
77 clean_node(wxr, sense_data, child) # add cat link
78 elif template_name == "Q":
79 extract_template_Q(wxr, child, example_data)
80 clean_node(wxr, sense_data, child) # add cat link
81 elif template_name.lower() in LINKAGE_TEMPLATES:
82 process_linkage_templates_in_gloss(
83 wxr,
84 word_entry,
85 child,
86 LINKAGE_TEMPLATES[template_name.lower()],
87 " ".join(sense_data.glosses),
88 )
89 elif template_name.lower() in ["inline alt forms", "alti"]: 89 ↛ 45line 89 didn't jump to line 45 because the condition on line 89 was always true
90 extract_inline_alt_forms_template(wxr, word_entry, child)
92 for next_list_item in list_item.find_child_recursively(
93 NodeKind.LIST_ITEM
94 ):
95 extract_example_list_item(
96 wxr, sense_data, next_list_item, word_entry, example_data
97 )
99 if len(example_data.text) > 0 and parent_example is None:
100 sense_data.examples.append(example_data)
def extract_plain_text_example_list(
    wxr: WiktextractContext, list_item: WikiNode, example_data: Example
) -> None:
    """Handle an example whose citation line is followed by a nested list
    holding the example text as plain wikitext (no template involved)."""
    for idx, sub_list in list_item.find_child(NodeKind.LIST, with_index=True):
        # everything before the nested list is the reference line
        ref_nodes = list_item.children[:idx]
        example_data.ref = clean_node(wxr, None, ref_nodes)
        # the first item of the nested list is the example text itself
        first_item = sub_list.children[0]
        example_data.text = clean_node(wxr, None, first_item.children)
def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `quote-*` and "RQ:*" templates.

    Reads the expanded HTML: the "cited-source" span becomes the reference,
    "e-quotation" the example text (with ruby separated out), "e-translation"
    the translation, and the first "e-transliteration" italic the romanization.
    """
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        if css_class == "cited-source":
            example_data.ref = clean_node(wxr, None, span)
        elif "e-quotation" in css_class:
            example_data.ruby, without_ruby = extract_ruby(wxr, span)
            example_data.text = clean_node(wxr, None, without_ruby)
            calculate_bold_offsets(
                wxr,
                span,
                example_data.text,
                example_data,
                "bold_text_offsets",
            )
        elif "e-translation" in css_class:
            example_data.translation = clean_node(wxr, None, span)
            calculate_bold_offsets(
                wxr,
                span,
                example_data.translation,
                example_data,
                "bold_translation_offsets",
            )
    # keep only the first transliteration
    for italic in expanded.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data.roman = clean_node(wxr, None, italic)
        calculate_bold_offsets(
            wxr,
            italic,
            example_data.roman,
            example_data,
            "bold_roman_offsets",
        )
        break
161def extract_template_ja_usex(
162 wxr: WiktextractContext, node: TemplateNode, example_data: Example
163) -> None:
164 expanded_node = wxr.wtp.parse(
165 wxr.wtp.node_to_wikitext(node), expand_all=True
166 )
167 for span_tag in expanded_node.find_html(
168 "span", attr_name="class", attr_value="Jpan"
169 ):
170 ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
171 example_data.text = clean_node(wxr, None, node_without_ruby)
172 calculate_bold_offsets(
173 wxr,
174 wxr.wtp.parse(wxr.wtp.node_to_wikitext(node_without_ruby)),
175 example_data.text,
176 example_data,
177 "bold_text_offsets",
178 )
179 example_data.ruby = ruby_data
180 for span_tag in expanded_node.find_html_recursively(
181 "span", attr_name="class", attr_value="tr"
182 ):
183 example_data.roman = clean_node(wxr, None, span_tag)
184 calculate_bold_offsets(
185 wxr,
186 span_tag,
187 example_data.roman,
188 example_data,
189 "bold_roman_offsets",
190 )
191 tr_arg = wxr.wtp.parse(
192 wxr.wtp.node_to_wikitext(node.template_parameters.get(3, "")),
193 expand_all=True,
194 )
195 example_data.translation = clean_node(wxr, None, tr_arg)
196 calculate_bold_offsets(
197 wxr,
198 tr_arg,
199 example_data.translation,
200 example_data,
201 "bold_translation_offsets",
202 )
203 lit_arg = wxr.wtp.parse(
204 wxr.wtp.node_to_wikitext(node.template_parameters.get("lit", "")),
205 expand_all=True,
206 )
207 example_data.literal_meaning = clean_node(wxr, None, lit_arg)
208 calculate_bold_offsets(
209 wxr,
210 lit_arg,
211 example_data.literal_meaning,
212 example_data,
213 "bold_literal_offsets",
214 )
217def extract_template_zh_x(
218 wxr: WiktextractContext,
219 template_node: TemplateNode,
220 parent_example: Example,
221) -> list[Example]:
222 expanded_node = wxr.wtp.parse(
223 wxr.wtp.node_to_wikitext(template_node), expand_all=True
224 )
225 has_dl_tag = False
226 results = []
227 example_data = parent_example.model_copy(deep=True)
228 tr_arg = wxr.wtp.parse(
229 wxr.wtp.node_to_wikitext(template_node.template_parameters.get(2, "")),
230 expand_all=True,
231 )
232 example_data.translation = clean_node(wxr, None, tr_arg)
233 calculate_bold_offsets(
234 wxr,
235 tr_arg,
236 example_data.translation,
237 example_data,
238 "bold_translation_offsets",
239 )
240 lit_arg = wxr.wtp.parse(
241 wxr.wtp.node_to_wikitext(
242 template_node.template_parameters.get("lit", "")
243 ),
244 expand_all=True,
245 )
246 example_data.literal_meaning = clean_node(wxr, None, lit_arg)
247 calculate_bold_offsets(
248 wxr,
249 lit_arg,
250 example_data.literal_meaning,
251 example_data,
252 "bold_literal_offsets",
253 )
254 for dl_tag in expanded_node.find_html_recursively("dl"):
255 has_dl_tag = True
256 for dd_tag in dl_tag.find_html("dd"):
257 dd_text = clean_node(wxr, None, dd_tag)
258 if dd_text.startswith("出自:"):
259 example_data.ref = dd_text.removeprefix("出自:")
260 elif not dd_text.startswith("(字面義為"): 260 ↛ 256line 260 didn't jump to line 256 because the condition on line 260 was always true
261 for span_tag in dd_tag.find_html_recursively(
262 "span", attr_name="lang", attr_value="Latn"
263 ):
264 example_data.roman = clean_node(wxr, None, span_tag)
265 calculate_bold_offsets(
266 wxr,
267 span_tag,
268 example_data.roman,
269 example_data,
270 "bold_roman_offsets",
271 )
272 for span_tag in dd_tag.find_html_recursively("span"):
273 span_text = clean_node(wxr, None, span_tag)
274 if span_text.startswith("[") and span_text.endswith(
275 "]"
276 ):
277 example_data.raw_tags.append(span_text.strip("[]"))
278 break
279 results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))
281 # no source, single line example
282 if not has_dl_tag:
283 for span_tag in expanded_node.find_html( 283 ↛ 295line 283 didn't jump to line 295 because the loop on line 283 didn't complete
284 "span", attr_name="lang", attr_value="Latn"
285 ):
286 example_data.roman = clean_node(wxr, None, span_tag)
287 calculate_bold_offsets(
288 wxr,
289 span_tag,
290 example_data.roman,
291 example_data,
292 "bold_roman_offsets",
293 )
294 break
295 for span_tag in expanded_node.find_html("span"):
296 span_text = clean_node(wxr, None, span_tag)
297 if span_text.startswith("[") and span_text.endswith("]"):
298 example_data.raw_tags.append(span_text.strip("[]"))
299 for span_tag in expanded_node.find_html("span"):
300 span_lang = span_tag.attrs.get("lang", "")
301 if span_lang in ["zh-Hant", "zh-Hans"]:
302 example_text = clean_node(wxr, None, span_tag)
303 if len(example_text) > 0: 303 ↛ 299line 303 didn't jump to line 299 because the condition on line 303 was always true
304 new_example = example_data.model_copy(deep=True)
305 new_example.text = example_text
306 calculate_bold_offsets(
307 wxr,
308 span_tag,
309 example_text,
310 new_example,
311 "bold_text_offsets",
312 )
313 new_example.tags.append(
314 "Traditional-Chinese"
315 if span_lang == "zh-Hant"
316 else "Simplified-Chinese"
317 )
318 translate_raw_tags(new_example)
319 results.append(new_example)
320 return results
323def extract_zh_x_dl_span_tag(
324 wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
325) -> list[Example]:
326 # process example text span tag and dialect span tag
327 results = []
328 is_first_hide = True
329 for span_tag in dl_tag.find_html("span"):
330 span_lang = span_tag.attrs.get("lang", "")
331 if span_lang in ["zh-Hant", "zh-Hans"]:
332 new_example = example.model_copy(deep=True)
333 new_example.text = clean_node(wxr, None, span_tag)
334 calculate_bold_offsets(
335 wxr,
336 span_tag,
337 new_example.text,
338 new_example,
339 "bold_text_offsets",
340 )
341 results.append(new_example)
342 elif "vsHide" in span_tag.attrs.get("class", ""):
343 # template has arg "collapsed=y"
344 results.extend(
345 extract_zh_x_dl_span_tag(
346 wxr,
347 span_tag,
348 results[-1]
349 if is_first_hide and len(results) > 0
350 else example,
351 )
352 )
353 is_first_hide = False
354 elif "font-size:x-small" in span_tag.attrs.get("style", ""): 354 ↛ 329line 354 didn't jump to line 329 because the condition on line 354 was always true
355 for link_node in span_tag.find_child(NodeKind.LINK):
356 raw_tag = clean_node(wxr, None, link_node)
357 if len(raw_tag) > 0: 357 ↛ 355line 357 didn't jump to line 355 because the condition on line 357 was always true
358 if len(results) > 0:
359 results[-1].raw_tags.append(raw_tag)
360 else:
361 example.raw_tags.append(raw_tag)
363 if dl_tag.tag == "dl":
364 for data in results:
365 translate_raw_tags(data)
366 return results
369def extract_template_ux(
370 wxr: WiktextractContext, node: TemplateNode, example_data: Example
371) -> None:
372 # https://zh.wiktionary.org/wiki/Template:ux
373 expanded_node = wxr.wtp.parse(
374 wxr.wtp.node_to_wikitext(node), expand_all=True
375 )
376 for html_node in expanded_node.find_child_recursively(NodeKind.HTML):
377 class_names = html_node.attrs.get("class", "")
378 if "e-example" in class_names:
379 example_data.text = clean_node(wxr, None, html_node)
380 calculate_bold_offsets(
381 wxr,
382 html_node,
383 example_data.text,
384 example_data,
385 "bold_text_offsets",
386 )
387 elif "e-transliteration" in class_names:
388 example_data.roman = clean_node(wxr, None, html_node)
389 calculate_bold_offsets(
390 wxr,
391 html_node,
392 example_data.roman,
393 example_data,
394 "bold_roman_offsets",
395 )
396 elif "e-translation" in class_names:
397 example_data.translation = clean_node(wxr, None, html_node)
398 calculate_bold_offsets(
399 wxr,
400 html_node,
401 example_data.translation,
402 example_data,
403 "bold_translation_offsets",
404 )
405 elif "e-literally" in class_names:
406 example_data.literal_meaning = clean_node(wxr, None, html_node)
407 calculate_bold_offsets(
408 wxr,
409 html_node,
410 example_data.literal_meaning,
411 example_data,
412 "bold_literal_offsets",
413 )
414 elif "qualifier-content" in class_names:
415 example_data.raw_tags.extend(
416 clean_node(wxr, None, html_node).split("、")
417 )
418 translate_raw_tags(example_data)
421def extract_template_Q(
422 wxr: WiktextractContext, node: TemplateNode, example_data: Example
423) -> None:
424 # https://zh.wiktionary.org/wiki/Template:Q
425 expanded_node = wxr.wtp.parse(
426 wxr.wtp.node_to_wikitext(node), expand_all=True
427 )
428 for div_tag in expanded_node.find_html(
429 "div", attr_name="class", attr_value="wiktQuote"
430 ):
431 ref_nodes = []
432 for child in div_tag.children: 432 ↛ 447line 432 didn't jump to line 447 because the loop on line 432 didn't complete
433 if isinstance(child, HTMLNode) and child.tag == "dl":
434 for i_tag in child.find_html_recursively(
435 "i", attr_name="class", attr_value="e-transliteration"
436 ):
437 example_data.roman = clean_node(wxr, None, i_tag)
438 calculate_bold_offsets(
439 wxr,
440 i_tag,
441 example_data.roman,
442 example_data,
443 "bold_roman_offsets",
444 )
445 break
446 ref_nodes.append(child)
447 ref_text = clean_node(wxr, None, ref_nodes)
448 if len(ref_text) > 0: 448 ↛ 450line 448 didn't jump to line 450 because the condition on line 448 was always true
449 example_data.ref = ref_text
450 for t_arg, field in (
451 ("quote", "text"),
452 ("t", "translation"),
453 ("trans", "translation"),
454 ("lit", "literal_meaning"),
455 ):
456 t_arg_node = wxr.wtp.parse(
457 wxr.wtp.node_to_wikitext(
458 node.template_parameters.get(t_arg, "")
459 ),
460 expand_all=True,
461 )
462 value = clean_node(wxr, None, t_arg_node)
463 if len(value) > 0:
464 setattr(example_data, field, value)
465 calculate_bold_offsets(
466 wxr,
467 t_arg_node,
468 value,
469 example_data,
470 "bold_" + field.split("_")[0] + "_offsets",
471 )
def extract_inline_alt_forms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract {{inline alt forms}} / {{alti}} into ``word_entry.forms``.

    Each span matching the template's language argument becomes a Form tagged
    "alternative"; a preceding qualifier span is attached to the next form,
    and a "tr Latn" span supplies the romanization of the previous form.
    """
    gloss_text = " ".join(word_entry.senses[-1].glosses)
    new_forms = []
    pending_raw_tag = ""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        span_lang = span.attrs.get("lang", "")
        if "qualifier-content" in css_class:
            # remember the qualifier; it applies to the next form word
            pending_raw_tag = clean_node(wxr, None, span)
        elif span_lang == lang_code:
            word = clean_node(wxr, None, span)
            if word != "":
                form = Form(form=word, sense=gloss_text, tags=["alternative"])
                if pending_raw_tag != "":
                    form.raw_tags.append(pending_raw_tag)
                    pending_raw_tag = ""
                translate_raw_tags(form)
                new_forms.append(form)
        elif css_class == "tr Latn" and len(new_forms) > 0:
            new_forms[-1].roman = clean_node(wxr, None, span)
    word_entry.forms.extend(new_forms)