Coverage for src/wiktextract/extractor/vi/example.py: 25%

172 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from ..share import calculate_bold_offsets 

7from .linkage import ( 

8 GLOSS_LIST_LINKAGE_TEMPLATES, 

9 extract_gloss_list_linkage_template, 

10) 

11from .models import Example, Sense, WordEntry 

12from .tags import translate_raw_tags 

13 

14 

def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    ref: str = "",
):
    """Walk one example list item and attach the examples found to *sense*.

    Handles three shapes of wikitext: a plain ''italic'' example containing
    a '''bold''' headword (translation taken from the trailing siblings),
    a known example/quote/linkage template, or a nested sub-list which is
    processed recursively. *ref* carries a reference string produced by a
    preceding quote template down to later examples.
    """
    ux_names = frozenset(
        [
            "ux",
            "usex",
            "ux2",
            "uxi",
            "collocation",
            "th-usex",
            "th-x",
            "tha-x",
            "tha-usex",
        ]
    )
    ja_names = frozenset(["ja-usex", "ja-x", "jpn-usex"])
    zh_names = frozenset(["zho-x", "zh-x", "zh-usex", "zho-usex", "zhex"])

    for index, node in enumerate(list_item.children):
        is_wiki = isinstance(node, WikiNode)
        if (
            is_wiki
            and node.kind == NodeKind.ITALIC
            and node.contain_node(NodeKind.BOLD)
        ):
            example_text = clean_node(wxr, None, node)
            if example_text != "":
                example = Example(text=example_text)
                calculate_bold_offsets(
                    wxr, node, example_text, example, "bold_text_offsets"
                )
                # Everything after the italic node is the translation,
                # minus leading/trailing dash decoration.
                example.translation = clean_node(
                    wxr, None, list_item.children[index + 1 :]
                ).strip("—- \n")
                sense.examples.append(example)
                break
        elif isinstance(node, TemplateNode):
            t_name = node.template_name
            if t_name in ux_names:
                extract_ux_template(wxr, sense, node)
            elif t_name.startswith(("quote-", "RQ:")):
                # A reference-only quote template yields a ref string that
                # later templates in this list item may reuse.
                ref = extract_quote_template(wxr, sense, node)
            elif t_name in GLOSS_LIST_LINKAGE_TEMPLATES:
                gloss_str = (
                    " ".join(word_entry.senses[-1].glosses)
                    if len(word_entry.senses) > 0
                    else ""
                )
                extract_gloss_list_linkage_template(
                    wxr,
                    word_entry,
                    node,
                    GLOSS_LIST_LINKAGE_TEMPLATES[t_name],
                    gloss_str,
                )
            elif t_name in ja_names:
                extract_ja_x_template(wxr, node, sense, ref)
            elif t_name in zh_names:
                extract_zh_x_template(wxr, node, sense, ref)
        elif is_wiki and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, child_list_item, ref
                )

79 

80 

def extract_ux_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Parse an expanded usage-example template ({{ux}} family).

    The template is expanded to HTML, then the rendered <i>/<span> tags are
    scanned by CSS class for the example text, transliteration, translation,
    literal meaning and qualifier tags. The example is only kept when a
    non-empty example text was found.
    """
    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="")

    for i_tag in root.find_html_recursively("i"):
        css_class = i_tag.attrs.get("class", "")
        if "e-example" in css_class:
            example.text = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.text, example, "bold_text_offsets"
            )
        elif "e-transliteration" in css_class:
            example.roman = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.roman, example, "bold_roman_offsets"
            )

    for span_tag in root.find_html_recursively("span"):
        css_class = span_tag.attrs.get("class", "")
        if "e-translation" in css_class:
            example.translation = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example.translation,
                example,
                "bold_translation_offsets",
            )
        elif "e-literally" in css_class:
            example.literal_meaning = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example.literal_meaning,
                example,
                "bold_literal_offsets",
            )
        elif "qualifier-content" in css_class:
            qualifier = clean_node(wxr, None, span_tag)
            if qualifier != "":
                example.raw_tags.append(qualifier)

    example.ref = clean_node(
        wxr, None, t_node.template_parameters.get("ref", "")
    )
    if example.text != "":
        translate_raw_tags(example)
        sense.examples.append(example)
    # Category links produced by the template are recorded on the sense.
    for link_node in root.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)

133 

134 

def extract_quote_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> str:
    """Parse a citation template ("quote-*" or "RQ:*") into an Example.

    When the template carries no quoted passage (no "text", "passage" or
    positional argument 7), the rendered template is treated as a bare
    reference line and returned, so later examples in the same list item
    can reuse it. Otherwise the expanded HTML is scanned for the cited
    source, quotation (with ruby stripped), translation and the first
    transliteration, and the result is appended to *sense* when a
    non-empty quotation was found.
    """
    ref = ""
    if all(
        arg not in t_node.template_parameters for arg in ["text", "passage", 7]
    ):
        # Reference-only template: keep the rendered text for later examples.
        ref = clean_node(wxr, sense, t_node)
    else:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        example = Example(text="")
        for span_tag in expanded_node.find_html_recursively("span"):
            span_class = span_tag.attrs.get("class", "")
            if "cited-source" == span_class:
                example.ref = clean_node(wxr, None, span_tag)
            elif "e-quotation" in span_class:
                example.ruby, node_without_ruby = extract_ruby(wxr, span_tag)
                example.text = clean_node(wxr, None, node_without_ruby)
                calculate_bold_offsets(
                    wxr, span_tag, example.text, example, "bold_text_offsets"
                )
            elif "e-translation" in span_class:
                example.translation = clean_node(wxr, None, span_tag)
                calculate_bold_offsets(
                    wxr,
                    span_tag,
                    example.translation,
                    example,
                    # Fixed: was "bold_translation_text", which doesn't match
                    # the "bold_translation_offsets" field name used by every
                    # other call site in this module.
                    "bold_translation_offsets",
                )
        for i_tag in expanded_node.find_html_recursively(
            "i", attr_name="class", attr_value="e-transliteration"
        ):
            example.roman = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.roman, example, "bold_roman_offsets"
            )
            break  # only the first transliteration is used
        if example.text != "":
            sense.examples.append(example)
        # Record any categories/links produced by the expansion.
        clean_node(wxr, sense, expanded_node)

    return ref

182 

183 

def extract_ja_x_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense, ref: str
) -> None:
    """Handle Japanese usage-example templates ({{ja-usex}} family).

    Reads the Japanese text (ruby stripped) from the "Jpan" span, the
    transliteration from the "tr" span, the translation from positional
    argument 3 and the literal meaning from the "lit" argument. The
    example inherits *ref* from a preceding quote template.
    """
    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="", ref=ref)

    for jpan_span in root.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        example.ruby, stripped = extract_ruby(wxr, jpan_span)
        example.text = clean_node(wxr, None, stripped)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(stripped)),
            example.text,
            example,
            "bold_text_offsets",
        )

    for tr_span in root.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr, tr_span, example.roman, example, "bold_roman_offsets"
        )

    # Translation (positional 3) and literal meaning ("lit") are both
    # template arguments processed the same way.
    for arg_value, field, offsets_field in (
        (
            t_node.template_parameters.get(3, ""),
            "translation",
            "bold_translation_offsets",
        ),
        (
            t_node.template_parameters.get("lit", ""),
            "literal_meaning",
            "bold_literal_offsets",
        ),
    ):
        cleaned = clean_node(wxr, None, arg_value)
        setattr(example, field, cleaned)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(arg_value)),
            cleaned,
            example,
            offsets_field,
        )

    if example.text != "":
        sense.examples.append(example)
    for link_node in root.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)

233 

234 

def extract_zh_x_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense, ref: str
):
    """Handle Chinese usage-example templates ({{zh-x}} family).

    The expanded HTML either wraps the example in a <dl> tag (handled by
    extract_zh_x_dl_tag) or not (extract_zh_x_no_dl_tag). The shared
    translation comes from positional argument 2 and is applied to every
    example found.

    NOTE(review): the *ref* parameter is currently not applied to the
    examples here; the <dl> path derives its own ref — confirm intended.
    """
    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    examples: list[Example] = []
    for dl_tag in root.find_html("dl"):
        examples.extend(extract_zh_x_dl_tag(wxr, dl_tag))
    if len(examples) == 0:
        examples.extend(extract_zh_x_no_dl_tag(wxr, root))

    translation_arg = t_node.template_parameters.get(2, "")
    translation = clean_node(wxr, None, translation_arg)
    for example in examples:
        example.translation = translation
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
            translation,
            example,
            "bold_translation_offsets",
        )
        translate_raw_tags(example)

    for link_node in root.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)
    sense.examples.extend(examples)

263 

264 

def extract_zh_x_dl_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode
) -> list[Example]:
    """Extract examples from the <dl> layout of an expanded {{zh-x}}.

    Direct <span> children with a "lang" attribute are example texts
    (traditional/simplified variants each become one Example); spans
    without it hold comma-separated raw tags for the preceding example.
    The <dd> children supply the shared romanization, extra tags and the
    reference (a <small> tag, with its "Từ:" prefix removed).
    """
    examples: list[Example] = []
    for span_tag in dl_tag.find_html("span"):
        if "lang" in span_tag.attrs:
            text = clean_node(wxr, None, span_tag)
            if text != "":
                example = Example(text=text)
                calculate_bold_offsets(
                    wxr, span_tag, text, example, "bold_text_offsets"
                )
                examples.append(example)
        else:
            # Bracketed tag list applying to the most recent example.
            tag_str = clean_node(wxr, None, span_tag).strip("[] ")
            for raw_tag in tag_str.split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "" and len(examples) > 0:
                    examples[-1].raw_tags.append(raw_tag)

    ref = ""
    for dd_tag in dl_tag.find_html("dd"):
        for span_tag in dd_tag.find_html("span"):
            if "Latn" in span_tag.attrs.get("lang", ""):
                roman = clean_node(wxr, None, span_tag)
                for example in examples:
                    example.roman = roman
                    calculate_bold_offsets(
                        wxr, span_tag, roman, example, "bold_roman_offsets"
                    )
            else:
                raw_tag = clean_node(wxr, None, span_tag).strip("[] ")
                if raw_tag != "":
                    for example in examples:
                        example.raw_tags.append(raw_tag)
        for small_tag in dd_tag.find_html("small"):
            ref = clean_node(wxr, None, small_tag).removeprefix("Từ:").strip()
            for example in examples:
                example.ref = ref

    return examples

305 

306 

def extract_zh_x_no_dl_tag(
    wxr: WiktextractContext, expanded_node: WikiNode
) -> list[Example]:
    """Extract examples from {{zh-x}} output that has no <dl> wrapper.

    Spans with lang "zh-Hant"/"zh-Hans" each become one tagged Example;
    a "zh-Latn" span supplies the romanization for every example already
    collected.
    """
    examples: list[Example] = []
    for span_tag in expanded_node.find_html("span"):
        lang = span_tag.attrs.get("lang", "")
        if lang == "zh-Latn":
            roman = clean_node(wxr, None, span_tag)
            for example in examples:
                example.roman = roman
                calculate_bold_offsets(
                    wxr, span_tag, roman, example, "bold_roman_offsets"
                )
        elif lang in ("zh-Hant", "zh-Hans"):
            text = clean_node(wxr, None, span_tag)
            example = Example(text=text)
            script_tag = (
                "Traditional-Chinese"
                if lang == "zh-Hant"
                else "Simplified-Chinese"
            )
            example.tags.append(script_tag)
            if example.text != "":
                calculate_bold_offsets(
                    wxr, span_tag, text, example, "bold_text_offsets"
                )
                examples.append(example)

    return examples