Coverage for src / wiktextract / extractor / vi / example.py: 25%

172 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

1from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from ..share import calculate_bold_offsets 

7from .linkage import ( 

8 GLOSS_LIST_LINKAGE_TEMPLATES, 

9 extract_gloss_list_linkage_template, 

10) 

11from .models import Example, Sense, WordEntry 

12from .tags import translate_raw_tags 

13 

14 

def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    ref: str = "",
):
    """Walk one example list item and attach extracted examples to *sense*.

    Recognizes three shapes of wikitext:
      * a plain italic-with-bold example followed by its translation
        (separated by a dash),
      * usage-example templates (``ux`` family, ``quote-*``/``RQ:``,
        gloss-list linkage, ``ja-x`` family, ``zh-x`` family),
      * nested sub-lists, which are handled recursively.

    ``ref`` carries a citation produced by an earlier ``quote-*`` template
    on the same line down to subsequent CJK example templates.
    """
    ux_names = (
        "ux",
        "usex",
        "ux2",
        "uxi",
        "collocation",
        "th-usex",
        "th-x",
        "tha-x",
        "tha-usex",
        "uxa",
    )
    ja_names = ("ja-usex", "ja-x", "jpn-usex")
    zh_names = ("zho-x", "zh-x", "zh-usex", "zho-usex", "zhex")
    for idx, child in enumerate(list_item.children):
        is_wikinode = isinstance(child, WikiNode)
        if (
            is_wikinode
            and child.kind == NodeKind.ITALIC
            and child.contain_node(NodeKind.BOLD)
        ):
            example_text = clean_node(wxr, None, child)
            if example_text:
                example = Example(text=example_text)
                calculate_bold_offsets(
                    wxr, child, example_text, example, "bold_text_offsets"
                )
                # Everything after the italic node is the translation,
                # typically separated from the example by a dash.
                example.translation = clean_node(
                    wxr, None, list_item.children[idx + 1 :]
                ).strip("—- \n")
                sense.examples.append(example)
                break
        elif isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name in ux_names:
                extract_ux_template(wxr, sense, child)
            elif t_name.startswith(("quote-", "RQ:")):
                ref = extract_quote_template(wxr, sense, child)
            elif t_name in GLOSS_LIST_LINKAGE_TEMPLATES:
                last_gloss = (
                    " ".join(word_entry.senses[-1].glosses)
                    if len(word_entry.senses) > 0
                    else ""
                )
                extract_gloss_list_linkage_template(
                    wxr,
                    word_entry,
                    child,
                    GLOSS_LIST_LINKAGE_TEMPLATES[t_name],
                    last_gloss,
                )
            elif t_name in ja_names:
                extract_ja_x_template(wxr, child, sense, ref)
            elif t_name in zh_names:
                extract_zh_x_template(wxr, child, sense, ref)
        elif is_wikinode and child.kind == NodeKind.LIST:
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, sub_item, ref
                )

80 

81 

def extract_ux_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Extract a ``ux``-family usage-example template.

    Expands the template and mines the resulting HTML for the example
    text, transliteration, translation, literal meaning and qualifier
    tags, appending one ``Example`` to ``sense.examples`` when example
    text was found.  Category links in the expansion are recorded on
    *sense* as a side effect of ``clean_node``.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="")
    for i_tag in expanded.find_html_recursively("i"):
        css = i_tag.attrs.get("class", "")
        if "e-example" in css:
            example.text = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.text, example, "bold_text_offsets"
            )
        elif "e-transliteration" in css:
            example.roman = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.roman, example, "bold_roman_offsets"
            )
    for span_tag in expanded.find_html_recursively("span"):
        css = span_tag.attrs.get("class", "")
        if "e-translation" in css:
            example.translation = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example.translation,
                example,
                "bold_translation_offsets",
            )
        elif "e-literally" in css:
            example.literal_meaning = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example.literal_meaning,
                example,
                "bold_literal_offsets",
            )
        elif "qualifier-content" in css:
            qualifier = clean_node(wxr, None, span_tag)
            if qualifier:
                example.raw_tags.append(qualifier)

    example.ref = clean_node(
        wxr, None, t_node.template_parameters.get("ref", "")
    )
    if example.text:
        translate_raw_tags(example)
        sense.examples.append(example)
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)

134 

135 

def extract_quote_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> str:
    """Extract a ``quote-*`` or ``RQ:`` citation template.

    When the template carries no quoted passage (no ``text``/``passage``
    named argument and no 7th positional argument), the whole template is
    treated as a bare citation and returned so following example templates
    on the same list item can reuse it.  Otherwise the expanded HTML is
    mined for the quotation, cited source, translation and
    transliteration, and an ``Example`` is appended to *sense*.

    Returns the citation string ("" when a passage was present).
    """
    ref = ""
    if all(
        arg not in t_node.template_parameters for arg in ["text", "passage", 7]
    ):
        ref = clean_node(wxr, sense, t_node)
    else:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        example = Example(text="")
        for span_tag in expanded_node.find_html_recursively("span"):
            span_class = span_tag.attrs.get("class", "")
            if "cited-source" == span_class:
                example.ref = clean_node(wxr, None, span_tag)
            elif "e-quotation" in span_class:
                example.ruby, node_without_ruby = extract_ruby(wxr, span_tag)
                example.text = clean_node(wxr, None, node_without_ruby)
                calculate_bold_offsets(
                    wxr, span_tag, example.text, example, "bold_text_offsets"
                )
            elif "e-translation" in span_class:
                example.translation = clean_node(wxr, None, span_tag)
                calculate_bold_offsets(
                    wxr,
                    span_tag,
                    example.translation,
                    example,
                    # Fixed: was "bold_translation_text", which is not an
                    # Example field; every other extractor in this file
                    # uses "bold_translation_offsets".
                    "bold_translation_offsets",
                )
        # Only the first transliteration <i> tag is used.
        for i_tag in expanded_node.find_html_recursively(
            "i", attr_name="class", attr_value="e-transliteration"
        ):
            example.roman = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.roman, example, "bold_roman_offsets"
            )
            break
        if example.text != "":
            sense.examples.append(example)
        # Record category links from the expansion on the sense.
        clean_node(wxr, sense, expanded_node)

    return ref

183 

184 

def extract_ja_x_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense, ref: str
) -> None:
    """Extract a Japanese usage-example template (``ja-usex``/``ja-x``).

    Pulls the Japanese text (with ruby) from ``<span class="Jpan">``,
    romanization from ``<span class="tr">``, the translation from the
    third positional argument and the literal meaning from ``lit``.
    *ref* is a citation inherited from a preceding quote template.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="", ref=ref)
    # Japanese text lives inside <span class="Jpan">, possibly with ruby.
    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, without_ruby = extract_ruby(wxr, jpan_span)
        example.text = clean_node(wxr, None, without_ruby)
        example.ruby = ruby_data
        calculate_bold_offsets(
            wxr,
            # Re-parse so offsets are computed on the ruby-free wikitext.
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(without_ruby)),
            example.text,
            example,
            "bold_text_offsets",
        )
    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr, tr_span, example.roman, example, "bold_roman_offsets"
        )
    # Third positional argument is the translation.
    tr_arg = t_node.template_parameters.get(3, "")
    example.translation = clean_node(wxr, None, tr_arg)
    calculate_bold_offsets(
        wxr,
        wxr.wtp.parse(wxr.wtp.node_to_wikitext(tr_arg)),
        example.translation,
        example,
        "bold_translation_offsets",
    )
    lit_arg = t_node.template_parameters.get("lit", "")
    example.literal_meaning = clean_node(wxr, None, lit_arg)
    calculate_bold_offsets(
        wxr,
        wxr.wtp.parse(wxr.wtp.node_to_wikitext(lit_arg)),
        example.literal_meaning,
        example,
        "bold_literal_offsets",
    )
    if example.text != "":
        sense.examples.append(example)
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)

234 

235 

def extract_zh_x_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense, ref: str
):
    """Extract a Chinese usage-example template (``zh-x`` family).

    The expansion comes in two layouts: a ``<dl>`` list (delegated to
    ``extract_zh_x_dl_tag``) or a flat run of spans (delegated to
    ``extract_zh_x_no_dl_tag``).  The second positional template argument
    supplies one shared translation for every extracted example.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    examples: list[Example] = []
    for dl_tag in expanded.find_html("dl"):
        examples.extend(extract_zh_x_dl_tag(wxr, dl_tag))
    if not examples:
        examples.extend(extract_zh_x_no_dl_tag(wxr, expanded))

    translation_arg = t_node.template_parameters.get(2, "")
    translation = clean_node(wxr, None, translation_arg)
    for example in examples:
        example.translation = translation
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
            translation,
            example,
            "bold_translation_offsets",
        )
        translate_raw_tags(example)

    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)
    sense.examples.extend(examples)

264 

265 

def extract_zh_x_dl_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode
) -> list[Example]:
    """Extract examples from the ``<dl>`` layout of a ``zh-x`` expansion.

    Top-level spans with a ``lang`` attribute hold the example text
    (traditional/simplified variants become separate examples); spans
    without ``lang`` hold bracketed qualifier tags for the preceding
    example.  ``<dd>`` children supply the shared romanization, extra
    tags, and the citation from a ``<small>`` tag.
    """
    examples: list[Example] = []
    for span_tag in dl_tag.find_html("span"):
        if "lang" in span_tag.attrs:
            text = clean_node(wxr, None, span_tag)
            if text:
                example = Example(text=text)
                calculate_bold_offsets(
                    wxr, span_tag, text, example, "bold_text_offsets"
                )
                examples.append(example)
        else:
            # Bracketed qualifier, possibly comma-separated; applies to
            # the most recent example.
            tag_str = clean_node(wxr, None, span_tag).strip("[] ")
            for raw_tag in tag_str.split(","):
                raw_tag = raw_tag.strip()
                if raw_tag and examples:
                    examples[-1].raw_tags.append(raw_tag)
    ref = ""
    for dd_tag in dl_tag.find_html("dd"):
        for span_tag in dd_tag.find_html("span"):
            if "Latn" in span_tag.attrs.get("lang", ""):
                # Shared romanization for every text variant.
                roman = clean_node(wxr, None, span_tag)
                for example in examples:
                    example.roman = roman
                    calculate_bold_offsets(
                        wxr, span_tag, roman, example, "bold_roman_offsets"
                    )
            else:
                raw_tag = clean_node(wxr, None, span_tag).strip("[] ")
                if raw_tag:
                    for example in examples:
                        example.raw_tags.append(raw_tag)
        for small_tag in dd_tag.find_html("small"):
            ref = clean_node(wxr, None, small_tag).removeprefix("Từ:").strip()
    for example in examples:
        example.ref = ref

    return examples

306 

307 

def extract_zh_x_no_dl_tag(
    wxr: WiktextractContext, expanded_node: WikiNode
) -> list[Example]:
    """Extract examples from a ``zh-x`` expansion that has no ``<dl>``.

    Spans with ``lang="zh-Hant"``/``zh-Hans`` each start a new example
    tagged with the script variant; a later ``lang="zh-Latn"`` span sets
    the romanization on every example collected so far.
    """
    examples: list[Example] = []
    for span_tag in expanded_node.find_html("span"):
        lang = span_tag.attrs.get("lang", "")
        if lang == "zh-Latn":
            roman = clean_node(wxr, None, span_tag)
            for example in examples:
                example.roman = roman
                calculate_bold_offsets(
                    wxr, span_tag, roman, example, "bold_roman_offsets"
                )
        elif lang in ("zh-Hant", "zh-Hans"):
            text = clean_node(wxr, None, span_tag)
            example = Example(text=text)
            example.tags.append(
                "Traditional-Chinese"
                if lang == "zh-Hant"
                else "Simplified-Chinese"
            )
            if example.text != "":
                calculate_bold_offsets(
                    wxr, span_tag, text, example, "bold_text_offsets"
                )
                examples.append(example)

    return examples