Coverage for src/wiktextract/extractor/vi/example.py: 25%

172 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from ..ruby import extract_ruby 

6from ..share import calculate_bold_offsets 

7from .linkage import ( 

8 GLOSS_LIST_LINKAGE_TEMPLATES, 

9 extract_gloss_list_linkage_template, 

10) 

11from .models import Example, Sense, WordEntry 

12from .tags import translate_raw_tags 

13 

14 

def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    ref: str = "",
):
    """Walk one example list item and attach the examples found to *sense*.

    Handles three shapes of wikitext: a plain ''italic'' example containing
    a '''bold''' headword (translation taken from the trailing siblings),
    a known example/quote/linkage template, or a nested sub-list which is
    processed recursively. *ref* carries a reference string produced by a
    preceding quote template down to later examples.
    """
    ux_names = frozenset(
        [
            "ux",
            "usex",
            "ux2",
            "uxi",
            "collocation",
            "th-usex",
            "th-x",
            "tha-x",
            "tha-usex",
        ]
    )
    ja_names = frozenset(["ja-usex", "ja-x", "jpn-usex"])
    zh_names = frozenset(["zho-x", "zh-x", "zh-usex", "zho-usex", "zhex"])

    for index, node in enumerate(list_item.children):
        is_wiki = isinstance(node, WikiNode)
        if (
            is_wiki
            and node.kind == NodeKind.ITALIC
            and node.contain_node(NodeKind.BOLD)
        ):
            example_text = clean_node(wxr, None, node)
            if example_text != "":
                example = Example(text=example_text)
                calculate_bold_offsets(
                    wxr, node, example_text, example, "bold_text_offsets"
                )
                # Everything after the italic node is the translation,
                # minus leading/trailing dash decoration.
                example.translation = clean_node(
                    wxr, None, list_item.children[index + 1 :]
                ).strip("—- \n")
                sense.examples.append(example)
                break
        elif isinstance(node, TemplateNode):
            t_name = node.template_name
            if t_name in ux_names:
                extract_ux_template(wxr, sense, node)
            elif t_name.startswith(("quote-", "RQ:")):
                # A reference-only quote template yields a ref string that
                # later templates in this list item may reuse.
                ref = extract_quote_template(wxr, sense, node)
            elif t_name in GLOSS_LIST_LINKAGE_TEMPLATES:
                gloss_str = (
                    " ".join(word_entry.senses[-1].glosses)
                    if len(word_entry.senses) > 0
                    else ""
                )
                extract_gloss_list_linkage_template(
                    wxr,
                    word_entry,
                    node,
                    GLOSS_LIST_LINKAGE_TEMPLATES[t_name],
                    gloss_str,
                )
            elif t_name in ja_names:
                extract_ja_x_template(wxr, node, sense, ref)
            elif t_name in zh_names:
                extract_zh_x_template(wxr, node, sense, ref)
        elif is_wiki and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, child_list_item, ref
                )

79 

80 

def extract_ux_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Parse an expanded usage-example template ({{ux}} family).

    The template is expanded to HTML, then the rendered <i>/<span> tags are
    scanned by CSS class for the example text, transliteration, translation,
    literal meaning and qualifier tags. The example is only kept when a
    non-empty example text was found.
    """
    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="")

    for i_tag in root.find_html_recursively("i"):
        css_class = i_tag.attrs.get("class", "")
        if "e-example" in css_class:
            example.text = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.text, example, "bold_text_offsets"
            )
        elif "e-transliteration" in css_class:
            example.roman = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.roman, example, "bold_roman_offsets"
            )

    for span_tag in root.find_html_recursively("span"):
        css_class = span_tag.attrs.get("class", "")
        if "e-translation" in css_class:
            example.translation = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example.translation,
                example,
                "bold_translation_offsets",
            )
        elif "e-literally" in css_class:
            example.literal_meaning = clean_node(wxr, None, span_tag)
            calculate_bold_offsets(
                wxr,
                span_tag,
                example.literal_meaning,
                example,
                "bold_literal_offsets",
            )
        elif "qualifier-content" in css_class:
            qualifier = clean_node(wxr, None, span_tag)
            if qualifier != "":
                example.raw_tags.append(qualifier)

    example.ref = clean_node(
        wxr, None, t_node.template_parameters.get("ref", "")
    )
    if example.text != "":
        translate_raw_tags(example)
        sense.examples.append(example)
    # Category links produced by the template are recorded on the sense.
    for link_node in root.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)

133 

134 

def extract_quote_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> str:
    """Parse a citation template ("quote-*" or "RQ:*") into an Example.

    When the template carries no quoted passage (no "text", "passage" or
    positional argument 7), the rendered template is treated as a bare
    reference line and returned, so later examples in the same list item
    can reuse it. Otherwise the expanded HTML is scanned for the cited
    source, quotation (with ruby stripped), translation and the first
    transliteration, and the result is appended to *sense* when a
    non-empty quotation was found.
    """
    ref = ""
    if all(
        arg not in t_node.template_parameters for arg in ["text", "passage", 7]
    ):
        # Reference-only template: keep the rendered text for later examples.
        ref = clean_node(wxr, sense, t_node)
    else:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        example = Example(text="")
        for span_tag in expanded_node.find_html_recursively("span"):
            span_class = span_tag.attrs.get("class", "")
            if "cited-source" == span_class:
                example.ref = clean_node(wxr, None, span_tag)
            elif "e-quotation" in span_class:
                example.ruby, node_without_ruby = extract_ruby(wxr, span_tag)
                example.text = clean_node(wxr, None, node_without_ruby)
                calculate_bold_offsets(
                    wxr, span_tag, example.text, example, "bold_text_offsets"
                )
            elif "e-translation" in span_class:
                example.translation = clean_node(wxr, None, span_tag)
                calculate_bold_offsets(
                    wxr,
                    span_tag,
                    example.translation,
                    example,
                    # Fixed: was "bold_translation_text", which doesn't match
                    # the "bold_translation_offsets" field name used by every
                    # other call site in this module.
                    "bold_translation_offsets",
                )
        for i_tag in expanded_node.find_html_recursively(
            "i", attr_name="class", attr_value="e-transliteration"
        ):
            example.roman = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.roman, example, "bold_roman_offsets"
            )
            break  # only the first transliteration is used
        if example.text != "":
            sense.examples.append(example)
        # Record any categories/links produced by the expansion.
        clean_node(wxr, sense, expanded_node)

    return ref

182 

183 

def extract_ja_x_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense, ref: str
) -> None:
    """Handle Japanese usage-example templates ({{ja-usex}} family).

    Reads the Japanese text (ruby stripped) from the "Jpan" span, the
    transliteration from the "tr" span, the translation from positional
    argument 3 and the literal meaning from the "lit" argument. The
    example inherits *ref* from a preceding quote template.
    """
    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="", ref=ref)

    for jpan_span in root.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        example.ruby, stripped = extract_ruby(wxr, jpan_span)
        example.text = clean_node(wxr, None, stripped)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(stripped)),
            example.text,
            example,
            "bold_text_offsets",
        )

    for tr_span in root.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr, tr_span, example.roman, example, "bold_roman_offsets"
        )

    # Translation (positional 3) and literal meaning ("lit") are both
    # template arguments processed the same way.
    for arg_value, field, offsets_field in (
        (
            t_node.template_parameters.get(3, ""),
            "translation",
            "bold_translation_offsets",
        ),
        (
            t_node.template_parameters.get("lit", ""),
            "literal_meaning",
            "bold_literal_offsets",
        ),
    ):
        cleaned = clean_node(wxr, None, arg_value)
        setattr(example, field, cleaned)
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(arg_value)),
            cleaned,
            example,
            offsets_field,
        )

    if example.text != "":
        sense.examples.append(example)
    for link_node in root.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)

233 

234 

def extract_zh_x_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense, ref: str
):
    """Handle Chinese usage-example templates ({{zh-x}} family).

    The expanded HTML either wraps the example in a <dl> tag (handled by
    extract_zh_x_dl_tag) or not (extract_zh_x_no_dl_tag). The shared
    translation comes from positional argument 2 and is applied to every
    example found.

    NOTE(review): the *ref* parameter is currently not applied to the
    examples here; the <dl> path derives its own ref — confirm intended.
    """
    root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    examples: list[Example] = []
    for dl_tag in root.find_html("dl"):
        examples.extend(extract_zh_x_dl_tag(wxr, dl_tag))
    if len(examples) == 0:
        examples.extend(extract_zh_x_no_dl_tag(wxr, root))

    translation_arg = t_node.template_parameters.get(2, "")
    translation = clean_node(wxr, None, translation_arg)
    for example in examples:
        example.translation = translation
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
            translation,
            example,
            "bold_translation_offsets",
        )
        translate_raw_tags(example)

    for link_node in root.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)
    sense.examples.extend(examples)

263 

264 

def extract_zh_x_dl_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode
) -> list[Example]:
    """Extract examples from the <dl> layout of an expanded {{zh-x}}.

    Direct <span> children with a "lang" attribute are example texts
    (traditional/simplified variants each become one Example); spans
    without it hold comma-separated raw tags for the preceding example.
    The <dd> children supply the shared romanization, extra tags and the
    reference (a <small> tag, with its "Từ:" prefix removed).
    """
    examples: list[Example] = []
    for span_tag in dl_tag.find_html("span"):
        if "lang" in span_tag.attrs:
            text = clean_node(wxr, None, span_tag)
            if text != "":
                example = Example(text=text)
                calculate_bold_offsets(
                    wxr, span_tag, text, example, "bold_text_offsets"
                )
                examples.append(example)
        else:
            # Bracketed tag list applying to the most recent example.
            tag_str = clean_node(wxr, None, span_tag).strip("[] ")
            for raw_tag in tag_str.split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "" and len(examples) > 0:
                    examples[-1].raw_tags.append(raw_tag)

    ref = ""
    for dd_tag in dl_tag.find_html("dd"):
        for span_tag in dd_tag.find_html("span"):
            if "Latn" in span_tag.attrs.get("lang", ""):
                roman = clean_node(wxr, None, span_tag)
                for example in examples:
                    example.roman = roman
                    calculate_bold_offsets(
                        wxr, span_tag, roman, example, "bold_roman_offsets"
                    )
            else:
                raw_tag = clean_node(wxr, None, span_tag).strip("[] ")
                if raw_tag != "":
                    for example in examples:
                        example.raw_tags.append(raw_tag)
        for small_tag in dd_tag.find_html("small"):
            ref = clean_node(wxr, None, small_tag).removeprefix("Từ:").strip()
            for example in examples:
                example.ref = ref

    return examples

305 

306 

def extract_zh_x_no_dl_tag(
    wxr: WiktextractContext, expanded_node: WikiNode
) -> list[Example]:
    """Extract examples from {{zh-x}} output that has no <dl> wrapper.

    Spans with lang "zh-Hant"/"zh-Hans" each become one tagged Example;
    a "zh-Latn" span supplies the romanization for every example already
    collected.
    """
    examples: list[Example] = []
    for span_tag in expanded_node.find_html("span"):
        lang = span_tag.attrs.get("lang", "")
        if lang == "zh-Latn":
            roman = clean_node(wxr, None, span_tag)
            for example in examples:
                example.roman = roman
                calculate_bold_offsets(
                    wxr, span_tag, roman, example, "bold_roman_offsets"
                )
        elif lang in ("zh-Hant", "zh-Hans"):
            text = clean_node(wxr, None, span_tag)
            example = Example(text=text)
            script_tag = (
                "Traditional-Chinese"
                if lang == "zh-Hant"
                else "Simplified-Chinese"
            )
            example.tags.append(script_tag)
            if example.text != "":
                calculate_bold_offsets(
                    wxr, span_tag, text, example, "bold_text_offsets"
                )
                examples.append(example)

    return examples