Coverage for src/wiktextract/extractor/th/example.py: 80%

163 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import re 

2 

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..ruby import extract_ruby 

8from ..share import calculate_bold_offsets 

9from .models import Example, Sense, WordEntry 

10from .tags import translate_raw_tags 

11 

12 

13def extract_example_list_item( 

14 wxr: WiktextractContext, 

15 word_entry: WordEntry, 

16 sense: Sense, 

17 list_item: WikiNode, 

18 ref: str = "", 

19) -> None: 

20 from .linkage import LINKAGE_TEMPLATES, extract_syn_template 

21 

22 for node in list_item.children: 

23 if isinstance(node, TemplateNode): 

24 if node.template_name in ["ux", "usex", "ko-usex"]: 

25 extract_ux_template(wxr, sense, node) 

26 elif node.template_name in ["zh-x", "zh-usex"]: 

27 extract_template_zh_x(wxr, sense, node) 

28 elif node.template_name in ["ja-x", "ja-usex"]: 

29 extract_template_ja_usex(wxr, sense, node, ref) 

30 elif node.template_name.startswith("quote-"): 

31 ref = extract_quote_template(wxr, sense, node) 

32 elif node.template_name in LINKAGE_TEMPLATES: 

33 extract_syn_template( 

34 wxr, word_entry, node, LINKAGE_TEMPLATES[node.template_name] 

35 ) 

36 elif node.template_name == "audio" and len(sense.examples) > 0: 36 ↛ 22line 36 didn't jump to line 22 because the condition on line 36 was always true

37 from .sound import extract_audio_template 

38 

39 extract_audio_template(wxr, sense.examples[-1], node) 

40 sense.categories.extend(sense.examples[-1].categories) 

41 elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: 

42 for child_list_item in node.find_child(NodeKind.LIST_ITEM): 

43 extract_example_list_item( 

44 wxr, word_entry, sense, child_list_item, ref 

45 ) 

46 

47 

def extract_ux_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Build one ``Example`` from an expanded ``ux``-style template.

    The template is expanded and its HTML is scanned.  ``<i>`` tags supply
    the example text and the transliteration; ``<span>`` tags supply the
    translation, the literal meaning and qualifier raw tags.  Each tag is
    selected by a marker substring in its ``class`` attribute.  The ``ref``
    template parameter becomes the example's reference.

    The example is appended to ``sense.examples`` only when its text is
    non-empty; in that case the direct link children of the expansion are
    also passed through ``clean_node`` with ``sense``.

    Args:
        wxr: Extraction context.
        sense: Sense that receives the example.
        t_node: The usage-example template node.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    example = Example(text="")

    # (class marker, Example attribute, bold-offsets field).  Within one
    # tag only the first matching marker is applied.
    italic_targets = (
        ("e-example", "text", "bold_text_offsets"),
        ("e-transliteration", "roman", "bold_roman_offsets"),
    )
    span_targets = (
        ("e-translation", "translation", "bold_translation_offsets"),
        ("e-literally", "literal_meaning", "bold_literal_offsets"),
    )

    for italic in expanded.find_html_recursively("i"):
        css_class = italic.attrs.get("class", "")
        for marker, attr, offsets_field in italic_targets:
            if marker not in css_class:
                continue
            setattr(example, attr, clean_node(wxr, None, italic))
            calculate_bold_offsets(
                wxr, italic, getattr(example, attr), example, offsets_field
            )
            break

    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        for marker, attr, offsets_field in span_targets:
            if marker not in css_class:
                continue
            setattr(example, attr, clean_node(wxr, None, span))
            calculate_bold_offsets(
                wxr, span, getattr(example, attr), example, offsets_field
            )
            break
        else:
            # Reached only when neither translation nor literal matched.
            if "qualifier-content" in css_class:
                qualifier = clean_node(wxr, None, span)
                if qualifier != "":
                    example.raw_tags.append(qualifier)

    example.ref = clean_node(
        wxr, None, t_node.template_parameters.get("ref", "")
    )
    if example.text == "":
        return

    translate_raw_tags(example)
    sense.examples.append(example)
    # Passing ``sense`` here (instead of None) presumably lets clean_node
    # record link-derived data such as categories on it -- TODO confirm.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)

100 

101 

def extract_template_zh_x(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Extract the examples produced by a ``zh-x`` / ``zh-usex`` template.

    The template is expanded; examples are read from its direct ``<dl>``
    children via ``extract_zh_x_dl_tag`` and, if that yields nothing, from
    the expansion itself via ``extract_zh_x_no_dl_tag``.  Every example
    receives the same translation, taken from the template's second
    positional parameter, and has its raw tags translated.  All examples
    are then appended to ``sense.examples``.

    Args:
        wxr: Extraction context.
        sense: Sense that receives the examples.
        t_node: The ``zh-x`` / ``zh-usex`` template node.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    examples = []
    for dl_tag in expanded_node.find_html("dl"):
        examples.extend(extract_zh_x_dl_tag(wxr, dl_tag))
    if len(examples) == 0:
        examples.extend(extract_zh_x_no_dl_tag(wxr, expanded_node))

    second_arg = t_node.template_parameters.get(2, "")
    translation = clean_node(wxr, None, second_arg)
    if examples:
        # The translation argument is identical for every example, so parse
        # it once instead of once per example.  Skipped entirely when there
        # is nothing to annotate, matching the original's zero parse calls.
        translation_node = wxr.wtp.parse(wxr.wtp.node_to_wikitext(second_arg))
        for e_data in examples:
            e_data.translation = translation
            calculate_bold_offsets(
                wxr,
                translation_node,
                translation,
                e_data,
                "bold_translation_offsets",
            )
            translate_raw_tags(e_data)

    # Passing ``sense`` here (instead of None) presumably lets clean_node
    # record link-derived data such as categories on it -- TODO confirm.
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link_node)

    sense.examples.extend(examples)

133 

134 

def extract_zh_x_dl_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode
) -> list[Example]:
    """Collect examples from one ``<dl>`` tag of an expanded ``zh-x``.

    First pass, over the ``<span>`` children of ``dl_tag``: a span with a
    ``lang`` attribute and non-empty text starts a new example; a span
    without ``lang`` is treated as a bracketed tag list, split on ``", "``
    and ``" and "``, whose pieces are attached to the most recent example.

    Second pass, over the ``<span>`` children of each ``<dd>``: a span
    whose ``lang`` contains ``Latn`` sets the romanization of every
    example; any other span contributes one raw tag to every example.

    Args:
        wxr: Extraction context.
        dl_tag: The ``<dl>`` HTML node to read.

    Returns:
        The examples found, in document order.
    """
    collected = []
    for span in dl_tag.find_html("span"):
        if "lang" not in span.attrs:
            tag_text = clean_node(wxr, None, span).strip("[] ")
            for piece in re.split(r", | and ", tag_text):
                piece = piece.strip()
                # Tags seen before any example have nothing to attach to.
                if piece != "" and len(collected) > 0:
                    collected[-1].raw_tags.append(piece)
            continue

        text = clean_node(wxr, None, span)
        if text == "":
            continue
        item = Example(text=text)
        calculate_bold_offsets(wxr, span, text, item, "bold_text_offsets")
        collected.append(item)

    for dd_tag in dl_tag.find_html("dd"):
        for span in dd_tag.find_html("span"):
            if "Latn" not in span.attrs.get("lang", ""):
                label = clean_node(wxr, None, span).strip("[] ")
                if label != "":
                    for item in collected:
                        item.raw_tags.append(label)
                continue

            romanization = clean_node(wxr, None, span)
            for item in collected:
                item.roman = romanization
                calculate_bold_offsets(
                    wxr, span, romanization, item, "bold_roman_offsets"
                )
    return collected

169 

170 

def extract_zh_x_no_dl_tag(
    wxr: WiktextractContext, expanded_node: WikiNode
) -> list[Example]:
    """Collect examples from an expanded ``zh-x`` that has no ``<dl>``.

    Only direct ``<span>`` children are inspected, keyed on their ``lang``
    attribute: ``zh-Hant`` / ``zh-Hans`` spans with non-empty text each
    become an example tagged ``Traditional-Chinese`` / ``Simplified-Chinese``;
    a ``zh-Latn`` span sets the romanization of every example collected so
    far.  Spans with any other ``lang`` are ignored.

    Args:
        wxr: Extraction context.
        expanded_node: Root node of the expanded template.

    Returns:
        The examples found, in document order.
    """
    script_tags = {
        "zh-Hant": "Traditional-Chinese",
        "zh-Hans": "Simplified-Chinese",
    }
    found = []
    for span in expanded_node.find_html("span"):
        lang_code = span.attrs.get("lang", "")
        if lang_code == "zh-Latn":
            romanization = clean_node(wxr, None, span)
            for item in found:
                item.roman = romanization
                calculate_bold_offsets(
                    wxr, span, romanization, item, "bold_roman_offsets"
                )
        elif lang_code in script_tags:
            text = clean_node(wxr, None, span)
            item = Example(text=text)
            item.tags.append(script_tags[lang_code])
            # Examples without text are built but never kept.
            if item.text != "":
                calculate_bold_offsets(
                    wxr, span, text, item, "bold_text_offsets"
                )
                found.append(item)

    return found

200 

201 

def extract_quote_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> str:
    """Extract a ``quote-*`` template as a reference or as a full example.

    When the template has none of the ``text``, ``passage`` or ``7``
    parameters it carries no quotation, so the cleaned template text is
    returned for the caller to use as a reference.

    Otherwise the template is expanded and an ``Example`` is built from its
    HTML: the ``cited-source`` span gives the reference, ``e-quotation``
    the text, ``e-translation`` the translation and the first
    ``<i class="e-transliteration">`` the romanization.  The example is
    appended to ``sense.examples`` only when its text is non-empty.

    Args:
        wxr: Extraction context.
        sense: Sense that receives the example.
        t_node: The ``quote-*`` template node.

    Returns:
        The reference text when the template has no quotation parameters,
        otherwise an empty string.
    """
    ref = ""
    if all(
        arg not in t_node.template_parameters for arg in ["text", "passage", 7]
    ):
        ref = clean_node(wxr, sense, t_node)
    else:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        example = Example(text="")
        for span_tag in expanded_node.find_html_recursively("span"):
            span_class = span_tag.attrs.get("class", "")
            if "cited-source" == span_class:
                example.ref = clean_node(wxr, None, span_tag)
            elif "e-quotation" in span_class:
                example.text = clean_node(wxr, None, span_tag)
                calculate_bold_offsets(
                    wxr, span_tag, example.text, example, "bold_text_offsets"
                )
            elif "e-translation" in span_class:
                example.translation = clean_node(wxr, None, span_tag)
                # Fixed: the field was "bold_translation_text", which no
                # other extractor in this module uses; translation offsets
                # are stored under "bold_translation_offsets" everywhere
                # else.
                calculate_bold_offsets(
                    wxr,
                    span_tag,
                    example.translation,
                    example,
                    "bold_translation_offsets",
                )
        # Only the first transliteration tag is used.
        for i_tag in expanded_node.find_html_recursively(
            "i", attr_name="class", attr_value="e-transliteration"
        ):
            example.roman = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.roman, example, "bold_roman_offsets"
            )
            break
        if example.text != "":
            sense.examples.append(example)
            # NOTE(review): nesting reconstructed from a flattened listing;
            # placed inside this branch to match the sibling extractors.
            clean_node(wxr, sense, expanded_node)

    return ref

248 

249 

def extract_template_ja_usex(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode, ref: str
) -> None:
    """Build one ``Example`` from a ``ja-x`` / ``ja-usex`` template.

    The example text and ruby data come from the direct ``<span>`` children
    of the expansion whose class is ``Jpan``, after ``extract_ruby`` has
    split the ruby from the node.  The romanization comes from ``<span>``
    tags with class ``tr`` anywhere in the expansion.  The translation and
    literal meaning come from the template's third positional parameter and
    its ``lit`` parameter.

    The example is appended to ``sense.examples`` only when its text is
    non-empty; in that case the direct link children of the expansion are
    also passed through ``clean_node`` with ``sense``.

    Args:
        wxr: Extraction context.
        sense: Sense that receives the example.
        t_node: The ``ja-x`` / ``ja-usex`` template node.
        ref: Reference text stored on the example.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    result = Example(text="", ref=ref)

    for jpan_span in expanded.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, plain_node = extract_ruby(wxr, jpan_span)
        result.text = clean_node(wxr, None, plain_node)
        result.ruby = ruby_data
        # The ruby-free node is round-tripped through wikitext and re-parsed
        # before the bold search, exactly as the original did.
        reparsed = wxr.wtp.parse(wxr.wtp.node_to_wikitext(plain_node))
        calculate_bold_offsets(
            wxr, reparsed, result.text, result, "bold_text_offsets"
        )

    for tr_span in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        result.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr, tr_span, result.roman, result, "bold_roman_offsets"
        )

    # (template parameter, Example attribute, bold-offsets field)
    param_targets = (
        (3, "translation", "bold_translation_offsets"),
        ("lit", "literal_meaning", "bold_literal_offsets"),
    )
    for param, attr, offsets_field in param_targets:
        arg = t_node.template_parameters.get(param, "")
        setattr(result, attr, clean_node(wxr, None, arg))
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(arg)),
            getattr(result, attr),
            result,
            offsets_field,
        )

    if result.text == "":
        return

    sense.examples.append(result)
    # Passing ``sense`` here (instead of None) presumably lets clean_node
    # record link-derived data such as categories on it -- TODO confirm.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)