Coverage for src/wiktextract/extractor/th/example.py: 79%

1import re

3from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from ..ruby import extract_ruby

8from ..share import calculate_bold_offsets

9from .models import Example, Sense, WordEntry

10from .tags import translate_raw_tags

def extract_example_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    sense: Sense,
    list_item: WikiNode,
    ref: str = "",
) -> None:
    """Process the templates and nested lists found in one list item.

    Template children are dispatched on their template name to the matching
    extractor.  The string returned by a ``quote-*`` template replaces
    ``ref`` and is then handed to later ``ja-x``/``ja-usex`` templates and
    to nested list items.  Nested lists are processed recursively.

    Args:
        wxr: Extraction context.
        word_entry: Entry passed on to the linkage template extractor.
        sense: Sense whose ``examples`` (and ``categories``) are updated.
        list_item: The list item node to walk.
        ref: Reference text inherited from an enclosing list item.
    """
    # Imported here rather than at module level, as in the original code
    # (presumably to avoid a circular import -- not verifiable from here).
    from .linkage import LINKAGE_TEMPLATES, extract_syn_template

    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in ("ux", "usex", "ko-usex"):
                extract_ux_template(wxr, sense, child)
            elif name in ("zh-x", "zh-usex"):
                extract_template_zh_x(wxr, sense, child)
            elif name in ("ja-x", "ja-usex"):
                extract_template_ja_usex(wxr, sense, child, ref)
            elif name.startswith("quote-"):
                ref = extract_quote_template(wxr, sense, child)
            elif name in LINKAGE_TEMPLATES:
                extract_syn_template(
                    wxr, word_entry, child, LINKAGE_TEMPLATES[name]
                )
            elif name == "audio" and len(sense.examples) > 0:
                from .sound import extract_audio_template

                # The audio is attached to the most recent example; that
                # example's categories are then copied onto the sense.
                latest_example = sense.examples[-1]
                extract_audio_template(wxr, latest_example, child)
                sense.categories.extend(latest_example.categories)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, nested_item, ref
                )

def extract_ux_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Extract one example from a ``ux``/``usex``-style template.

    The template is expanded and its HTML is scanned: ``<i>`` tags supply
    the example text and romanization, ``<span>`` tags supply the
    translation, literal meaning and qualifier raw tags.  The ``ref``
    template parameter is stored as the example's reference.  The example
    is appended to ``sense.examples`` only when its text is non-empty.

    Args:
        wxr: Extraction context.
        sense: Sense that receives the example.
        t_node: The usage-example template node.
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    example = Example(text="")

    for italic in root.find_html_recursively("i"):
        # Whole-token match: the class attribute is split on whitespace.
        classes = italic.attrs.get("class", "").split()
        if "e-example" in classes:
            example.text = clean_node(wxr, None, italic)
            calculate_bold_offsets(
                wxr, italic, example.text, example, "bold_text_offsets"
            )
        elif "e-transliteration" in classes:
            example.roman = clean_node(wxr, None, italic)
            calculate_bold_offsets(
                wxr, italic, example.roman, example, "bold_roman_offsets"
            )

    for span in root.find_html_recursively("span"):
        # Substring match against the raw class attribute string.
        class_attr = span.attrs.get("class", "")
        if "e-translation" in class_attr:
            example.translation = clean_node(wxr, None, span)
            calculate_bold_offsets(
                wxr,
                span,
                example.translation,
                example,
                "bold_translation_offsets",
            )
        elif "e-literally" in class_attr:
            example.literal_meaning = clean_node(wxr, None, span)
            calculate_bold_offsets(
                wxr,
                span,
                example.literal_meaning,
                example,
                "bold_literal_offsets",
            )
        elif "qualifier-content" in class_attr:
            qualifier = clean_node(wxr, None, span)
            if qualifier != "":
                example.raw_tags.append(qualifier)

    ref_arg = t_node.template_parameters.get("ref", "")
    example.ref = clean_node(wxr, None, ref_arg)

    if example.text != "":
        translate_raw_tags(example)
        sense.examples.append(example)
        # Top-level links are cleaned with the sense passed in
        # (presumably so category links are recorded on it -- confirm).
        for link in root.find_child(NodeKind.LINK):
            clean_node(wxr, sense, link)

100

101

def extract_template_zh_x(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Extract examples from a ``zh-x``/``zh-usex`` template.

    Examples are first gathered from the ``<dl>`` tags of the expanded
    template; if that yields nothing, the fallback extractor for output
    without ``<dl>`` is used.  Every example receives the same translation,
    taken from the template's second positional argument.

    Args:
        wxr: Extraction context.
        sense: Sense that receives the examples.
        t_node: The Chinese usage-example template node.
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)

    collected = []
    for dl_tag in root.find_html("dl"):
        collected += extract_zh_x_dl_tag(wxr, dl_tag)
    if not collected:
        collected += extract_zh_x_no_dl_tag(wxr, root)

    translation_arg = t_node.template_parameters.get(2, "")
    translation = clean_node(wxr, None, translation_arg)
    for example in collected:
        example.translation = translation
        # Bold offsets are computed from the re-parsed raw argument.
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
            translation,
            example,
            "bold_translation_offsets",
        )
        translate_raw_tags(example)

    # Top-level links are cleaned with the sense passed in
    # (presumably so category links are recorded on it -- confirm).
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)

    sense.examples.extend(collected)

133

134

def extract_zh_x_dl_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode
) -> list[Example]:
    """Build examples from one ``<dl>`` tag of an expanded ``zh-x`` template.

    Direct ``<span>`` children carrying a ``lang`` attribute become example
    texts; other direct spans are split into raw tags that are attached to
    the most recently created example.  Spans inside ``<dd>`` children
    provide a romanization (``lang`` containing ``Latn``) or an extra raw
    tag, each applied to every example collected so far.

    Args:
        wxr: Extraction context.
        dl_tag: The ``<dl>`` HTML node to scan.

    Returns:
        The examples found, in document order.
    """
    found = []

    for span in dl_tag.find_html("span"):
        if "lang" not in span.attrs:
            labels = clean_node(wxr, None, span).strip("[] ")
            for label in re.split(r", | and ", labels):
                label = label.strip()
                # Labels seen before any example have nothing to attach to.
                if label != "" and len(found) > 0:
                    found[-1].raw_tags.append(label)
            continue

        text = clean_node(wxr, None, span)
        if text == "":
            continue
        example = Example(text=text)
        calculate_bold_offsets(wxr, span, text, example, "bold_text_offsets")
        found.append(example)

    for dd_tag in dl_tag.find_html("dd"):
        for span in dd_tag.find_html("span"):
            if "Latn" in span.attrs.get("lang", ""):
                roman = clean_node(wxr, None, span)
                for example in found:
                    example.roman = roman
                    calculate_bold_offsets(
                        wxr, span, roman, example, "bold_roman_offsets"
                    )
                continue

            label = clean_node(wxr, None, span).strip("[] ")
            if label != "":
                for example in found:
                    example.raw_tags.append(label)

    return found

169

170

def extract_zh_x_no_dl_tag(
    wxr: WiktextractContext, expanded_node: WikiNode
) -> list[Example]:
    """Build examples from an expanded ``zh-x`` template without ``<dl>``.

    Direct ``<span>`` children are examined by their ``lang`` attribute:
    ``zh-Hant``/``zh-Hans`` spans create an example tagged
    ``Traditional-Chinese``/``Simplified-Chinese``, and a ``zh-Latn`` span
    sets the romanization of every example collected before it.

    Args:
        wxr: Extraction context.
        expanded_node: Root node of the expanded template.

    Returns:
        The examples found, in document order.
    """
    found = []

    for span in expanded_node.find_html("span"):
        lang = span.attrs.get("lang", "")
        if lang == "zh-Latn":
            roman = clean_node(wxr, None, span)
            for earlier in found:
                earlier.roman = roman
                calculate_bold_offsets(
                    wxr, span, roman, earlier, "bold_roman_offsets"
                )
        elif lang in ("zh-Hant", "zh-Hans"):
            text = clean_node(wxr, None, span)
            candidate = Example(text=text)
            if lang == "zh-Hant":
                script_tag = "Traditional-Chinese"
            else:
                script_tag = "Simplified-Chinese"
            candidate.tags.append(script_tag)
            # Examples with empty text are dropped.
            if candidate.text != "":
                calculate_bold_offsets(
                    wxr, span, text, candidate, "bold_text_offsets"
                )
                found.append(candidate)

    return found

200

201

def extract_quote_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> str:
    """Extract a ``quote-*`` template as a reference or as a full example.

    When the template has none of the ``text``, ``passage`` or positional
    ``7`` parameters, it carries no quotation: the cleaned text of the whole
    template is returned so the caller can use it as the reference for
    following examples.  Otherwise the template is expanded, an example is
    built from its HTML and appended to ``sense.examples`` (if its text is
    non-empty), and an empty string is returned.

    Args:
        wxr: Extraction context.
        sense: Sense that receives the example.
        t_node: The quotation template node.

    Returns:
        The reference text for a quotation-less template, otherwise ``""``.
    """
    ref = ""
    has_no_quotation = all(
        arg not in t_node.template_parameters for arg in ["text", "passage", 7]
    )
    if has_no_quotation:
        ref = clean_node(wxr, sense, t_node)
    else:
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        example = Example(text="")
        for span_tag in expanded_node.find_html_recursively("span"):
            span_class = span_tag.attrs.get("class", "")
            if "cited-source" == span_class:
                example.ref = clean_node(wxr, None, span_tag)
            elif "e-quotation" in span_class:
                example.ruby, node_without_ruby = extract_ruby(wxr, span_tag)
                example.text = clean_node(wxr, None, node_without_ruby)
                calculate_bold_offsets(
                    wxr, span_tag, example.text, example, "bold_text_offsets"
                )
            elif "e-translation" in span_class:
                example.translation = clean_node(wxr, None, span_tag)
                # Same field name the other example extractors in this
                # module use for translation offsets (it previously read
                # "bold_translation_text").
                calculate_bold_offsets(
                    wxr,
                    span_tag,
                    example.translation,
                    example,
                    "bold_translation_offsets",
                )
        # Only the first transliteration tag is used.
        for i_tag in expanded_node.find_html_recursively(
            "i", attr_name="class", attr_value="e-transliteration"
        ):
            example.roman = clean_node(wxr, None, i_tag)
            calculate_bold_offsets(
                wxr, i_tag, example.roman, example, "bold_roman_offsets"
            )
            break
        if example.text != "":
            sense.examples.append(example)
            # The expanded template is cleaned with the sense passed in
            # (presumably so category links are recorded on it -- confirm).
            clean_node(wxr, sense, expanded_node)

    return ref

249

250

def extract_template_ja_usex(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode, ref: str
) -> None:
    """Extract one example from a ``ja-x``/``ja-usex`` template.

    The example text and ruby come from direct ``<span class="Jpan">``
    children of the expanded template, the romanization from
    ``<span class="tr">`` tags at any depth, the translation from the third
    positional argument and the literal meaning from the ``lit`` argument.
    The example is appended to ``sense.examples`` only when its text is
    non-empty.

    Args:
        wxr: Extraction context.
        sense: Sense that receives the example.
        t_node: The Japanese usage-example template node.
        ref: Reference text stored on the example.
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    result = Example(text="", ref=ref)

    for jpan_span in root.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby, stripped_node = extract_ruby(wxr, jpan_span)
        result.text = clean_node(wxr, None, stripped_node)
        result.ruby = ruby
        # Offsets are computed on the ruby-free node, re-parsed from its
        # wikitext.
        calculate_bold_offsets(
            wxr,
            wxr.wtp.parse(wxr.wtp.node_to_wikitext(stripped_node)),
            result.text,
            result,
            "bold_text_offsets",
        )

    for tr_span in root.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        result.roman = clean_node(wxr, None, tr_span)
        calculate_bold_offsets(
            wxr, tr_span, result.roman, result, "bold_roman_offsets"
        )

    translation_arg = t_node.template_parameters.get(3, "")
    result.translation = clean_node(wxr, None, translation_arg)
    calculate_bold_offsets(
        wxr,
        wxr.wtp.parse(wxr.wtp.node_to_wikitext(translation_arg)),
        result.translation,
        result,
        "bold_translation_offsets",
    )

    literal_arg = t_node.template_parameters.get("lit", "")
    result.literal_meaning = clean_node(wxr, None, literal_arg)
    calculate_bold_offsets(
        wxr,
        wxr.wtp.parse(wxr.wtp.node_to_wikitext(literal_arg)),
        result.literal_meaning,
        result,
        "bold_literal_offsets",
    )

    if result.text != "":
        sense.examples.append(result)
        # Top-level links are cleaned with the sense passed in
        # (presumably so category links are recorded on it -- confirm).
        for link in root.find_child(NodeKind.LINK):
            clean_node(wxr, sense, link)

Coverage for src/wiktextract/extractor/th/example.py: 79%

164 statements