Coverage for src/wiktextract/extractor/th/linkage.py: 62%
181 statements
« prev ^ index » next — coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1from itertools import count
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .models import Linkage, WordEntry
14from .section_titles import LINKAGE_SECTIONS
15from .tags import translate_raw_tags
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    source: str = "",
    sense: str = "",
) -> None:
    """Dispatch each child of a linkage section to the matching handler.

    Recognizes ``col*`` column templates, the ``ws`` (Wikisaurus)
    template, the ``zh-dial`` dialect table template, and plain wiki
    lists; anything else is ignored.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name.startswith("col"):
                extract_col_template(
                    wxr, word_entry, child, linkage_type, source, sense
                )
            elif t_name == "ws":
                extract_ws_template(
                    wxr, word_entry, child, linkage_type, source, sense
                )
            elif t_name == "zh-dial":
                extract_zh_dial_template(
                    wxr, word_entry, child, linkage_type, sense
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr, word_entry, list_item, linkage_type, source, sense
                )
def extract_col_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Extract linkage words from an expanded ``col*`` column template.

    Each ``li`` element may contain several word spans (e.g. traditional
    and simplified Chinese forms) plus a ``Latn`` romanization span that
    applies to all words gathered so far in that item.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded.find_html_recursively("li"):
        items = []
        for span_tag in li_tag.find_html("span"):
            css_class = span_tag.attrs.get("class", "")
            if "Latn" in css_class:
                # romanization span: attach to every word collected so far
                romanization = clean_node(wxr, None, span_tag)
                for item in items:
                    item.roman = romanization
            elif "lang" in span_tag.attrs:
                word = clean_node(wxr, None, span_tag)
                if word == "":
                    continue
                new_item = Linkage(word=word, source=source, sense=sense)
                if css_class == "Hant":
                    new_item.tags.append("Traditional-Chinese")
                elif css_class == "Hans":
                    new_item.tags.append("Simplified-Chinese")
                items.append(new_item)
        getattr(word_entry, linkage_type).extend(items)
def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Parse one wiki list item of a linkage section.

    Handles ``l``/``zh-l`` link templates, plain wiki links, qualifier
    templates (collected as raw tags attached to link data), italicized
    thesaurus-page links, and a trailing "- gloss" text segment that
    retroactively sets the sense of every linkage found in this item.
    """
    linkages = []
    raw_tags = []

    for index, node in enumerate(list_item.children):
        if isinstance(node, TemplateNode) and node.template_name == "l":
            # {{l|lang|word}} link template; the word is the 2nd parameter
            l_data = Linkage(
                word=clean_node(wxr, None, node.template_parameters.get(2, "")),
                source=source,
                sense=sense,
                raw_tags=raw_tags,
            )
            if l_data.word != "":
                translate_raw_tags(l_data)
                linkages.append(l_data)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            # italicized link to a thesaurus ("อรรถาภิธาน:") page; follow it,
            # but not while already processing a thesaurus page (the source
            # check prevents recursing back into thesaurus extraction)
            for link_node in node.find_child(NodeKind.LINK):
                link_str = clean_node(wxr, None, link_node)
                if link_str.startswith("อรรถาภิธาน:") and not source.startswith(
                    "อรรถาภิธาน:"
                ):
                    extract_thesaurus_page(
                        wxr, word_entry, linkage_type, link_str, sense
                    )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            # plain [[word]] wiki link
            link_str = clean_node(wxr, None, node)
            if link_str != "":
                l_data = Linkage(word=link_str, sense=sense, raw_tags=raw_tags)
                translate_raw_tags(l_data)
                linkages.append(l_data)
        elif isinstance(node, str) and ("-" in node or "–" in node):
            # "word - gloss" separator: everything after the dash (plus the
            # rest of the list item) is the sense; apply it to all linkages
            # collected so far, then stop processing this item
            if "-" in node:
                sense = node[node.index("-") + 1 :]
            elif "–" in node:
                sense = node[node.index("–") + 1 :]
            sense = clean_node(
                wxr,
                None,
                [sense] + list_item.children[index + 1 :],
            ).strip()
            for l_data in linkages:
                l_data.sense = sense
            break
        elif isinstance(node, TemplateNode) and node.template_name in [
            "qualifier",
            "q",
            "qual",
            "qf",
        ]:
            # qualifier template, e.g. "(formal)"; split the comma-separated
            # text into raw tags shared by the linkage words of this item
            text = clean_node(wxr, None, node).strip("() ")
            for raw_tag in text.split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    raw_tags.append(raw_tag)
        elif isinstance(node, TemplateNode) and node.template_name == "zh-l":
            linkages.extend(extract_zh_l_template(wxr, node, sense, raw_tags))

    getattr(word_entry, linkage_type).extend(linkages)
def extract_thesaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    page_title: str,
    sense: str,
) -> None:
    """Follow a thesaurus-page link and harvest matching linkage sections.

    Only sections under the matching language (level 2) and part-of-speech
    (level 3) headings whose title maps to ``linkage_type`` are extracted.
    """
    page = wxr.wtp.get_page(page_title, 110)  # 110: thesaurus namespace
    if page is None or page.body is None:
        return
    tree = wxr.wtp.parse(page.body)
    for lang_level in tree.find_child(NodeKind.LEVEL2):
        # level-2 headings are "ภาษา<name>"; strip the prefix to compare
        lang_name = clean_node(wxr, None, lang_level.largs).removeprefix(
            "ภาษา"
        )
        if lang_name != word_entry.lang:
            continue
        for pos_level in lang_level.find_child(NodeKind.LEVEL3):
            if clean_node(wxr, None, pos_level.largs) != word_entry.pos_title:
                continue
            for sub_level in pos_level.find_child_recursively(
                LEVEL_KIND_FLAGS
            ):
                title = clean_node(wxr, None, sub_level.largs)
                if LINKAGE_SECTIONS.get(title) == linkage_type:
                    extract_linkage_section(
                        wxr,
                        word_entry,
                        sub_level,
                        linkage_type,
                        source=page_title,
                        sense=sense,
                    )
def extract_ws_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Extract a single linkage word from a ``ws`` (Wikisaurus) template.

    The word is the template's 2nd positional parameter; empty words are
    skipped.
    """
    word = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if word == "":
        return
    getattr(word_entry, linkage_type).append(
        Linkage(word=word, source=source, sense=sense)
    )
# Template name -> WordEntry linkage field; several aliases map to one field.
LINKAGE_TEMPLATES = {
    alias: field_name
    for field_name, aliases in (
        ("synonyms", ("syn", "synonyms", "synsee")),
        ("antonyms", ("ant", "antonyms")),
        ("coordinate_terms", ("cot", "coordinate terms")),
        ("hypernyms", ("hyper", "hypernyms")),
        ("hyponyms", ("hypo", "hyponyms")),
    )
    for alias in aliases
}
def extract_syn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract words from an inline linkage template ({{syn}}, {{ant}}, …).

    Positional parameters from 2 upward are read until the first missing
    one. Thesaurus-page arguments are followed; other non-empty values are
    appended as linkage words. The sense is taken from the most recently
    added gloss.
    """
    # Guard against entries without any sense yet: the previous code
    # indexed `senses[-1]` unconditionally, raising IndexError when the
    # template appears before any gloss has been extracted.
    sense = (
        " ".join(word_entry.senses[-1].glosses)
        if len(word_entry.senses) > 0
        else ""
    )
    for arg_name in count(2):
        if arg_name not in t_node.template_parameters:
            break
        arg_value = clean_node(wxr, None, t_node.template_parameters[arg_name])
        if arg_value.startswith("อรรถาภิธาน:"):
            # thesaurus page link; pull linkage data from that page
            extract_thesaurus_page(
                wxr, word_entry, linkage_type, arg_value, sense
            )
        elif arg_value != "":
            getattr(word_entry, linkage_type).append(
                Linkage(word=arg_value, sense=sense)
            )
def extract_zh_dial_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    sense: str,
):
    """Extract dialectal Chinese synonyms from a ``zh-dial`` table template.

    The expanded table is walked twice: a first pass collects the footnote
    legend (rows following a "หมายเหตุ"/"note" header cell, formatted as
    ``symbol - text`` pairs separated by ";"), and a second pass collects
    the dialect words, tagging each with its language-group header and
    region link, plus any footnote markers rendered in small font.
    """
    # local import — presumably avoids a circular import with .sound
    from .sound import split_zh_pron_raw_tag

    linkage_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        # first pass: build the footnote-symbol -> note-text legend
        is_note_row = False
        note_tags = {}
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    # a "หมายเหตุ" (note) header marks the start of legend cells
                    is_note_row = clean_node(wxr, None, cell_node) == "หมายเหตุ"
                elif is_note_row:
                    for note_str in clean_node(wxr, None, cell_node).split(";"):
                        if "-" in note_str:
                            note_symbol, note = note_str.split("-", maxsplit=1)
                            note_symbol = note_symbol.strip()
                            note = note.strip()
                            if note_symbol != "" and note != "":
                                note_tags[note_symbol] = note
        # second pass: collect the actual dialect words
        lang_tags = []
        region_tags = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                # row header names the language group; split into raw tags
                lang_tags = split_zh_pron_raw_tag(
                    clean_node(wxr, None, header_node)
                )
            if lang_tags == ["หมายเหตุ"]:  # skip last note row
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                for link_node in cell_node.find_child(NodeKind.LINK):
                    # linked text names the region/location of the words
                    region_tags = split_zh_pron_raw_tag(
                        clean_node(wxr, None, link_node)
                    )
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        # a dialect word (skip the page's own headword)
                        l_data = Linkage(word=span_text, sense=sense)
                        if len(lang_tags) > 0:
                            l_data.raw_tags.extend(lang_tags)
                        if len(region_tags) > 0:
                            l_data.raw_tags.extend(region_tags)
                        translate_raw_tags(l_data)
                        linkage_list.append(l_data)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and len(linkage_list) > 0
                    ):
                        # small-font footnote markers apply to the word
                        # appended immediately before this span
                        for note_symbol in span_text.split(","):
                            note_symbol = note_symbol.strip()
                            raw_tag = note_symbol
                            if note_symbol in note_tags:
                                raw_tag = note_tags[note_symbol]
                            if raw_tag != "":
                                linkage_list[-1].raw_tags.append(raw_tag)
                                translate_raw_tags(linkage_list[-1])

    getattr(word_entry, linkage_type).extend(linkage_list)
def extract_zh_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str],
) -> list[Linkage]:
    """Extract linkage data from a ``zh-l`` (Chinese link) template.

    Returns one ``Linkage`` per Chinese word span in the expanded template
    (typically traditional and simplified forms), all sharing the same
    romanization and sense. Separator slashes and empty words are skipped.
    """
    l_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    roman = ""
    # the template's 2nd positional parameter, when present, overrides the
    # sense passed in by the caller
    new_sense = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if new_sense != "":
        sense = new_sense
    # romanization comes from a "Latn"-classed span; if several exist the
    # last one wins
    for i_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="Latn"
    ):
        roman = clean_node(wxr, None, i_tag)
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        linkage_data = Linkage(
            sense=sense,
            raw_tags=raw_tags,
            roman=roman,
            word=clean_node(wxr, None, span_tag),
        )
        # the full lang attribute distinguishes the two Chinese scripts
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "zh-Hant":
            linkage_data.tags.append("Traditional-Chinese")
        elif lang_attr == "zh-Hans":
            linkage_data.tags.append("Simplified-Chinese")
        # "/" spans are just separators between script forms; drop them
        if linkage_data.word not in ["/", ""]:
            translate_raw_tags(linkage_data)
            l_list.append(linkage_data)

    return l_list