Coverage for src/wiktextract/extractor/th/linkage.py: 62%

181 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1from itertools import count 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .models import Linkage, WordEntry 

14from .section_titles import LINKAGE_SECTIONS 

15from .tags import translate_raw_tags 

16 

17 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    source: str = "",
    sense: str = "",
) -> None:
    """Dispatch the children of a linkage section to the proper extractor.

    Handles ``col*`` column templates, ``ws`` (Wikisaurus) templates,
    ``zh-dial`` dialect tables and plain wikitext lists; anything else is
    ignored.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name.startswith("col"):
                extract_col_template(
                    wxr, word_entry, child, linkage_type, source, sense
                )
            elif t_name == "ws":
                extract_ws_template(
                    wxr, word_entry, child, linkage_type, source, sense
                )
            elif t_name == "zh-dial":
                extract_zh_dial_template(
                    wxr, word_entry, child, linkage_type, sense
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr, word_entry, list_item, linkage_type, source, sense
                )

44 

45 

def extract_col_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Collect linkage words from an expanded ``col*`` list template.

    Each ``<li>`` of the expanded template yields one or more words; a
    Latin-script span provides the romanization for the words already
    gathered in that list item.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded.find_html_recursively("li"):
        items = []
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            if "Latn" in span_class:
                # Romanization applies to every word collected so far in
                # this list item.
                roman = clean_node(wxr, None, span_tag)
                for item in items:
                    item.roman = roman
            elif "lang" in span_tag.attrs:
                word = clean_node(wxr, None, span_tag)
                if word == "":
                    continue
                new_item = Linkage(word=word, source=source, sense=sense)
                if span_class == "Hant":
                    new_item.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    new_item.tags.append("Simplified-Chinese")
                items.append(new_item)
        getattr(word_entry, linkage_type).extend(items)

75 

76 

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Parse one list item of a linkage section.

    A list item may contain ``l``/``zh-l`` link templates, plain wiki
    links, qualifier templates (which become raw tags on the words),
    an italicized link to a thesaurus page, or trailing "- gloss" text
    that supplies the sense for every word gathered so far.
    """
    linkages = []
    raw_tags = []

    for index, node in enumerate(list_item.children):
        if isinstance(node, TemplateNode) and node.template_name == "l":
            l_data = Linkage(
                word=clean_node(wxr, None, node.template_parameters.get(2, "")),
                source=source,
                sense=sense,
                raw_tags=raw_tags,
            )
            if l_data.word != "":
                translate_raw_tags(l_data)
                linkages.append(l_data)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            for link_node in node.find_child(NodeKind.LINK):
                link_str = clean_node(wxr, None, link_node)
                # "อรรถาภิธาน:" is the Thai "Thesaurus:" namespace prefix;
                # the check on ``source`` prevents recursing into a
                # thesaurus page from within a thesaurus page.
                if link_str.startswith("อรรถาภิธาน:") and not source.startswith(
                    "อรรถาภิธาน:"
                ):
                    extract_thesaurus_page(
                        wxr, word_entry, linkage_type, link_str, sense
                    )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            link_str = clean_node(wxr, None, node)
            if link_str != "":
                l_data = Linkage(word=link_str, sense=sense, raw_tags=raw_tags)
                translate_raw_tags(l_data)
                linkages.append(l_data)
        elif isinstance(node, str) and ("-" in node or "–" in node):
            # Text after a hyphen or en dash is a sense gloss; it is
            # applied retroactively to all words already collected from
            # this list item, and the remainder of the item is consumed.
            if "-" in node:
                sense = node[node.index("-") + 1 :]
            elif "–" in node:
                sense = node[node.index("–") + 1 :]
            sense = clean_node(
                wxr,
                None,
                [sense] + list_item.children[index + 1 :],
            ).strip()
            for l_data in linkages:
                l_data.sense = sense
            break
        elif isinstance(node, TemplateNode) and node.template_name in [
            "qualifier",
            "q",
            "qual",
            "qf",
        ]:
            # Qualifier text like "(formal, rare)" becomes raw tags for
            # the linkage words that follow in this item.
            text = clean_node(wxr, None, node).strip("() ")
            for raw_tag in text.split(","):
                raw_tag = raw_tag.strip()
                if raw_tag != "":
                    raw_tags.append(raw_tag)
        elif isinstance(node, TemplateNode) and node.template_name == "zh-l":
            linkages.extend(extract_zh_l_template(wxr, node, sense, raw_tags))

    getattr(word_entry, linkage_type).extend(linkages)

142 

143 

def extract_thesaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    linkage_type: str,
    page_title: str,
    sense: str,
) -> None:
    """Load a thesaurus page (namespace 110) and pull matching linkages.

    Only sections whose language, part-of-speech title and linkage type
    agree with ``word_entry`` and ``linkage_type`` are extracted.
    """
    page = wxr.wtp.get_page(page_title, 110)
    if page is None or page.body is None:
        return
    root = wxr.wtp.parse(page.body)
    for lang_level in root.find_child(NodeKind.LEVEL2):
        # Level-2 headings look like "ภาษาไทย" ("Thai language"); strip
        # the "ภาษา" ("language") prefix before comparing.
        lang_name = clean_node(wxr, None, lang_level.largs).removeprefix(
            "ภาษา"
        )
        if lang_name != word_entry.lang:
            continue
        for pos_level in lang_level.find_child(NodeKind.LEVEL3):
            if clean_node(wxr, None, pos_level.largs) != word_entry.pos_title:
                continue
            for sub_level in pos_level.find_child_recursively(
                LEVEL_KIND_FLAGS
            ):
                title = clean_node(wxr, None, sub_level.largs)
                if LINKAGE_SECTIONS.get(title) == linkage_type:
                    extract_linkage_section(
                        wxr,
                        word_entry,
                        sub_level,
                        linkage_type,
                        source=page_title,
                        sense=sense,
                    )

179 

180 

def extract_ws_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    source: str,
    sense: str,
) -> None:
    """Add the word in argument 2 of a ``ws`` (Wikisaurus) template."""
    word = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if word == "":
        return
    getattr(word_entry, linkage_type).append(
        Linkage(word=word, source=source, sense=sense)
    )

193 

194 

# Maps inline linkage template names (and their aliases) to the name of
# the WordEntry list field that receives the extracted words.
LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "synsee": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "cot": "coordinate_terms",
    "coordinate terms": "coordinate_terms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
}

208 

209 

def extract_syn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract words from an inline linkage template ("syn", "ant", …).

    Positional arguments from 2 onward are linkage words; an argument
    that is a thesaurus page title ("อรรถาภิธาน:…") is expanded
    recursively via ``extract_thesaurus_page``.  Extracted words carry
    the gloss of the most recently parsed sense.
    """
    # Guard the empty-``senses`` case: these templates can occur before
    # any gloss has been parsed, and ``senses[-1]`` would raise
    # IndexError.
    sense = (
        " ".join(word_entry.senses[-1].glosses)
        if len(word_entry.senses) > 0
        else ""
    )
    for arg_name in count(2):
        if arg_name not in t_node.template_parameters:
            break
        arg_value = clean_node(wxr, None, t_node.template_parameters[arg_name])
        if arg_value.startswith("อรรถาภิธาน:"):
            extract_thesaurus_page(
                wxr, word_entry, linkage_type, arg_value, sense
            )
        elif arg_value != "":
            getattr(word_entry, linkage_type).append(
                Linkage(word=arg_value, sense=sense)
            )

229 

230 

def extract_zh_dial_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    sense: str,
) -> None:
    """Parse the expanded ``zh-dial`` dialect-synonym table.

    The table has a header cell naming a Chinese variety per row, region
    links inside the data cells, dialect words in ``<span lang="zh">``
    tags, and small note symbols that are resolved via the "หมายเหตุ"
    ("notes") row at the bottom of the table.
    """
    from .sound import split_zh_pron_raw_tag

    linkage_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        # First pass over the rows: build the note_symbol -> note text
        # mapping from the "หมายเหตุ" (notes) row.
        is_note_row = False
        note_tags = {}
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    is_note_row = clean_node(wxr, None, cell_node) == "หมายเหตุ"
                elif is_note_row:
                    # Notes look like "symbol - explanation; symbol - …".
                    for note_str in clean_node(wxr, None, cell_node).split(";"):
                        if "-" in note_str:
                            note_symbol, note = note_str.split("-", maxsplit=1)
                            note_symbol = note_symbol.strip()
                            note = note.strip()
                            if note_symbol != "" and note != "":
                                note_tags[note_symbol] = note
        # Second pass: extract the dialect words themselves.
        lang_tags = []
        region_tags = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                # Row header names the language/variety; carried across
                # iterations until the next header cell appears.
                lang_tags = split_zh_pron_raw_tag(
                    clean_node(wxr, None, header_node)
                )
            if lang_tags == ["หมายเหตุ"]:  # skip last note row
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                for link_node in cell_node.find_child(NodeKind.LINK):
                    region_tags = split_zh_pron_raw_tag(
                        clean_node(wxr, None, link_node)
                    )
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        # A dialect word; the page's own title is skipped.
                        l_data = Linkage(word=span_text, sense=sense)
                        if len(lang_tags) > 0:
                            l_data.raw_tags.extend(lang_tags)
                        if len(region_tags) > 0:
                            l_data.raw_tags.extend(region_tags)
                        translate_raw_tags(l_data)
                        linkage_list.append(l_data)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and len(linkage_list) > 0
                    ):
                        # Small-font text holds note symbols attached to
                        # the preceding word; resolve them through
                        # ``note_tags`` where possible.
                        for note_symbol in span_text.split(","):
                            note_symbol = note_symbol.strip()
                            raw_tag = note_symbol
                            if note_symbol in note_tags:
                                raw_tag = note_tags[note_symbol]
                            if raw_tag != "":
                                linkage_list[-1].raw_tags.append(raw_tag)
                                translate_raw_tags(linkage_list[-1])

    getattr(word_entry, linkage_type).extend(linkage_list)

306 

307 

def extract_zh_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str],
) -> list[Linkage]:
    """Extract Chinese linkage words from an expanded ``zh-l`` template.

    Template argument 2, when present, overrides the passed-in ``sense``;
    a Latin-script span supplies the romanization shared by all words.
    Returns one Linkage per non-empty, non-"/" Chinese word span.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    new_sense = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if new_sense != "":
        sense = new_sense
    roman = ""
    for latn_span in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="Latn"
    ):
        roman = clean_node(wxr, None, latn_span)
    results = []
    for zh_span in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        word = clean_node(wxr, None, zh_span)
        l_data = Linkage(
            sense=sense,
            raw_tags=raw_tags,
            roman=roman,
            word=word,
        )
        lang_attr = zh_span.attrs.get("lang", "")
        if lang_attr == "zh-Hant":
            l_data.tags.append("Traditional-Chinese")
        elif lang_attr == "zh-Hans":
            l_data.tags.append("Simplified-Chinese")
        if word not in ["/", ""]:
            translate_raw_tags(l_data)
            results.append(l_data)

    return results