Coverage for src/wiktextract/extractor/vi/linkage.py: 88%

180 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1import re 

2 

3from wikitextprocessor import ( 

4 HTMLNode, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .models import Form, Linkage, WordEntry 

14from .tags import translate_raw_tags 

15 

# Maps gloss-list linkage template names (including aliases) to the
# WordEntry attribute the extracted links should be stored in, e.g. both
# "antonym" and "def-ant" feed the "antonyms" field.  "inline alt forms" /
# "alti" are special: their data ends up in WordEntry.forms instead
# (handled by the caller via the "alt_forms" marker value).
GLOSS_LIST_LINKAGE_TEMPLATES = {
    "antonyms": "antonyms",
    "def-ant": "antonyms",
    "antonym": "antonyms",
    "coordinate terms": "coordinate_terms",
    "def-cot": "coordinate_terms",
    "def-coo": "coordinate_terms",
    "cot": "coordinate_terms",
    "holonyms": "holonyms",
    "holonym": "holonyms",
    "holo": "holonyms",
    "hypernyms": "hypernyms",
    "hyper": "hypernyms",
    "hyponyms": "hyponyms",
    "hypo": "hyponyms",
    "inline alt forms": "alt_forms",
    "alti": "alt_forms",
    "meronyms": "meronyms",
    "mero": "meronyms",
    "synonyms": "synonyms",
    "synonym": "synonyms",
    "def-syn": "synonyms",
    "synsee": "synonyms",
}

# Template names whose expansion is qualifier text (e.g. "{{q|informal}}").
# NOTE(review): "TEMPALTES" is a typo for "TEMPLATES"; kept as-is because
# the name is referenced elsewhere in this module.
QUALIFIER_TEMPALTES = ["qualifier", "qual", "q", "qf", "i"]

42 

43 

def extract_gloss_list_linkage_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    sense: str,
) -> None:
    """Extract linkages from a gloss-list linkage template (see
    ``GLOSS_LIST_LINKAGE_TEMPLATES``) expanded under a sense.

    The template is expanded to HTML and the child ``span`` tags of the
    top-level span are walked in order.  Spans whose ``lang`` attribute
    matches the template's first argument are the linkage words; romanization
    ("-Latn"/"tr"), gloss ("mention-gloss") and qualifier
    ("qualifier-content") spans annotate the words collected so far.
    A bare "," text node separates groups: the pending words are flushed
    into ``word_entry`` and the qualifier state is reset.

    When ``linkage_type`` is "alt_forms" the results are stored as
    ``Form`` objects in ``word_entry.forms`` (tagged "alternative");
    otherwise they are appended to ``getattr(word_entry, linkage_type)``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # First template argument is the language code of the linked words.
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    l_list = []  # words of the current comma-separated group
    raw_tags = []  # qualifiers applying to the current group
    for top_span_tag in expanded_node.find_html("span"):
        for node in top_span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "span":
                span_lang = node.attrs.get("lang", "")
                span_class = node.attrs.get("class", "").split()
                if span_lang == lang_code:
                    l_data = Linkage(
                        word=clean_node(wxr, None, node),
                        sense=sense,
                        raw_tags=raw_tags,
                    )
                    # Chinese script variants are marked via CSS classes.
                    if "Hant" in span_class:
                        l_data.tags.append("Traditional-Chinese")
                    elif "Hans" in span_class:
                        l_data.tags.append("Simplified-Chinese")
                    if l_data.word != "":
                        translate_raw_tags(l_data)
                        l_list.append(l_data)
                elif span_lang == f"{lang_code}-Latn" or "tr" in span_class:
                    # Romanization applies to all words gathered so far.
                    roman = clean_node(wxr, None, node)
                    for d in l_list:
                        d.roman = roman
                elif "mention-gloss" in span_class:
                    # A gloss span overrides the sense for the current group
                    # and for any words collected later in this template.
                    sense = clean_node(wxr, None, node)
                    for d in l_list:
                        d.sense = sense
                elif "qualifier-content" in span_class:
                    raw_tag_str = clean_node(wxr, None, node)
                    for raw_tag in raw_tag_str.split(","):
                        raw_tag = raw_tag.strip()
                        if raw_tag != "":
                            raw_tags.append(raw_tag)
            elif isinstance(node, str) and node.strip() == ",":
                # Group separator: flush pending words and reset qualifiers.
                if linkage_type == "alt_forms":
                    for l_data in l_list:
                        word_entry.forms.append(
                            Form(
                                form=l_data.word,
                                sense=l_data.sense,
                                tags=l_data.tags + ["alternative"],
                                raw_tags=l_data.raw_tags,
                                roman=l_data.roman,
                            )
                        )
                else:
                    getattr(word_entry, linkage_type).extend(l_list)
                l_list.clear()
                raw_tags.clear()

    # Flush the final (or only) group after the walk.
    if linkage_type == "alt_forms":
        for l_data in l_list:
            word_entry.forms.append(
                Form(
                    form=l_data.word,
                    sense=l_data.sense,
                    tags=l_data.tags + ["alternative"],
                    raw_tags=l_data.raw_tags,
                    roman=l_data.roman,
                )
            )
    else:
        getattr(word_entry, linkage_type).extend(l_list)

119 

120 

def extract_alt_form_section(
    wxr: WiktextractContext,
    base_data: WordEntry,
    page_data: list[WordEntry],
    level_node: LevelNode,
) -> None:
    """Collect alternative forms from the list items of this section.

    "alter"/"def-alt" templates supply the forms; qualifier templates
    preceding them supply raw tags for the forms in the same list item.
    The forms are attached to the last page entry when it shares the
    section's language, otherwise to ``base_data``.
    """
    collected = []
    for wiki_list in level_node.find_child(NodeKind.LIST):
        for item in wiki_list.find_child(NodeKind.LIST_ITEM):
            pending_tags = []
            for child in item.children:
                if not isinstance(child, TemplateNode):
                    continue
                if child.template_name in ("alter", "def-alt"):
                    collected.extend(
                        extract_alter_template(wxr, child, pending_tags)
                    )
                elif child.template_name in QUALIFIER_TEMPALTES:
                    pending_tags.extend(
                        extract_qualifier_template(wxr, child)
                    )

    if page_data and page_data[-1].lang == base_data.lang:
        page_data[-1].forms.extend(collected)
    else:
        base_data.forms.extend(collected)

147 

148 

def extract_alter_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Form]:
    """Expand an "alter"/"def-alt" template into alternative-form entries.

    Only spans whose ``lang`` attribute equals the template's first
    argument are taken; each non-empty one becomes a ``Form`` tagged
    "alternative" carrying the given ``raw_tags``.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    results = []
    for span in expanded.find_html(
        "span", attr_name="lang", attr_value=code
    ):
        text = clean_node(wxr, None, span)
        if text == "":
            continue
        new_form = Form(form=text, tags=["alternative"], raw_tags=raw_tags)
        translate_raw_tags(new_form)
        results.append(new_form)
    return results

166 

167 

def extract_qualifier_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[str]:
    """Split an expanded qualifier template into raw tag strings.

    The expansion is stripped of surrounding parentheses and divided on
    commas; empty fragments are dropped.
    """
    text = clean_node(wxr, None, t_node).strip("()")
    return [part.strip() for part in text.split(",") if part.strip() != ""]

177 

178 

def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Extract one linkage section and attach its data to page entries.

    Idiom sections use their own list format and are stored under
    "related"; other sections are read from column templates
    (col/der/rel families, "columns", "column") and plain wiki lists.
    A level-3 section applies to every entry sharing the last entry's
    language code; deeper sections apply to the last entry only.
    """
    linkages = []
    if linkage_type == "idioms":
        linkages += extract_idiom_section(wxr, level_node)
        # Idioms are stored in the "related" field of WordEntry.
        linkage_type = "related"
    else:
        for child in level_node.children:
            if isinstance(child, TemplateNode) and (
                child.template_name in ("columns", "column")
                or re.fullmatch(
                    r"(?:col|der|rel)(?:\d+)?", child.template_name
                )
            ):
                linkages += extract_col_template(wxr, child)
            elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                for item in child.find_child(NodeKind.LIST_ITEM):
                    linkages += extract_linkage_list_item(wxr, item)

    if level_node.kind == NodeKind.LEVEL3:
        for entry in page_data:
            if entry.lang_code == page_data[-1].lang_code:
                getattr(entry, linkage_type).extend(linkages)
    elif len(page_data) > 0:
        getattr(page_data[-1], linkage_type).extend(linkages)

206 

207 

def extract_col_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Linkage]:
    """Extract linkage words from a column template (col/der/rel etc.).

    The template is expanded and each HTML ``li`` element is scanned:
    a quoted “...” text fragment becomes the translation for the item,
    spans matching the template's language code become the words, and a
    "-Latn" span supplies romanization for the most recent word.
    For Chinese ("zh") every matching span is a separate word with a
    script tag; for other languages the first matching span is the word
    and later spans are stored in ``other`` on the last word.
    """
    l_list = []
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # First template argument is the language code of the listed words.
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for li_tag in expanded_template.find_html_recursively("li"):
        first_word = True
        translation = ""
        # A quoted fragment in the plain text of the item is its
        # translation (captured before the spans are processed).
        for node in li_tag.children:
            if isinstance(node, str):
                m = re.search(r"“(.+)”", node)
                if m is not None:
                    translation = m.group(1).strip()
        for span_tag in li_tag.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            span_class = span_tag.attrs.get("class", "")
            if span_lang.endswith("-Latn") and len(l_list) > 0:
                # Romanization span annotates the most recent word.
                l_list[-1].roman = clean_node(wxr, None, span_tag)
            elif span_lang == lang_code:
                if lang_code == "zh":
                    # Chinese items list script variants as separate words.
                    l_data = Linkage(word=clean_node(wxr, None, span_tag))
                    if "Hant" in span_class:
                        l_data.tags.append("Traditional-Chinese")
                    elif "Hans" in span_class:
                        l_data.tags.append("Simplified-Chinese")
                    l_list.append(l_data)
                elif not first_word:
                    # A later matching span is an alternate written form.
                    l_list[-1].other = clean_node(wxr, None, span_tag)
                else:
                    l_list.append(
                        Linkage(
                            word=clean_node(wxr, None, span_tag),
                            translation=translation,
                        )
                    )
                # NOTE(review): flag is reset after the first span in the
                # item's language — presumably intended so only one primary
                # word per item is created for non-"zh" languages.
                first_word = False

    return l_list

249 

250 

def extract_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
    """Extract linkages from one wiki list item.

    A "sense"/"s" template sets the sense for the following links;
    "l"/"link" templates and plain wiki links contribute the words.
    """
    results = []
    current_sense = ""
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            if child.template_name in ("sense", "s"):
                current_sense = clean_node(wxr, None, child).strip("(): ")
            elif child.template_name in ("l", "link"):
                results += extract_link_template(wxr, child, current_sense)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            link_text = clean_node(wxr, None, child)
            if link_text:
                results.append(Linkage(word=link_text, sense=current_sense))
    return results

267 

268 

def extract_link_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Expand an "l"/"link" template into linkages with the given sense.

    Only spans whose ``lang`` attribute matches the template's first
    argument (the language code) produce linkage words.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    return [
        Linkage(word=clean_node(wxr, None, span), sense=sense)
        for span in expanded.find_html("span")
        if span.attrs.get("lang", "") == code
    ]

285 

286 

def extract_idiom_section(
    wxr: WiktextractContext, level_node: LevelNode
) -> list[Linkage]:
    """Gather idiom linkages from every list item under this section."""
    idioms = []
    for wiki_list in level_node.find_child(NodeKind.LIST):
        for item in wiki_list.find_child(NodeKind.LIST_ITEM):
            idioms += extract_idiom_list_item(wxr, item)
    return idioms

296 

297 

def extract_idiom_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
    """Extract idioms from one list item.

    Bold nodes are the idiom words (tagged "idiomatic"); nested child
    lists hold per-idiom glosses appended to ``senses``; any remaining
    nodes after the last bold word form a single trailing ``sense``.
    """
    idioms = []
    last_bold_at = 0
    trailing = []
    for position, child in enumerate(list_item.children):
        is_wiki = isinstance(child, WikiNode)
        if is_wiki and child.kind == NodeKind.BOLD:
            idiom_word = clean_node(wxr, None, child)
            if idiom_word != "":
                last_bold_at = position
                idioms.append(Linkage(word=idiom_word, tags=["idiomatic"]))
        elif is_wiki and child.kind == NodeKind.LIST:
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                gloss = clean_node(wxr, None, sub_item.children)
                if gloss != "" and len(idioms) > 0:
                    idioms[-1].senses.append(gloss)
        elif position > last_bold_at:
            trailing.append(child)

    trailing_sense = clean_node(wxr, None, trailing).strip(": ")
    if trailing_sense != "" and len(idioms) > 0:
        idioms[-1].sense = trailing_sense

    return idioms