Coverage for src/wiktextract/extractor/ku/linkage.py: 80%

157 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2from itertools import count 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, Linkage, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_ku_form_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str = "",
    sense: str = "",
) -> None:
    """Extract a single form from an expanded form template.

    The template is expanded and the ``<span>`` tags yielded by
    ``find_html("span")`` are inspected: the text of the first span is
    stored as a raw tag, the text of the second span is the form itself.
    Any further spans are ignored.

    Nothing is stored when the form text is empty.  Otherwise the form is
    appended to ``word_entry.forms`` when ``linkage_type`` is empty, or a
    ``Linkage`` carrying ``sense`` is appended to the ``word_entry``
    attribute named by ``linkage_type``.
    """
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    form = Form(form="")
    for position, span_tag in enumerate(root.find_html("span")):
        if position > 1:
            # Only the first two spans carry data used here.
            continue
        span_text = clean_node(wxr, None, span_tag)
        if position == 0:
            form.raw_tags.append(span_text)
        else:
            form.form = span_text

    if form.form == "":
        return

    translate_raw_tags(form)
    if linkage_type == "":
        word_entry.forms.append(form)
        return

    linkage = Linkage(word=form.form, raw_tags=form.raw_tags, sense=sense)
    getattr(word_entry, linkage_type).append(linkage)

40 

41 

def extract_g_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str = "",
    sense: str = "",
    raw_tags: "list[str] | None" = None,
) -> None:
    """Extract a word plus its gender tags from a ``g`` template.

    The word is read from template parameter ``2`` (falling back to
    ``cuda``), the romanization from ``tr`` and the translation from ``w``.
    Gender abbreviations are read from ``<abbr>`` tags inside
    ``<span class="gender">`` of the expanded template; empty text and
    ``"?"`` are skipped.

    Args:
        wxr: Extraction context used for expanding and cleaning nodes.
        word_entry: Entry that receives the extracted data.
        t_node: The template node to process.
        linkage_type: Name of the ``word_entry`` list attribute that
            receives a ``Linkage``; when empty a ``Form`` is appended to
            ``word_entry.forms`` instead.
        sense: Sense text stored on the ``Linkage`` (unused for forms).
        raw_tags: Raw tags collected so far by the caller.  Gender tags are
            appended to this list in place.  When omitted, a fresh list is
            used for each call.

    Nothing is appended to ``word_entry`` when the extracted word is empty.
    """
    # A `[]` default would be created once and shared by every call that
    # omits `raw_tags`; because this function appends to the list, gender
    # tags from one template would leak into later calls.
    if raw_tags is None:
        raw_tags = []

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="gender"
    ):
        for abbr_tag in span_tag.find_html("abbr"):
            raw_tag = clean_node(wxr, None, abbr_tag)
            if raw_tag not in ["", "?"]:
                raw_tags.append(raw_tag)

    # Form and Linkage are built from the same template parameters.
    params = t_node.template_parameters
    word = clean_node(wxr, None, params.get(2, params.get("cuda", "")))
    roman = clean_node(wxr, None, params.get("tr", ""))
    translation = clean_node(wxr, None, params.get("w", ""))
    if word == "":
        return

    if linkage_type == "":
        form = Form(
            form=word,
            roman=roman,
            translation=translation,
            raw_tags=raw_tags,
        )
        translate_raw_tags(form)
        word_entry.forms.append(form)
    else:
        l_data = Linkage(
            word=word,
            roman=roman,
            translation=translation,
            sense=sense,
            raw_tags=raw_tags,
        )
        translate_raw_tags(l_data)
        getattr(word_entry, linkage_type).append(l_data)

101 

102 

def extract_hw_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str = "",
    sense: str = "",
) -> None:
    """Extract forms from an ``hw`` ("herwiha") template.

    Consecutive positional parameters starting at ``5`` are collected as
    raw tags shared by every extracted form.  In the expanded template,
    each ``<span>`` whose ``lang`` attribute equals parameter ``1`` yields
    a form, and a following span whose ``lang`` ends with ``-Latn`` sets
    the romanization of the most recent form.

    The forms are added to ``word_entry.forms`` when ``linkage_type`` is
    empty; otherwise they are converted to ``Linkage`` items carrying
    ``sense`` and added to the attribute named by ``linkage_type``.
    """
    # https://ku.wiktionary.org/wiki/Şablon:hw
    params = t_node.template_parameters

    # NOTE(review): unlike the sibling extractors, the raw tags collected
    # here are never passed through translate_raw_tags - confirm intended.
    shared_raw_tags = []
    position = 5
    while position in params:
        tag_text = clean_node(wxr, None, params[position])
        if tag_text != "":
            shared_raw_tags.append(tag_text)
        position += 1

    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    lang_code = clean_node(wxr, None, params.get(1, ""))
    found = []
    for span_tag in root.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang == lang_code:
            text = clean_node(wxr, None, span_tag)
            if text != "":
                found.append(Form(form=text, raw_tags=shared_raw_tags))
        elif found and span_lang.endswith("-Latn"):
            found[-1].roman = clean_node(wxr, None, span_tag)

    if linkage_type == "":
        word_entry.forms.extend(found)
        return

    target = getattr(word_entry, linkage_type)
    target.extend(
        [
            Linkage(
                word=item.form,
                roman=item.roman,
                sense=sense,
                raw_tags=item.raw_tags,
            )
            for item in found
        ]
    )

146 

147 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
    linkage_type: str,
    shared_tags: list[str] = [],
) -> None:
    """Extract linkage data from the direct children of a section node.

    Only list and template children are visited:

    * templates named ``kol`` optionally followed by digits go to
      ``extract_kol_template``;
    * the ``stûn`` template goes to ``extract_stûn_template``;
    * every item of a list goes to ``extract_linkage_list_item`` with an
      empty sense and ``shared_tags``.

    Other templates are ignored.  ``shared_tags`` is only forwarded, never
    modified in this function.
    """
    wanted_kinds = NodeKind.LIST | NodeKind.TEMPLATE
    for child in level_node.find_child(wanted_kinds):
        if isinstance(child, TemplateNode):
            name = child.template_name
            if re.fullmatch(r"kol(?:\d+)?", name) is not None:
                extract_kol_template(wxr, word_entry, child, linkage_type)
                continue
            if name == "stûn":
                extract_stûn_template(wxr, word_entry, child, linkage_type)
                continue
        if child.kind != NodeKind.LIST:
            continue
        for list_item in child.find_child(NodeKind.LIST_ITEM):
            extract_linkage_list_item(
                wxr, word_entry, list_item, linkage_type, "", shared_tags
            )

168 

169 

def extract_kol_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract the entries listed in a ``kol``/``kolN`` column template.

    The ``sernav`` parameter supplies the sense for every entry.  Entries
    are consecutive positional parameters starting at ``3`` for ``kol``
    and at ``2`` for any other template name; the first missing index ends
    the scan.

    A plain-string entry is a word, optionally followed by ``<q:...>``
    whose content becomes a raw tag.  It is stored as a ``Linkage`` on the
    attribute named by ``linkage_type``, or as a ``Form`` on
    ``word_entry.forms`` when ``linkage_type`` is empty.  Any other entry
    is re-parsed as wikitext and handed to ``extract_linkage_list_item``.
    """
    # https://ku.wiktionary.org/wiki/Şablon:kol
    params = t_node.template_parameters
    sense = clean_node(wxr, None, params.get("sernav", ""))
    first_index = 3 if t_node.template_name == "kol" else 2
    for index in count(first_index):
        if index not in params:
            break
        value = params[index]

        if not isinstance(value, str):
            nodes = value if isinstance(value, list) else [value]
            starts_with_blank = (
                len(nodes) > 0
                and isinstance(nodes[0], str)
                and nodes[0].strip() == ""
            )
            if starts_with_blank:
                nodes.pop(0)  # not preformatted node
            parsed = wxr.wtp.parse(wxr.wtp.node_to_wikitext(nodes))
            extract_linkage_list_item(
                wxr, word_entry, parsed, linkage_type, sense
            )
            continue

        word = value.strip()
        if word == "":
            continue

        raw_tag = ""
        qualifier = re.search(r"<q:(.+)>", word)
        if qualifier is not None:
            word = word[: qualifier.start()].strip()
            raw_tag = qualifier.group(1).strip()

        if linkage_type != "":
            item = Linkage(word=word, sense=sense)
        else:
            item = Form(form=word, sense=sense)
        if raw_tag != "":
            item.raw_tags.append(raw_tag)
            translate_raw_tags(item)

        if linkage_type != "":
            getattr(word_entry, linkage_type).append(item)
        else:
            word_entry.forms.append(item)

215 

216 

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
    shared_tags: "list[str] | None" = None,
) -> None:
    """Extract linkage words or forms from the children of one node.

    Args:
        wxr: Extraction context used for cleaning nodes.
        word_entry: Entry that receives the extracted data.
        list_item: Node whose ``children`` are scanned.
        linkage_type: Name of the ``word_entry`` list attribute that
            receives ``Linkage`` items; when empty, ``Form`` items are
            appended to ``word_entry.forms`` instead.
        sense: Sense text attached to extracted ``Linkage`` items.
        shared_tags: Tags passed to every item created directly from a
            link or string child.  Defaults to no tags.

    Child handling:

    * link nodes and plain strings with non-empty cleaned text become a
      ``Linkage``/``Form``;
    * ``g``, ``ku-*`` and ``herwiha``/``hw`` templates are delegated to
      their dedicated extractors;
    * an ``mj`` template yields a raw tag (surrounding parentheses and
      spaces stripped) that is added to the items already created from
      links/strings in this call and to those created afterwards.
    """
    # Avoid a mutable default argument shared between calls.
    if shared_tags is None:
        shared_tags = []
    raw_tags = []
    forms = []
    for node in list_item.children:
        if (
            isinstance(node, WikiNode) and node.kind == NodeKind.LINK
        ) or isinstance(node, str):
            word = clean_node(wxr, None, node)
            if word == "":
                continue
            if linkage_type != "":
                l_data = Linkage(
                    word=word,
                    sense=sense,
                    raw_tags=raw_tags,
                    tags=shared_tags,
                )
                forms.append(l_data)
                translate_raw_tags(l_data)
                getattr(word_entry, linkage_type).append(l_data)
            else:
                form = Form(form=word, raw_tags=raw_tags, tags=shared_tags)
                translate_raw_tags(form)
                forms.append(form)
                word_entry.forms.append(form)
        elif isinstance(node, TemplateNode):
            if node.template_name == "g":
                # Forward `sense` like the other template branches do, so
                # linkages from `g` keep the caller's sense text.
                extract_g_template(
                    wxr,
                    word_entry,
                    node,
                    linkage_type=linkage_type,
                    sense=sense,
                    raw_tags=raw_tags,
                )
            elif node.template_name.startswith("ku-"):
                extract_ku_form_template(
                    wxr,
                    word_entry,
                    node,
                    linkage_type=linkage_type,
                    sense=sense,
                )
            elif node.template_name in ["herwiha", "hw"]:
                extract_hw_template(
                    wxr,
                    word_entry,
                    node,
                    linkage_type=linkage_type,
                    sense=sense,
                )
            elif node.template_name == "mj":
                raw_tag = clean_node(wxr, None, node).strip("() ")
                if raw_tag != "":
                    # Remember the tag for items created later ...
                    raw_tags.append(raw_tag)
                    # ... and add it to the items created so far.
                    for form in forms:
                        form.raw_tags.append(raw_tag)
                        translate_raw_tags(form)

280 

281 

def extract_stûn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract linkage items from the lists inside a ``stûn`` template.

    The template's first positional parameter is re-parsed as wikitext and
    every list item found in its direct child lists is passed to
    ``extract_linkage_list_item`` with an empty sense.  Does nothing when
    the parameter is missing.
    """
    content = t_node.template_parameters.get(1)
    if content is None:
        return

    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(content))
    list_items = (
        item
        for list_node in root.find_child(NodeKind.LIST)
        for item in list_node.find_child(NodeKind.LIST_ITEM)
    )
    for item in list_items:
        extract_linkage_list_item(wxr, word_entry, item, linkage_type, "")

297 

298 

# Maps a ku.wiktionary linkage template name to the name of the word entry
# list attribute that receives the items extracted from that template.
LINKAGE_TEMPLATES = {
    template_name: field_name
    for field_name, template_names in (
        (
            "synonyms",
            (
                "hevmane",
                "hevwate",
                "hevmaneya peyvê",
                "hevmaneyên peyvê",
            ),
        ),
        ("antonyms", ("dijmane", "dijmaneyên peyvê", "dijwate")),
        ("hypernyms", ("jornav", "hîpernîm")),
        ("hyponyms", ("jêrnav", "hîponîm")),
        ("coordinate_terms", ("termên koordîne", "peyvên koordîneyî")),
        ("forms", ("herwiha di rêza maneyê de", "herwiha-rêz")),
    )
    for template_name in template_names
}

316 

317 

def extract_nyms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract linkage items from a template listed in LINKAGE_TEMPLATES.

    In the expanded template, every ``<span>`` (searched recursively)
    whose ``lang`` attribute equals template parameter ``1`` starts a new
    item.  A later span with class ``tr Latn`` sets the romanization of
    the latest item and a span with class ``ann-pos`` adds a raw tag to it.
    Each item's sense is the glosses of the last sense in ``word_entry``
    joined by spaces, or empty when there are no senses yet.

    The items are added to the ``word_entry`` attribute that
    ``LINKAGE_TEMPLATES`` maps the template name to; for ``forms`` they
    are first converted to ``Form`` objects tagged ``alt-of``.

    Raises:
        KeyError: If the template name is not in ``LINKAGE_TEMPLATES``.
    """
    # https://ku.wiktionary.org/wiki/Modul:nyms
    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if len(word_entry.senses) > 0:
        sense_text = " ".join(word_entry.senses[-1].glosses)
    else:
        sense_text = ""

    items = []
    for span_tag in root.find_html_recursively("span"):
        attrs = span_tag.attrs
        if attrs.get("lang", "") == lang_code:
            word = clean_node(wxr, None, span_tag)
            items.append(Linkage(word=word, sense=sense_text))
            continue
        if len(items) == 0:
            # Romanization and annotations need an item to attach to.
            continue
        latest = items[-1]
        css_class = attrs.get("class", "")
        if css_class == "tr Latn":
            latest.roman = clean_node(wxr, None, span_tag)
        elif css_class == "ann-pos":
            raw_tag = clean_node(wxr, None, span_tag)
            if raw_tag != "":
                latest.raw_tags.append(raw_tag)
                translate_raw_tags(latest)

    field = LINKAGE_TEMPLATES[t_node.template_name]
    if field == "forms":
        items = [
            Form(form=item.word, tags=["alt-of"], sense=item.sense)
            for item in items
        ]
    getattr(word_entry, field).extend(items)