Coverage for src/wiktextract/extractor/ku/linkage.py: 80%

1import re

2from itertools import count

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode

6from ...page import clean_node

7from ...wxr_context import WiktextractContext

8from .models import Form, Linkage, WordEntry

9from .tags import translate_raw_tags

def extract_ku_form_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str = "",
    sense: str = "",
) -> None:
    """Extract a single form from an expanded ``ku-*`` template.

    The template is expanded and the ``<span>`` tags returned by
    ``find_html("span")`` are read in order: the cleaned text of the first
    span is stored as a raw tag, the cleaned text of the second span is the
    form itself. Any further spans are ignored.

    Nothing is stored when the form text is empty. Otherwise the form is
    appended to ``word_entry.forms`` when ``linkage_type`` is empty, or
    converted to a ``Linkage`` (with ``sense``) and appended to the
    ``word_entry`` attribute named by ``linkage_type``.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    form = Form(form="")
    for position, span in enumerate(expanded.find_html("span")):
        if position == 0:
            form.raw_tags.append(clean_node(wxr, None, span))
        elif position == 1:
            form.form = clean_node(wxr, None, span)

    if form.form == "":
        return

    translate_raw_tags(form)
    if linkage_type == "":
        word_entry.forms.append(form)
        return

    # NOTE(review): only `form.raw_tags` is copied to the Linkage; whatever
    # `translate_raw_tags` may have stored elsewhere on `form` is not carried
    # over - confirm this is intended.
    linkage_list = getattr(word_entry, linkage_type)
    linkage_list.append(
        Linkage(word=form.form, raw_tags=form.raw_tags, sense=sense)
    )

def extract_g_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str = "",
    sense: str = "",
    raw_tags: "list[str] | None" = None,
) -> None:
    """Extract a word with gender tags from a ``g`` template.

    The template is expanded to collect the text of every ``<abbr>`` inside
    ``<span class="gender">`` (empty text and ``"?"`` are skipped). The word
    is read from positional argument ``2`` (falling back to ``cuda``), the
    romanization from ``tr`` and the translation from ``w``.

    Args:
        wxr: Extraction context used for parsing and cleaning.
        word_entry: Entry that receives the extracted data.
        t_node: The ``g`` template node.
        linkage_type: Name of the ``word_entry`` list attribute to append a
            ``Linkage`` to; when empty a ``Form`` is appended to
            ``word_entry.forms`` instead.
        sense: Sense text stored on the ``Linkage``.
        raw_tags: Raw tags gathered so far by the caller. Gender tags are
            appended to this list in place, as before. When omitted, a fresh
            list is created for this call.
    """
    # A literal `[]` default would be shared by every call that omits the
    # argument, so gender tags appended below would leak from one template
    # into the next.
    if raw_tags is None:
        raw_tags = []

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="gender"
    ):
        for abbr_tag in span_tag.find_html("abbr"):
            raw_tag = clean_node(wxr, None, abbr_tag)
            if raw_tag not in ["", "?"]:
                raw_tags.append(raw_tag)

    # Both output kinds read the same template arguments; extract them once.
    params = t_node.template_parameters
    word = clean_node(wxr, None, params.get(2, params.get("cuda", "")))
    roman = clean_node(wxr, None, params.get("tr", ""))
    translation = clean_node(wxr, None, params.get("w", ""))
    if word == "":
        return

    if linkage_type == "":
        form = Form(
            form=word,
            roman=roman,
            translation=translation,
            raw_tags=raw_tags,
        )
        translate_raw_tags(form)
        word_entry.forms.append(form)
    else:
        l_data = Linkage(
            word=word,
            roman=roman,
            translation=translation,
            sense=sense,
            raw_tags=raw_tags,
        )
        translate_raw_tags(l_data)
        getattr(word_entry, linkage_type).append(l_data)

101

102

def extract_hw_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str = "",
    sense: str = "",
) -> None:
    """Extract forms from a ``hw``/``herwiha`` template.

    Positional arguments from ``5`` upward (until the first missing index)
    are cleaned and kept as raw tags when non-empty. In the expanded
    template, each ``<span>`` whose ``lang`` attribute equals the cleaned
    first argument yields a form; a following span whose ``lang`` ends with
    ``-Latn`` sets the romanization of the most recent form.

    The forms go to ``word_entry.forms`` when ``linkage_type`` is empty;
    otherwise they are converted to ``Linkage`` objects (with ``sense``) and
    appended to the attribute named by ``linkage_type``.
    """
    # https://ku.wiktionary.org/wiki/Şablon:hw
    params = t_node.template_parameters

    raw_tags = []
    arg_index = 5
    while arg_index in params:
        tag_text = clean_node(wxr, None, params[arg_index])
        if tag_text != "":
            raw_tags.append(tag_text)
        arg_index += 1

    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    lang_code = clean_node(wxr, None, params.get(1, ""))

    forms = []
    for span in expanded.find_html("span"):
        span_lang = span.attrs.get("lang", "")
        if span_lang != lang_code:
            # Not a form span; it may be the romanization of the last form.
            if span_lang.endswith("-Latn") and len(forms) > 0:
                forms[-1].roman = clean_node(wxr, None, span)
            continue
        form_text = clean_node(wxr, None, span)
        if form_text != "":
            forms.append(Form(form=form_text, raw_tags=raw_tags))

    if linkage_type == "":
        word_entry.forms.extend(forms)
        return

    target = getattr(word_entry, linkage_type)
    target.extend(
        [
            Linkage(
                word=found.form,
                roman=found.roman,
                sense=sense,
                raw_tags=found.raw_tags,
            )
            for found in forms
        ]
    )

146

147

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
    linkage_type: str,
    shared_tags: list[str] = [],
) -> None:
    """Extract linkage data from the direct children of a section node.

    Only list and template children are visited. ``kol``/``kol<N>``
    templates and ``stûn`` templates are handed to their dedicated
    extractors; every item of a list child is passed to
    ``extract_linkage_list_item`` with an empty sense and ``shared_tags``.
    Other templates are ignored.
    """
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode):
            if re.fullmatch(r"kol(?:\d+)?", child.template_name) is not None:
                extract_kol_template(wxr, word_entry, child, linkage_type)
                continue
            if child.template_name == "stûn":
                extract_stûn_template(wxr, word_entry, child, linkage_type)
                continue
        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr, word_entry, item, linkage_type, "", shared_tags
                )

168

169

def extract_kol_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract words from a ``kol``/``kol<N>`` column template.

    The ``sernav`` argument is used as the sense of every extracted word.
    Word arguments start at positional index ``3`` for ``kol`` and ``2`` for
    the other template names, and are read until the first missing index.

    A plain-string argument is one word, optionally followed by a
    ``<q:...>`` qualifier that becomes a raw tag. Any other argument is
    re-parsed as wikitext and processed like a linkage list item.

    Words are stored as ``Linkage`` objects on the ``word_entry`` attribute
    named by ``linkage_type``, or as ``Form`` objects on
    ``word_entry.forms`` when ``linkage_type`` is empty.
    """
    # https://ku.wiktionary.org/wiki/Şablon:kol
    params = t_node.template_parameters
    sense = clean_node(wxr, None, params.get("sernav", ""))
    first_arg = 3 if t_node.template_name == "kol" else 2
    for arg in count(first_arg):
        if arg not in params:
            break
        arg_value = params[arg]
        if isinstance(arg_value, str):
            word = arg_value.strip()
            if word == "":
                continue
            raw_tag = ""
            m = re.search(r"<q:(.+)>", word)
            if m is not None:
                word = word[: m.start()].strip()
                raw_tag = m.group(1).strip()

            if linkage_type != "":
                data = Linkage(word=word, sense=sense)
                target = getattr(word_entry, linkage_type)
            else:
                data = Form(form=word, sense=sense)
                target = word_entry.forms
            if raw_tag != "":
                data.raw_tags.append(raw_tag)
                translate_raw_tags(data)
            target.append(data)
        else:
            # Work on a copy: the list belongs to the template node, and
            # dropping its first element in place would alter the parsed
            # tree for any later use of `t_node`.
            if isinstance(arg_value, list):
                nodes = list(arg_value)
            else:
                nodes = [arg_value]
            if (
                len(nodes) > 0
                and isinstance(nodes[0], str)
                and nodes[0].strip() == ""
            ):
                nodes = nodes[1:]  # not preformatted node
            arg_value_node = wxr.wtp.parse(wxr.wtp.node_to_wikitext(nodes))
            extract_linkage_list_item(
                wxr, word_entry, arg_value_node, linkage_type, sense
            )

215

216

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
    shared_tags: "list[str] | None" = None,
) -> None:
    """Extract words and tags from the children of one list item.

    Link nodes and plain strings are cleaned and, when non-empty, stored as
    a ``Linkage`` on the ``word_entry`` attribute named by ``linkage_type``
    (or as a ``Form`` on ``word_entry.forms`` when ``linkage_type`` is
    empty). ``g``, ``ku-*`` and ``hw``/``herwiha`` templates are delegated to
    their extractors. An ``mj`` template contributes a raw tag that is added
    to the words already extracted from this item and kept for later ones.

    Args:
        wxr: Extraction context used for cleaning nodes.
        word_entry: Entry that receives the extracted data.
        list_item: Node whose children are scanned.
        linkage_type: Target attribute name, or ``""`` for forms.
        sense: Sense text attached to extracted linkages.
        shared_tags: Tags given to every word created directly here;
            treated as empty when omitted.
    """
    # Avoid a mutable default: a shared `[]` would be the same object for
    # every call that omits the argument.
    if shared_tags is None:
        shared_tags = []
    raw_tags = []
    forms = []
    for node in list_item.children:
        if (
            isinstance(node, WikiNode) and node.kind == NodeKind.LINK
        ) or isinstance(node, str):
            word = clean_node(wxr, None, node)
            if word != "":
                if linkage_type != "":
                    l_data = Linkage(
                        word=word,
                        sense=sense,
                        raw_tags=raw_tags,
                        tags=shared_tags,
                    )
                    forms.append(l_data)
                    translate_raw_tags(l_data)
                    getattr(word_entry, linkage_type).append(l_data)
                else:
                    form = Form(form=word, raw_tags=raw_tags, tags=shared_tags)
                    translate_raw_tags(form)
                    forms.append(form)
                    word_entry.forms.append(form)
        elif isinstance(node, TemplateNode):
            if node.template_name == "g":
                # `sense` is forwarded here as it is for the other template
                # helpers below; previously it was dropped, so linkages from
                # `g` templates lost the sense of the enclosing item.
                extract_g_template(
                    wxr,
                    word_entry,
                    node,
                    linkage_type=linkage_type,
                    sense=sense,
                    raw_tags=raw_tags,
                )
            elif node.template_name.startswith("ku-"):
                extract_ku_form_template(
                    wxr,
                    word_entry,
                    node,
                    linkage_type=linkage_type,
                    sense=sense,
                )
            elif node.template_name in ["herwiha", "hw"]:
                extract_hw_template(
                    wxr,
                    word_entry,
                    node,
                    linkage_type=linkage_type,
                    sense=sense,
                )
            elif node.template_name == "mj":
                raw_tag = clean_node(wxr, None, node).strip("() ")
                if raw_tag != "":
                    # Remember the tag for words that follow, and apply it
                    # to the words already collected from this item.
                    raw_tags.append(raw_tag)
                    for form in forms:
                        form.raw_tags.append(raw_tag)
                        translate_raw_tags(form)

280

281

def extract_stûn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract linkage list items from the first argument of ``stûn``.

    The first positional argument is converted back to wikitext and parsed;
    every list item of every list directly under the parsed root is passed
    to ``extract_linkage_list_item`` with an empty sense. Does nothing when
    the argument is missing.
    """
    raw_arg = t_node.template_parameters.get(1)
    if raw_arg is not None:
        parsed_arg = wxr.wtp.parse(wxr.wtp.node_to_wikitext(raw_arg))
        list_items = (
            item
            for list_node in parsed_arg.find_child(NodeKind.LIST)
            for item in list_node.find_child(NodeKind.LIST_ITEM)
        )
        for item in list_items:
            extract_linkage_list_item(wxr, word_entry, item, linkage_type, "")

297

298

# Maps a linkage template name to the name of the word-entry list attribute
# that receives its data. Templates are grouped by target attribute; the
# resulting key order matches the order in which they are listed.
LINKAGE_TEMPLATES = {
    template_name: field_name
    for field_name, template_names in (
        (
            "synonyms",
            ("hevmane", "hevwate", "hevmaneya peyvê", "hevmaneyên peyvê"),
        ),
        ("antonyms", ("dijmane", "dijmaneyên peyvê", "dijwate")),
        ("hypernyms", ("jornav", "hîpernîm")),
        ("hyponyms", ("jêrnav", "hîponîm")),
        ("coordinate_terms", ("termên koordîne", "peyvên koordîneyî")),
        ("forms", ("herwiha di rêza maneyê de", "herwiha-rêz")),
    )
    for template_name in template_names
}

316

317

def extract_nyms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract linkages from a template listed in ``LINKAGE_TEMPLATES``.

    In the expanded template, every ``<span>`` whose ``lang`` attribute
    equals the cleaned first argument starts a new ``Linkage``; its sense is
    the space-joined glosses of the last sense in ``word_entry.senses`` (or
    empty when there are none). A later span with class ``tr Latn`` sets the
    romanization of the latest linkage and one with class ``ann-pos`` adds a
    raw tag to it.

    The results are appended to the ``word_entry`` attribute that
    ``LINKAGE_TEMPLATES`` maps the template name to. For the ``forms``
    target each linkage is converted to a ``Form`` tagged ``alt-of``; only
    the word and sense are carried over in that conversion.
    """
    # https://ku.wiktionary.org/wiki/Modul:nyms
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    linkages = []
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span in expanded.find_html_recursively("span"):
        span_lang = span.attrs.get("lang", "")
        span_class = span.attrs.get("class", "")
        if span_lang == lang_code:
            word = clean_node(wxr, None, span)
            if len(word_entry.senses) > 0:
                sense = " ".join(word_entry.senses[-1].glosses)
            else:
                sense = ""
            linkages.append(Linkage(word=word, sense=sense))
            continue
        if len(linkages) == 0:
            # Annotations only make sense once a word has been seen.
            continue
        latest = linkages[-1]
        if span_class == "tr Latn":
            latest.roman = clean_node(wxr, None, span)
        elif span_class == "ann-pos":
            raw_tag = clean_node(wxr, None, span)
            if raw_tag != "":
                latest.raw_tags.append(raw_tag)
                translate_raw_tags(latest)

    field = LINKAGE_TEMPLATES[t_node.template_name]
    if field == "forms":
        linkages = [
            Form(form=linkage.word, tags=["alt-of"], sense=linkage.sense)
            for linkage in linkages
        ]
    getattr(word_entry, field).extend(linkages)