Coverage for src/wiktextract/extractor/ku/linkage.py: 80%
157 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1import re
2from itertools import count
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, Linkage, WordEntry
9from .tags import translate_raw_tags
def extract_ku_form_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str = "",
    sense: str = "",
) -> None:
    """Extract a single form from an expanded ``ku-*`` template.

    The template is expanded and the ``<span>`` elements yielded by
    ``find_html("span")`` are read by position: the first one supplies a
    raw tag, the second one supplies the word itself. Nothing is stored
    when the word is empty.

    Args:
        wxr: Extraction context used to expand and clean wikitext.
        word_entry: Entry that receives the extracted data.
        t_node: The template node to expand.
        linkage_type: Name of the ``word_entry`` list attribute to append a
            ``Linkage`` to. When empty, a ``Form`` is appended to
            ``word_entry.forms`` instead.
        sense: Sense text stored on the ``Linkage`` (unused for forms).
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    word = ""
    raw_tags = []
    for index, span_tag in enumerate(expanded_node.find_html("span")):
        if index == 0:
            raw_tag = clean_node(wxr, None, span_tag)
            # Skip empty tags, as the other extractors in this module do.
            if raw_tag != "":
                raw_tags.append(raw_tag)
        elif index == 1:
            word = clean_node(wxr, None, span_tag)
            break  # later spans were never used

    if word == "":
        return

    if linkage_type == "":
        form = Form(form=word, raw_tags=raw_tags)
        translate_raw_tags(form)
        word_entry.forms.append(form)
    else:
        # Translate the Linkage itself. Translating a temporary Form and
        # then copying only its ``raw_tags`` would lose anything that
        # ``translate_raw_tags`` stores outside ``raw_tags`` (presumably
        # in ``tags`` - its implementation is not visible here).
        l_data = Linkage(word=word, raw_tags=raw_tags, sense=sense)
        translate_raw_tags(l_data)
        getattr(word_entry, linkage_type).append(l_data)
def extract_g_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str = "",
    sense: str = "",
    raw_tags: "list[str] | None" = None,
) -> None:
    """Extract a word with gender tags from a ``g`` template.

    Gender abbreviations are read from ``<abbr>`` elements inside
    ``<span class="gender">`` of the expanded template. The word comes from
    positional parameter 2 (falling back to ``cuda``), the romanization
    from ``tr`` and the translation from ``w``. Nothing is stored when the
    word is empty.

    Args:
        wxr: Extraction context used to expand and clean wikitext.
        word_entry: Entry that receives the extracted data.
        t_node: The template node to process.
        linkage_type: Name of the ``word_entry`` list attribute to append a
            ``Linkage`` to. When empty, a ``Form`` is appended to
            ``word_entry.forms`` instead.
        sense: Sense text stored on the ``Linkage`` (unused for forms).
        raw_tags: Raw tags collected so far by the caller. Gender tags are
            appended to this list in place, so the caller sees them too.
            A fresh list is used when omitted.
    """
    # A literal ``[]`` default would be shared by every call that omits the
    # argument, so gender tags would leak from one template to the next.
    if raw_tags is None:
        raw_tags = []

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="gender"
    ):
        for abbr_tag in span_tag.find_html("abbr"):
            raw_tag = clean_node(wxr, None, abbr_tag)
            if raw_tag not in ["", "?"]:
                raw_tags.append(raw_tag)

    params = t_node.template_parameters
    word = clean_node(wxr, None, params.get(2, params.get("cuda", "")))
    roman = clean_node(wxr, None, params.get("tr", ""))
    translation = clean_node(wxr, None, params.get("w", ""))
    if word == "":
        return

    if linkage_type == "":
        form = Form(
            form=word,
            roman=roman,
            translation=translation,
            raw_tags=raw_tags,
        )
        translate_raw_tags(form)
        word_entry.forms.append(form)
    else:
        l_data = Linkage(
            word=word,
            roman=roman,
            translation=translation,
            sense=sense,
            raw_tags=raw_tags,
        )
        translate_raw_tags(l_data)
        getattr(word_entry, linkage_type).append(l_data)
def extract_hw_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str = "",
    sense: str = "",
) -> None:
    """Extract the words listed by a ``hw``/``herwiha`` template.

    Positional parameters from 5 upwards are collected as raw tags until
    the first missing index. In the expanded template, every ``<span>``
    whose ``lang`` attribute equals parameter 1 yields a form, and a
    following span whose ``lang`` ends with ``-Latn`` supplies the
    romanization of the most recent form.

    The forms are added to ``word_entry.forms`` when ``linkage_type`` is
    empty; otherwise they are converted to ``Linkage`` objects (carrying
    ``sense``) and added to the attribute named by ``linkage_type``.
    """
    # https://ku.wiktionary.org/wiki/Şablon:hw
    params = t_node.template_parameters
    qualifiers = []
    position = 5
    while position in params:
        qualifier = clean_node(wxr, None, params[position])
        if qualifier != "":
            qualifiers.append(qualifier)
        position += 1

    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, params.get(1, ""))
    collected = []
    for span in expanded.find_html("span"):
        span_lang = span.attrs.get("lang", "")
        if span_lang == lang_code:
            text = clean_node(wxr, None, span)
            if text != "":
                collected.append(Form(form=text, raw_tags=qualifiers))
            continue
        if span_lang.endswith("-Latn") and len(collected) > 0:
            collected[-1].roman = clean_node(wxr, None, span)

    # NOTE(review): unlike the other extractors in this module, the raw
    # tags are never passed through translate_raw_tags here - confirm
    # whether that is intentional.
    if linkage_type == "":
        word_entry.forms.extend(collected)
        return

    target = getattr(word_entry, linkage_type)
    linkages = [
        Linkage(
            word=item.form,
            roman=item.roman,
            sense=sense,
            raw_tags=item.raw_tags,
        )
        for item in collected
    ]
    target.extend(linkages)
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
    linkage_type: str,
    shared_tags: "list[str] | None" = None,
) -> None:
    """Extract linkage data from the direct children of a section node.

    Only list and template children are inspected:

    * ``kol`` / ``kol<digits>`` templates go to ``extract_kol_template``;
    * ``stûn`` templates go to ``extract_stûn_template``;
    * every item of a list goes to ``extract_linkage_list_item`` with an
      empty sense and ``shared_tags``.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the extracted data.
        level_node: Section node whose children are scanned.
        linkage_type: Name of the ``word_entry`` attribute to fill; passed
            through to the helpers.
        shared_tags: Tags applied to every plain list item. Defaults to no
            tags.
    """
    # Avoid a mutable default: a literal ``[]`` would be one list object
    # shared by every call and by everything it is handed to.
    if shared_tags is None:
        shared_tags = []

    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(node, TemplateNode):
            if re.fullmatch(r"kol(?:\d+)?", node.template_name) is not None:
                extract_kol_template(wxr, word_entry, node, linkage_type)
            elif node.template_name == "stûn":
                extract_stûn_template(wxr, word_entry, node, linkage_type)
        elif node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr, word_entry, list_item, linkage_type, "", shared_tags
                )
def extract_kol_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract the entries of a ``kol`` (column list) template.

    The ``sernav`` parameter is used as the sense of every entry. Entries
    are read from consecutive positional parameters, starting at 3 for the
    template named exactly ``kol`` and at 2 for any other name, and
    stopping at the first missing index.

    A plain-string entry is the word itself, optionally followed by a
    ``<q:...>`` qualifier that becomes a raw tag. Any other entry is
    re-parsed and handed to ``extract_linkage_list_item``.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the extracted data.
        t_node: The ``kol`` template node.
        linkage_type: Name of the ``word_entry`` list attribute to append
            ``Linkage`` objects to. When empty, ``Form`` objects are
            appended to ``word_entry.forms`` instead.
    """
    # https://ku.wiktionary.org/wiki/Şablon:kol
    params = t_node.template_parameters
    sense = clean_node(wxr, None, params.get("sernav", ""))
    first_arg = 3 if t_node.template_name == "kol" else 2
    for arg in count(first_arg):
        if arg not in params:
            break
        arg_value = params[arg]
        if isinstance(arg_value, str):
            word = arg_value.strip()
            raw_tag = ""
            m = re.search(r"<q:(.+)>", word)
            if m is not None:
                raw_tag = m.group(1).strip()
                word = word[: m.start()].strip()
            if word == "":
                # Blank parameter, or nothing left once the qualifier is
                # removed: there is no word to record.
                continue
            if linkage_type != "":
                data = Linkage(word=word, sense=sense)
                target = getattr(word_entry, linkage_type)
            else:
                data = Form(form=word, sense=sense)
                target = word_entry.forms
            if raw_tag != "":
                data.raw_tags.append(raw_tag)
                translate_raw_tags(data)
            target.append(data)
        else:
            nodes = arg_value if isinstance(arg_value, list) else [arg_value]
            if (
                len(nodes) > 0
                and isinstance(nodes[0], str)
                and nodes[0].strip() == ""
            ):
                # Drop the leading whitespace-only text (the original
                # comment says "not preformatted node"). A slice is used
                # instead of ``pop(0)`` so the template node's own
                # parameter list is left untouched.
                nodes = nodes[1:]
            arg_value_node = wxr.wtp.parse(wxr.wtp.node_to_wikitext(nodes))
            extract_linkage_list_item(
                wxr, word_entry, arg_value_node, linkage_type, sense
            )
def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
    shared_tags: "list[str] | None" = None,
) -> None:
    """Extract words and tags from the children of one list item.

    Children are handled in document order:

    * link nodes and plain strings with non-empty cleaned text become a
      ``Linkage`` (appended to the ``linkage_type`` attribute) or, when
      ``linkage_type`` is empty, a ``Form`` (appended to
      ``word_entry.forms``);
    * ``g`` templates go to ``extract_g_template``, which also appends
      gender tags to this item's raw-tag list;
    * templates whose name starts with ``ku-`` go to
      ``extract_ku_form_template``;
    * ``herwiha`` / ``hw`` templates go to ``extract_hw_template``;
    * ``mj`` templates yield a raw tag that is added to this item's
      raw-tag list and to every word already created from a link or
      string in this item.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the extracted data.
        list_item: Node whose ``children`` are scanned.
        linkage_type: Target attribute name, or ``""`` for forms.
        sense: Sense text stored on created linkages.
        shared_tags: Tags given to every word created from a link or
            string. Defaults to no tags.
    """
    # Avoid a mutable default: a literal ``[]`` would be one list object
    # shared by every call and by every model it is handed to.
    if shared_tags is None:
        shared_tags = []

    raw_tags = []
    forms = []  # words created from links/strings; "mj" tags them later
    for node in list_item.children:
        if isinstance(node, str) or (
            isinstance(node, WikiNode) and node.kind == NodeKind.LINK
        ):
            word = clean_node(wxr, None, node)
            if word == "":
                continue
            if linkage_type != "":
                l_data = Linkage(
                    word=word,
                    sense=sense,
                    raw_tags=raw_tags,
                    tags=shared_tags,
                )
                forms.append(l_data)
                translate_raw_tags(l_data)
                getattr(word_entry, linkage_type).append(l_data)
            else:
                form = Form(form=word, raw_tags=raw_tags, tags=shared_tags)
                translate_raw_tags(form)
                forms.append(form)
                word_entry.forms.append(form)
        elif isinstance(node, TemplateNode):
            if node.template_name == "g":
                # ``sense`` is forwarded like for the other templates;
                # previously linkages created from "g" lost their sense.
                extract_g_template(
                    wxr,
                    word_entry,
                    node,
                    linkage_type=linkage_type,
                    sense=sense,
                    raw_tags=raw_tags,
                )
            elif node.template_name.startswith("ku-"):
                extract_ku_form_template(
                    wxr,
                    word_entry,
                    node,
                    linkage_type=linkage_type,
                    sense=sense,
                )
            elif node.template_name in ["herwiha", "hw"]:
                extract_hw_template(
                    wxr,
                    word_entry,
                    node,
                    linkage_type=linkage_type,
                    sense=sense,
                )
            elif node.template_name == "mj":
                raw_tag = clean_node(wxr, None, node).strip("() ")
                if raw_tag != "":
                    raw_tags.append(raw_tag)
                    for form in forms:
                        form.raw_tags.append(raw_tag)
                        translate_raw_tags(form)
def extract_stûn_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract linkage list items from the first argument of ``stûn``.

    The first positional parameter is re-parsed as wikitext, and every
    item of every list found directly under the parsed root is passed to
    ``extract_linkage_list_item`` with an empty sense. Does nothing when
    the parameter is absent.
    """
    column_arg = t_node.template_parameters.get(1)
    if column_arg is not None:
        parsed = wxr.wtp.parse(wxr.wtp.node_to_wikitext(column_arg))
        for list_node in parsed.find_child(NodeKind.LIST):
            for item in list_node.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr, word_entry, item, linkage_type, ""
                )
# Maps a linkage template name to the WordEntry attribute it fills.
# Built from per-attribute groups; insertion order follows the groups.
LINKAGE_TEMPLATES = {
    template_name: field_name
    for field_name, template_names in (
        (
            "synonyms",
            (
                "hevmane",
                "hevwate",
                "hevmaneya peyvê",
                "hevmaneyên peyvê",
            ),
        ),
        ("antonyms", ("dijmane", "dijmaneyên peyvê", "dijwate")),
        ("hypernyms", ("jornav", "hîpernîm")),
        ("hyponyms", ("jêrnav", "hîponîm")),
        ("coordinate_terms", ("termên koordîne", "peyvên koordîneyî")),
        ("forms", ("herwiha di rêza maneyê de", "herwiha-rêz")),
    )
    for template_name in template_names
}
def extract_nyms_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract linkages produced by a template listed in LINKAGE_TEMPLATES.

    The template is expanded and all ``<span>`` elements are visited
    recursively:

    * a span whose ``lang`` equals parameter 1 starts a new ``Linkage``
      whose sense is the space-joined glosses of the last sense of
      ``word_entry`` (empty when there are no senses);
    * a span with class ``tr Latn`` sets the romanization of the latest
      linkage;
    * a span with class ``ann-pos`` adds a non-empty raw tag to the latest
      linkage and translates it.

    The results are appended to the ``word_entry`` attribute that
    ``LINKAGE_TEMPLATES`` maps the template name to; for ``forms`` they
    are first converted to ``Form`` objects tagged ``alt-of``.
    """
    # https://ku.wiktionary.org/wiki/Modul:nyms
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    linkages = []
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span in expanded.find_html_recursively("span"):
        if span.attrs.get("lang", "") == lang_code:
            word = clean_node(wxr, None, span)
            if len(word_entry.senses) > 0:
                glosses = word_entry.senses[-1].glosses
            else:
                glosses = ""
            linkages.append(Linkage(word=word, sense=" ".join(glosses)))
            continue
        if len(linkages) == 0:
            # Romanization and annotation spans need a preceding word.
            continue
        span_class = span.attrs.get("class", "")
        if span_class == "tr Latn":
            linkages[-1].roman = clean_node(wxr, None, span)
        elif span_class == "ann-pos":
            raw_tag = clean_node(wxr, None, span)
            if raw_tag != "":
                linkages[-1].raw_tags.append(raw_tag)
                translate_raw_tags(linkages[-1])

    field = LINKAGE_TEMPLATES[t_node.template_name]
    if field == "forms":
        linkages = [
            Form(form=item.word, tags=["alt-of"], sense=item.sense)
            for item in linkages
        ]
    getattr(word_entry, field).extend(linkages)