Coverage for src/wiktextract/extractor/ku/pos.py: 78%
209 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1import re
2from itertools import count
4from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .example import extract_example_list_item
9from .form_table import (
10 extract_ku_tewîn_lk_template,
11 extract_ku_tewîn_nav_template,
12)
13from .models import AltForm, Form, Sense, WordEntry
14from .section_titles import POS_DATA
15from .tags import TAGS, translate_raw_tags
# Exact template names that are handled as "form of"/"alternative of"
# templates; names matched by suffix are listed separately in
# FORM_OF_TEMPLATE_SUFFIXES.
FORM_OF_TEMPLATES = frozenset(
    {
        "formeke peyvê",
        "inflection of",
        "dem2",
        "guherto",
        "guharto",
        "rastnivîs",
        "şaşnivîs",
        "şaşî",
        "kevnbûyî",
        "binêre",
        "bnr",
        "binêre2",
        "bnr2",
        "awayekî din",
        "ad",
        "komparatîv",
        "kom",
        "sûperlatîv",
        "sûp",
        "dem",
        "dema-bê",
        "dema-fireh",
        "raboriya-sade",
        "rehê dema niha",
    }
)
# Template-name endings that also mark a form-of template. This must stay a
# tuple because it is passed directly to `str.endswith()`.
FORM_OF_TEMPLATE_SUFFIXES = (
    "-dema-bê",
    "-dema-bê-p",
    "-dema-niha",
    "-dema-niha-p",
    "-fermanî",
)
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Append a new entry for a part-of-speech section and fill it in.

    A deep copy of `base_data` is appended to `page_data`; its POS fields
    come from `POS_DATA[pos_title]`. Senses are read from "#" lists, the
    nodes before the first such list are handed to the POS header
    extractor, and direct child form-of templates become extra senses.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Index of the first gloss list among the section children; everything
    # before it is treated as the POS header.
    first_gloss_index = len(level_node.children)
    for index, list_node in level_node.find_child(NodeKind.LIST, True):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, list_item)
            first_gloss_index = min(first_gloss_index, index)

    header_nodes = level_node.children[:first_gloss_index]
    extract_pos_header_nodes(wxr, entry, header_nodes)

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name == "binêre/el":
            extract_binêre_el_template(wxr, entry, t_node)
            continue
        if name not in FORM_OF_TEMPLATES and not name.endswith(
            FORM_OF_TEMPLATE_SUFFIXES
        ):
            continue
        # A form-of template placed directly in the section (outside any
        # list) yields a sense of its own, even if its text is empty.
        sense = Sense()
        extract_form_of_template(wxr, sense, t_node)
        gloss = clean_node(wxr, sense, t_node)
        if gloss != "":
            sense.glosses.append(gloss)
        entry.senses.append(sense)
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Turn one gloss list item into a sense appended to `word_entry`.

    When `parent_sense` is given, the new sense starts as a deep copy of
    it. Child "#:"/"#*" lists are passed to the example extractor and
    child "##" lists are processed recursively with this sense as parent.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            # Nested lists are handled after the gloss text is built.
            if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
                gloss_parts.append(child)
            continue

        name = child.template_name
        if name in ("f", "ferhengok"):
            extract_ferhengok_template(wxr, sense, child)
        elif name in FORM_OF_TEMPLATES or name.endswith(
            FORM_OF_TEMPLATE_SUFFIXES
        ):
            extract_form_of_template(wxr, sense, child)
            gloss_parts.append(child)
        elif name == "bajar":
            clean_node(wxr, sense, child)
            sense.topics.append("city")
        else:
            text = clean_node(wxr, sense, child)
            # Template output fully wrapped in parentheses is kept as a
            # raw tag instead of gloss text.
            if text.startswith("(") and text.endswith(")"):
                sense.raw_tags.append(text.strip("() "))
            else:
                gloss_parts.append(text)

    gloss = clean_node(wxr, sense, gloss_parts)
    if gloss != "":
        sense.glosses.append(gloss)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith((":", "*")):
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )
        elif marker.endswith("#"):
            for nested_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, nested_item, sense)

    # A sense without gloss text was not appended above; keep it anyway
    # when it collected examples.
    if not sense.glosses and sense.examples:
        word_entry.senses.append(sense)
def extract_ferhengok_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the labels shown by a "ferhengok" template to `sense.raw_tags`.

    The cleaned template text has surrounding parentheses and spaces
    removed and is split on commas, " an " and " û "; empty pieces are
    dropped.
    """
    # https://ku.wiktionary.org/wiki/Şablon:ferhengok
    label_text = clean_node(wxr, sense, t_node).strip("() ")
    labels = (piece.strip() for piece in re.split(r",| an | û ", label_text))
    sense.raw_tags.extend(label for label in labels if label != "")
# Names of the headword-line templates recognised in a POS section.
# https://ku.wiktionary.org/wiki/Alîkarî:Cureyên_peyvan
POS_HEADER_TEMPLATES = frozenset(
    {
        "navdêr",
        "serenav",
        "lêker",
        "rengdêr",
        "hoker",
        "cînav",
        "baneşan",
        "daçek",
        "pêşdaçek",
        "paşdaçek",
        "bazinedaçek",
        "girêdek",
        "artîkel",
        "pirtik",
        "navgir",
        "paşgir",
        "pêşgir",
        "reh",
        "biwêj",
        "hevok",
        "gp",
        "hejmar",
        "tîp",
        "sembol",
        "kurtenav",
    }
)
def extract_pos_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Process the templates that appear before the first gloss list.

    Templates named in `POS_HEADER_TEMPLATES` may add a "romanization"
    form taken from their `tr` argument and are cleaned against
    `word_entry`. Noun, verb and inflection-table templates are then
    passed to their dedicated extractors. Non-template nodes are ignored.
    """
    for node in nodes:
        if not isinstance(node, TemplateNode):
            continue
        name = node.template_name

        if name in POS_HEADER_TEMPLATES:
            form = Form(
                form=clean_node(
                    wxr, None, node.template_parameters.get("tr", "")
                ),
                tags=["romanization"],
            )
            # "-" is skipped in the same way as a missing transliteration.
            if form.form not in ["", "-"]:
                word_entry.forms.append(form)
            clean_node(wxr, word_entry, node)

        if name in ("navdêr", "serenav"):
            extract_navdêr_template(wxr, word_entry, node)
        elif name == "lêker":
            extract_lêker_template(wxr, word_entry, node)
        elif name in ("ku-tewîn-nav", "ku-tew-nav", "ku-tewîn-rd"):
            extract_ku_tewîn_nav_template(wxr, word_entry, node)
        elif name == "ku-tewîn-lk":
            extract_ku_tewîn_lk_template(wxr, word_entry, node)
def extract_navdêr_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract gender/number tags and forms from a noun header template.

    The `z` argument is mapped to gender tags and the `j` argument to
    number tags on `word_entry`. Each form argument listed in `FORMS`, and
    its numbered variants ("m2", "m3", ...), is added as a form. Text in
    parentheses inside `<i>` tags of the expanded template is added as a
    "romanization" form, and the expanded template is cleaned against
    `word_entry`.
    """
    # https://ku.wiktionary.org/wiki/Şablon:navdêr
    # Şablon:serenav
    GENDERS = {
        "n": "masculine",
        "n+": "masculine-usually",
        "m": "feminine",
        "m+": "feminine-usually",
        "nt": "gender-neutral",
        "mn": ["feminine", "masculine"],
        "m/n": ["feminine", "masculine"],
        "g": "common-gender",
    }
    NUMBERS = {
        "p": "plural",
        "p+": "plural-normally",
        "tp": "plural-only",
        "y": "singular",
        "nj": "uncountable",
        "j/nj": ["countable", "uncountable"],
    }
    params = t_node.template_parameters
    # Gender ("z") is resolved before number ("j"), so tag order is stable.
    for arg_name, tag_map in (("z", GENDERS), ("j", NUMBERS)):
        arg_value = clean_node(wxr, None, params.get(arg_name, ""))
        if arg_value not in tag_map:
            continue
        tag = tag_map[arg_value]
        if isinstance(tag, str):
            word_entry.tags.append(tag)
        elif isinstance(tag, list):
            word_entry.tags.extend(tag)

    FORMS = {
        "m": "feminine",
        "n": "masculine",
        "nt": "gender-neutral",
        "y": "singular",
        "p": "plural",
        "np": ["masculine", "plural"],
        "mp": ["feminine", "plural"],
        "lk": "verb-from-noun",
        "hanja": "Hanja",
    }
    for base_arg, tag in FORMS.items():
        if base_arg not in params:
            continue
        extract_navdêr_template_form(wxr, word_entry, t_node, base_arg, tag)
        for index in count(2):
            # Build every numbered name from the unchanged base name.
            # Appending the index to a running string would look up
            # "m2", "m23", "m234", ... and never reach "m3".
            numbered_arg = f"{base_arg}{index}"
            if numbered_arg not in params:
                break
            extract_navdêr_template_form(
                wxr, word_entry, t_node, numbered_arg, tag
            )

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for i_tag in expanded_node.find_html_recursively("i"):
        i_text = clean_node(wxr, None, i_tag)
        if i_text.startswith("(") and i_text.endswith(")"):
            word_entry.forms.append(
                Form(form=i_text.strip("() "), tags=["romanization"])
            )
    clean_node(wxr, word_entry, expanded_node)
def extract_navdêr_template_form(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    arg_name: str,
    tag: str | list[str],
) -> None:
    """Add the value of template argument `arg_name` as a tagged form.

    Nothing happens when the argument is absent. A form whose cleaned
    text is empty is not appended.
    """
    params = t_node.template_parameters
    if arg_name not in params:
        return

    new_form = Form(form=clean_node(wxr, None, params[arg_name]))
    if isinstance(tag, str):
        new_form.tags.append(tag)
    elif isinstance(tag, list):
        new_form.tags.extend(tag)

    if new_form.form == "":
        return
    word_entry.forms.append(new_form)
def extract_lêker_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract tags and forms from a verb header template.

    The `c` argument is split on "-" and each known code is mapped to a
    tag on `word_entry`. The form arguments `nd`, `niha`, `borî` and
    `subj` are passed to `extract_lêker_template_form`.
    """
    # https://ku.wiktionary.org/wiki/Şablon:lêker
    category_tags = {
        "gh": "transitive",
        "ngh": "intransitive",
        "x": "proper-noun",
        "p": "compound",
        "h": "compound",
        "b": "idiomatic",
    }
    c_value = clean_node(wxr, None, t_node.template_parameters.get("c", ""))
    word_entry.tags.extend(
        category_tags[code]
        for code in c_value.split("-")
        if code in category_tags
    )

    form_tags = {
        "nd": "noun-from-verb",
        "niha": "present",
        "borî": "past",
        "subj": "subjunctive",
    }
    for form_arg, form_tag in form_tags.items():
        extract_lêker_template_form(
            wxr, word_entry, t_node, form_arg, form_tag
        )
def extract_lêker_template_form(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    arg_name: str,
    tag: str,
) -> None:
    """Add the form in template argument `arg_name`, plus its "2" variant.

    The romanization is read from the argument named `arg_name + "tr"`.
    Unless `arg_name` is "nd" or already ends in "2", the argument
    `arg_name + "2"` is handled the same way afterwards. Processing stops
    at the first argument name that is not present in the template.
    """
    params = t_node.template_parameters
    candidates = [arg_name]
    if arg_name != "nd" and not arg_name.endswith("2"):
        candidates.append(arg_name + "2")

    for name in candidates:
        if name not in params:
            return
        form = Form(
            form=clean_node(wxr, None, params[name]),
            tags=[tag],
            roman=clean_node(wxr, None, params.get(name + "tr", "")),
        )
        if form.form != "":
            word_entry.forms.append(form)
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record the word a form-of/alternative-of template points to.

    Which template arguments are inspected depends on the template name.
    "binêre"/"bnr" add every non-empty argument to `sense.alt_of` with an
    "alt-of" tag. All other handled templates add only the first
    non-empty argument to `sense.form_of` with a "form-of" tag. For
    "formeke peyvê"/"inflection of", positional arguments from 4 onward
    are looked up in `TAGS` and added to `sense.tags`.
    """
    # Şablon:formeke peyvê
    name = t_node.template_name
    has_tag_args = name in ("formeke peyvê", "inflection of")
    is_alt_of = False
    stop_after_first = True

    if has_tag_args:
        form_args = ["cude", 3, 2]
    elif name in (
        "dem2",
        "guherto",
        "guharto",
        "rastnivîs",
        "şaşnivîs",
        "şaşî",
        "kevnbûyî",
        "binêre2",
        "bnr2",
        "awayekî din",
        "ad",
        "komparatîv",
        "kom",
        "sûperlatîv",
        "sûp",
        "dema-bê",
        "dema-fireh",
        "raboriya-sade",
    ):
        form_args = [2]
    elif name.endswith(
        ("-dema-bê", "-dema-bê-p", "-dema-niha", "-dema-niha-p", "-fermanî")
    ):
        form_args = [1]
    elif name == "dem":
        form_args = [3]
    elif name == "rehê dema niha":
        extract_rehê_dema_niha_template(wxr, sense, t_node)
        return
    elif name in ("binêre", "bnr"):
        form_args = [1, 2, 3, 4]
        is_alt_of = True
        stop_after_first = False
    else:
        form_args = []

    relation_tag = "alt-of" if is_alt_of else "form-of"
    params = t_node.template_parameters
    for arg in form_args:
        word = clean_node(wxr, None, params.get(arg, ""))
        if word == "":
            continue

        target_list = sense.alt_of if is_alt_of else sense.form_of
        target_list.append(AltForm(word=word))
        if relation_tag not in sense.tags:
            sense.tags.append(relation_tag)

        if has_tag_args:
            # Grammatical tags are positional arguments 4, 5, ... and end
            # at the first missing index.
            for tag_arg in count(4):
                if tag_arg not in params:
                    break
                raw_tag = clean_node(wxr, None, params[tag_arg]).capitalize()
                if raw_tag not in TAGS:
                    continue
                tr_tag = TAGS[raw_tag]
                if isinstance(tr_tag, str):
                    sense.tags.append(tr_tag)
                elif isinstance(tr_tag, list):
                    sense.tags.extend(tr_tag)

        if stop_after_first:
            break
def extract_rehê_dema_niha_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add bold words of the expanded template to `sense.form_of`.

    The template is expanded and re-parsed. Each top-level bold node with
    non-empty text becomes an `AltForm`, and the "form-of" tag is added
    once.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    for bold_node in root.find_child(NodeKind.BOLD):
        word = clean_node(wxr, None, bold_node)
        if word == "":
            continue
        sense.form_of.append(AltForm(word=word))
        if "form-of" not in sense.tags:
            sense.tags.append("form-of")
def extract_binêre_el_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Mark an entry as an obsolete alternative of the template's target.

    The first positional argument is added to `alt_of` of the entry's
    last sense, together with the "alt-of" and "obsolete" tags. When the
    entry has no sense yet, a new "no-gloss" sense is created and
    appended. Nothing happens if the argument is empty.
    """
    target_word = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if target_word == "":
        return

    entry_has_senses = len(word_entry.senses) > 0
    if entry_has_senses:
        sense = word_entry.senses[-1]
    else:
        sense = Sense(tags=["no-gloss"])

    sense.alt_of.append(AltForm(word=target_word))
    sense.tags.extend(["alt-of", "obsolete"])
    if not entry_has_senses:
        word_entry.senses.append(sense)