Coverage for src/wiktextract/extractor/ku/pos.py: 78%
218 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import re
2from itertools import count
4from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .example import extract_example_list_item
9from .form_table import (
10 extract_ku_tewîn_lk_template,
11 extract_ku_tewîn_nav_template,
12)
13from .models import AltForm, Form, Sense, WordEntry
14from .section_titles import POS_DATA
15from .tags import TAGS, translate_raw_tags
# Exact template names that are routed to `extract_form_of_template()`.
FORM_OF_TEMPLATES = frozenset(
    {
        "formeke peyvê",
        "inflection of",
        "dem2",
        "guherto",
        "guharto",
        "rastnivîs",
        "şaşnivîs",
        "şaşî",
        "kevnbûyî",
        "binêre",
        "bnr",
        "binêre2",
        "bnr2",
        "awayekî din",
        "ad",
        "komparatîv",
        "kom",
        "sûperlatîv",
        "sûp",
        "dem",
        "dema-bê",
        "dema-fireh",
        "raboriya-sade",
        "rehê dema niha",
    }
)
# Template-name endings that are also treated as form-of templates; kept as a
# tuple so it can be passed directly to `str.endswith()`.
FORM_OF_TEMPLATE_SUFFIXES = (
    "-dema-bê",
    "-dema-bê-p",
    "-dema-niha",
    "-dema-niha-p",
    "-fermanî",
)
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Create a new entry for one part-of-speech section and fill it in.

    A deep copy of ``base_data`` is appended to ``page_data``; its POS and
    tags come from ``POS_DATA[pos_title]``. Gloss lists, the header nodes
    that precede the first gloss list, and direct-child form-of templates
    under ``level_node`` are then extracted into that entry.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Child index of the earliest gloss list; defaults to "no gloss list".
    header_end = len(level_node.children)
    for child_index, list_node in level_node.find_child(NodeKind.LIST, True):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, list_item)
            header_end = min(header_end, child_index)

    # Only the nodes in front of the first gloss list are header material.
    extract_pos_header_nodes(wxr, entry, level_node.children[:header_end])

    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name
        if name == "binêre/el":
            extract_binêre_el_template(wxr, entry, template)
            continue
        if not (
            name in FORM_OF_TEMPLATES
            or name.endswith(FORM_OF_TEMPLATE_SUFFIXES)
        ):
            continue
        form_sense = Sense()
        extract_form_of_template(wxr, form_sense, template)
        gloss_text = clean_node(wxr, form_sense, template)
        if gloss_text != "":
            form_sense.glosses.append(gloss_text)
        # The sense is kept even when the template produced no gloss text.
        entry.senses.append(form_sense)
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Build a sense from one gloss list item and attach it to ``word_entry``.

    The new sense starts as a deep copy of ``parent_sense`` when one is
    given. Templates inside the item contribute raw tags, form-of data or
    topics; the remaining nodes become the gloss text. Nested lists are
    processed afterwards as examples or as child glosses.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)

    gloss_parts = []
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            # Nested lists are handled separately below.
            if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
                gloss_parts.append(child)
            continue

        name = child.template_name
        if name in ("f", "ferhengok"):
            extract_ferhengok_template(wxr, sense, child)
        elif name in FORM_OF_TEMPLATES or name.endswith(
            FORM_OF_TEMPLATE_SUFFIXES
        ):
            extract_form_of_template(wxr, sense, child)
            gloss_parts.append(child)
        elif name == "bajar":
            clean_node(wxr, sense, child)
            sense.topics.append("city")
        else:
            expanded_text = clean_node(wxr, sense, child)
            is_label = expanded_text.startswith(
                "("
            ) and expanded_text.endswith(")")
            if is_label:
                # Parenthesised template output is treated as a tag.
                sense.raw_tags.append(expanded_text.strip("() "))
            else:
                gloss_parts.append(expanded_text)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith((":", "*")):
            for example_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(
                    wxr, word_entry, sense, example_item
                )
        elif marker.endswith("#"):
            for nested_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, nested_item, sense)

    # A sense without any gloss is still recorded when it has examples.
    if not sense.glosses and sense.examples:
        word_entry.senses.append(sense)
def extract_ferhengok_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Split the text of a ``ferhengok`` template into raw tags on ``sense``.

    The cleaned template text has surrounding parentheses and spaces
    removed, is split on commas and on the words " an " and " û ", and every
    non-empty piece is appended to ``sense.raw_tags``.
    """
    # https://ku.wiktionary.org/wiki/Şablon:ferhengok
    label_text = clean_node(wxr, sense, t_node).strip("() ")
    pieces = (part.strip() for part in re.split(r",| an | û ", label_text))
    sense.raw_tags.extend(piece for piece in pieces if piece != "")
# https://ku.wiktionary.org/wiki/Alîkarî:Cureyên_peyvan
# Template names that `extract_pos_header_nodes()` treats as POS headers.
POS_HEADER_TEMPLATES = frozenset(
    {
        "navdêr",
        "serenav",
        "lêker",
        "rengdêr",
        "hoker",
        "cînav",
        "baneşan",
        "daçek",
        "pêşdaçek",
        "paşdaçek",
        "bazinedaçek",
        "girêdek",
        "artîkel",
        "pirtik",
        "navgir",
        "paşgir",
        "pêşgir",
        "reh",
        "biwêj",
        "hevok",
        "gp",
        "hejmar",
        "tîp",
        "sembol",
        "kurtenav",
    }
)
def extract_pos_header_nodes(
    wxr: WiktextractContext, word_entry: WordEntry, nodes: list[WikiNode | str]
) -> None:
    """Extract header data from the nodes that precede the gloss list.

    Only template nodes are considered. POS header templates may add a
    "romanization" form from their ``tr`` argument; noun, verb and
    inflection-table templates are forwarded to their dedicated extractors.
    """
    for node in nodes:
        if not isinstance(node, TemplateNode):
            continue
        name = node.template_name

        if name in POS_HEADER_TEMPLATES:
            romanization = Form(
                form=clean_node(
                    wxr, None, node.template_parameters.get("tr", "")
                ),
                tags=["romanization"],
            )
            # "-" is skipped along with an empty value.
            if romanization.form not in ["", "-"]:
                word_entry.forms.append(romanization)
            clean_node(wxr, word_entry, node)

        if name in ("navdêr", "serenav"):
            extract_navdêr_template(wxr, word_entry, node)
        elif name == "lêker":
            extract_lêker_template(wxr, word_entry, node)
        elif name in ("ku-tewîn-nav", "ku-tew-nav", "ku-tewîn-rd"):
            extract_ku_tewîn_nav_template(wxr, word_entry, node)
        elif name == "ku-tewîn-lk":
            extract_ku_tewîn_lk_template(wxr, word_entry, node)
def extract_navdêr_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract tags and forms from a ``navdêr``/``serenav`` template.

    Reads the ``z`` (gender) and ``j`` (number) arguments into
    ``word_entry.tags``, then every form argument listed in ``FORMS`` plus
    its numbered variants (``m2``, ``m3``, ...) into ``word_entry.forms``.
    Finally the template is expanded and the headword line is scanned for
    romanization, canonical and transliteration forms.
    """
    # https://ku.wiktionary.org/wiki/Şablon:navdêr
    # Şablon:serenav
    GENDERS = {
        "n": "masculine",
        "n+": "masculine-usually",
        "m": "feminine",
        "m+": "feminine-usually",
        "nt": "gender-neutral",
        "mn": ["feminine", "masculine"],
        "m/n": ["feminine", "masculine"],
        "g": "common-gender",
    }
    z_arg = clean_node(wxr, None, t_node.template_parameters.get("z", ""))
    if z_arg in GENDERS:
        tag = GENDERS[z_arg]
        if isinstance(tag, str):
            word_entry.tags.append(tag)
        elif isinstance(tag, list):
            word_entry.tags.extend(tag)
    NUMBERS = {
        "p": "plural",
        "p+": "plural-normally",
        "tp": "plural-only",
        "y": "singular",
        "nj": "uncountable",
        "j/nj": ["countable", "uncountable"],
    }
    j_arg = clean_node(wxr, None, t_node.template_parameters.get("j", ""))
    if j_arg in NUMBERS:
        tag = NUMBERS[j_arg]
        if isinstance(tag, str):
            word_entry.tags.append(tag)
        elif isinstance(tag, list):
            word_entry.tags.extend(tag)

    FORMS = {
        "m": "feminine",
        "n": "masculine",
        "nt": "gender-neutral",
        "y": "singular",
        "p": "plural",
        "np": ["masculine", "plural"],
        "mp": ["feminine", "plural"],
        "lk": "verb-from-noun",
        "hanja": "Hanja",
    }
    for form_arg, tag in FORMS.items():
        if form_arg not in t_node.template_parameters:
            continue
        extract_navdêr_template_form(wxr, word_entry, t_node, form_arg, tag)
        for index in count(2):
            # Build each numbered name from the unmodified base name;
            # appending to the previous name would yield "m2", "m23", ...
            # and never find "m3".
            numbered_arg = f"{form_arg}{index}"
            if numbered_arg not in t_node.template_parameters:
                break
            extract_navdêr_template_form(
                wxr, word_entry, t_node, numbered_arg, tag
            )

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Italic text wrapped in parentheses is recorded as a romanization.
    for i_tag in expanded_node.find_html_recursively("i"):
        i_text = clean_node(wxr, None, i_tag)
        if i_text.startswith("(") and i_text.endswith(")"):
            word_entry.forms.append(
                Form(form=i_text.strip("() "), tags=["romanization"])
            )
    for main_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        for strong_tag in main_span_tag.find_html(
            "strong", attr_name="class", attr_value="headword"
        ):
            strong_str = clean_node(wxr, None, strong_tag)
            # A headword equal to the page title adds no information.
            if strong_str not in ["", wxr.wtp.title]:
                word_entry.forms.append(
                    Form(form=strong_str, tags=["canonical"])
                )
        for roman_span in main_span_tag.find_html(
            "span", attr_name="class", attr_value="headword-tr"
        ):
            roman = clean_node(wxr, None, roman_span)
            if roman != "":
                word_entry.forms.append(
                    Form(form=roman, tags=["transliteration"])
                )

    # NOTE(review): return value is discarded; presumably this call records
    # category links of the expanded template on word_entry - confirm.
    clean_node(wxr, word_entry, expanded_node)
318def extract_navdêr_template_form(
319 wxr: WiktextractContext,
320 word_entry: WordEntry,
321 t_node: TemplateNode,
322 arg_name: str,
323 tag: str | list[str],
324) -> None:
325 if arg_name not in t_node.template_parameters: 325 ↛ 326line 325 didn't jump to line 326 because the condition on line 325 was never true
326 return
327 form = Form(
328 form=clean_node(wxr, None, t_node.template_parameters[arg_name])
329 )
330 if isinstance(tag, str): 330 ↛ 332line 330 didn't jump to line 332 because the condition on line 330 was always true
331 form.tags.append(tag)
332 elif isinstance(tag, list):
333 form.tags.extend(tag)
334 if form.form != "": 334 ↛ exitline 334 didn't return from function 'extract_navdêr_template_form' because the condition on line 334 was always true
335 word_entry.forms.append(form)
def extract_lêker_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract tags and forms from a ``lêker`` template.

    The ``c`` argument is split on "-" and each recognised code becomes a
    tag on ``word_entry``; the ``nd``, ``niha``, ``borî`` and ``subj``
    arguments are passed on to ``extract_lêker_template_form``.
    """
    # https://ku.wiktionary.org/wiki/Şablon:lêker
    code_tags = {
        "gh": "transitive",
        "ngh": "intransitive",
        "x": "proper-noun",
        "p": "compound",
        "h": "compound",
        "b": "idiomatic",
    }
    codes = clean_node(wxr, None, t_node.template_parameters.get("c", ""))
    word_entry.tags.extend(
        code_tags[code] for code in codes.split("-") if code in code_tags
    )

    form_tags = (
        ("nd", "noun-from-verb"),
        ("niha", "present"),
        ("borî", "past"),
        ("subj", "subjunctive"),
    )
    for form_arg, form_tag in form_tags:
        extract_lêker_template_form(
            wxr, word_entry, t_node, form_arg, form_tag
        )
def extract_lêker_template_form(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    arg_name: str,
    tag: str,
) -> None:
    """Append the verb form in ``arg_name`` and, where allowed, its "2" twin.

    The romanization is read from the argument named ``arg_name + "tr"``.
    Unless ``arg_name`` is "nd" or already ends in "2", the argument
    ``arg_name + "2"`` is handled the same way, but only when ``arg_name``
    itself is present.
    """
    params = t_node.template_parameters
    arg_names = [arg_name]
    if arg_name != "nd" and not arg_name.endswith("2"):
        arg_names.append(arg_name + "2")

    for name in arg_names:
        # A missing argument also stops the lookup of its "2" variant.
        if name not in params:
            return
        new_form = Form(
            form=clean_node(wxr, None, params[name]),
            tags=[tag],
            roman=clean_node(wxr, None, params.get(name + "tr", "")),
        )
        if new_form.form != "":
            word_entry.forms.append(new_form)
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Record on ``sense`` the word that a form-of template refers to.

    The template name decides which arguments hold the referenced word.
    For ``binêre``/``bnr`` every non-empty candidate argument is stored in
    ``sense.alt_of`` with the "alt-of" tag; for all other templates only the
    first non-empty candidate is stored in ``sense.form_of`` with the
    "form-of" tag. ``formeke peyvê``/``inflection of`` additionally
    translate their arguments from position 4 onwards through ``TAGS``.
    ``rehê dema niha`` is delegated to its own extractor.
    """
    # Şablon:formeke peyvê
    template_name = t_node.template_name
    is_inflection = template_name in ["formeke peyvê", "inflection of"]
    is_alt_of = False
    break_first_arg = True
    if is_inflection:
        form_args = ["cude", 3, 2]
    elif template_name in [
        "dem2",
        "guherto",
        "guharto",
        "rastnivîs",
        "şaşnivîs",
        "şaşî",
        "kevnbûyî",
        "binêre2",
        "bnr2",
        "awayekî din",
        "ad",
        "komparatîv",
        "kom",
        "sûperlatîv",
        "sûp",
        "dema-bê",
        "dema-fireh",
        "raboriya-sade",
    ]:
        form_args = [2]
    elif template_name.endswith(FORM_OF_TEMPLATE_SUFFIXES):
        # Same suffix tuple the callers use to dispatch to this function.
        form_args = [1]
    elif template_name == "dem":
        form_args = [3]
    elif template_name == "rehê dema niha":
        extract_rehê_dema_niha_template(wxr, sense, t_node)
        return
    elif template_name in ["binêre", "bnr"]:
        form_args = [1, 2, 3, 4]
        is_alt_of = True
        break_first_arg = False
    else:
        form_args = []

    for arg in form_args:
        form_str = clean_node(
            wxr, None, t_node.template_parameters.get(arg, "")
        )
        if form_str == "":
            # Empty candidate: fall through to the next argument.
            continue
        if is_alt_of:
            sense.alt_of.append(AltForm(word=form_str))
            if "alt-of" not in sense.tags:
                sense.tags.append("alt-of")
        else:
            sense.form_of.append(AltForm(word=form_str))
            if "form-of" not in sense.tags:
                sense.tags.append("form-of")
        if is_inflection:
            # Positional arguments from 4 onwards are grammatical tags.
            for tag_arg in count(4):
                if tag_arg not in t_node.template_parameters:
                    break
                raw_tag = clean_node(
                    wxr, None, t_node.template_parameters[tag_arg]
                ).capitalize()
                if raw_tag in TAGS:
                    tr_tag = TAGS[raw_tag]
                    if isinstance(tr_tag, str):
                        sense.tags.append(tr_tag)
                    elif isinstance(tr_tag, list):
                        sense.tags.extend(tr_tag)
        if break_first_arg:
            break
def extract_rehê_dema_niha_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Take form-of targets from the bold text of the expanded template.

    The template is expanded and every non-empty bold child of the result
    is appended to ``sense.form_of``, tagging the sense "form-of" once.
    """
    template_text = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_text, expand_all=True)
    for bold in expanded.find_child(NodeKind.BOLD):
        target = clean_node(wxr, None, bold)
        if target == "":
            continue
        sense.form_of.append(AltForm(word=target))
        if "form-of" not in sense.tags:
            sense.tags.append("form-of")
def extract_binêre_el_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Mark the entry as an obsolete alternative of the template's target.

    The first positional argument is added to ``alt_of`` of the entry's last
    sense together with the "alt-of" and "obsolete" tags. When the entry has
    no sense yet, a new sense tagged "no-gloss" is created and appended.
    """
    target = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if target == "":
        return

    has_sense = len(word_entry.senses) > 0
    sense = word_entry.senses[-1] if has_sense else Sense(tags=["no-gloss"])
    sense.alt_of.append(AltForm(word=target))
    sense.tags.extend(["alt-of", "obsolete"])
    if not has_sense:
        word_entry.senses.append(sense)