Coverage for src/wiktextract/extractor/ku/tewandin.py: 94%
72 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1import re
2from itertools import count
4from wikitextprocessor.parser import LEVEL_KIND_FLAGS, NodeKind, TemplateNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .tags import translate_raw_tags
def extract_tewandin_page(
    wxr: WiktextractContext, word_entry: WordEntry, title: str
) -> None:
    """Collect inflected forms for ``word_entry`` from an inflection page.

    Fetches the page ``title`` from namespace id 106, parses its body and
    passes every template found directly under the root, and directly under
    each section heading, to ``extract_tewandin_template``.  Returns without
    doing anything when the page or its body is missing.

    Args:
        wxr: Extraction context; its ``wtp`` processor fetches and parses
            the page.
        word_entry: Entry passed on to the template extractor.
        title: Title of the page to read; also passed on as the source
            page name.
    """
    # NOTE(review): 106 is presumably the id of the "Tewandin" namespace on
    # ku.wiktionary — confirm against the wiki's namespace configuration.
    page = wxr.wtp.get_page(title, 106)
    if page is None or page.body is None:
        return
    root = wxr.wtp.parse(page.body)
    # One de-duplication set per call, shared by every template on this
    # page.  Passing it explicitly keeps the "already seen" state scoped to
    # this page instead of depending on the callee's default argument.
    form_set: set[str] = set()
    for t_node in root.find_child(NodeKind.TEMPLATE):
        extract_tewandin_template(
            wxr, word_entry, t_node, title, form_set=form_set
        )
    for level_node in root.find_child(LEVEL_KIND_FLAGS):
        for t_node in level_node.find_child(NodeKind.TEMPLATE):
            extract_tewandin_template(
                wxr, word_entry, t_node, title, form_set=form_set
            )
def extract_tewandin_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    source_page: str,
    tab_name: str = "",
    # Quoted so the union syntax is never evaluated at definition time.
    form_set: "set[str] | None" = None,
) -> None:
    """Dispatch an inflection template to the matching extractor.

    ``ku-tewandin`` templates go to ``extract_ku_tewandin_template`` and
    ``etîket tewandin`` templates go to
    ``extract_etîket_tewandin_template``; any other template is ignored.

    Args:
        wxr: Extraction context.
        word_entry: Entry passed on to the selected extractor.
        t_node: Template node to inspect.
        source_page: Name of the page the template came from.
        tab_name: Optional tab label forwarded to the ``ku-tewandin``
            extractor.
        form_set: Set of form strings already seen, forwarded to the
            ``ku-tewandin`` extractor.  When omitted, a fresh set is created
            for this call.
    """
    # A ``set()`` default would be created once and shared by every call,
    # so forms seen for one word would be remembered for all later ones.
    # ``is None`` (not truthiness) so that a caller's empty set is kept
    # and still shared between that caller's calls.
    if form_set is None:
        form_set = set()

    if t_node.template_name == "ku-tewandin":
        extract_ku_tewandin_template(
            wxr,
            word_entry,
            t_node,
            source_page,
            tab_name=tab_name,
            form_set=form_set,
        )
    elif t_node.template_name == "etîket tewandin":
        extract_etîket_tewandin_template(wxr, word_entry, t_node, source_page)
def extract_ku_tewandin_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    source_page: str,
    tab_name: str = "",
    # Quoted so the union syntax is never evaluated at definition time.
    form_set: "set[str] | None" = None,
) -> None:
    """Extract forms from the tables produced by a ``ku-tewandin`` template.

    The template is expanded and re-parsed, then every table in the result
    is walked row by row.  Each non-empty data cell becomes a ``Form`` whose
    raw tags are, in order: ``tab_name`` (if not empty), the row header (if
    any) and the text of every column header whose column span covers the
    cell.  A form is appended to ``word_entry.forms`` only if it differs
    from the current page title and is not already in ``form_set``.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``forms`` list is extended.
        t_node: The ``ku-tewandin`` template node.
        source_page: Stored in the ``source`` field of every created form.
        tab_name: Optional label added as the first raw tag of every form.
        form_set: Form strings already emitted; updated in place.  When
            omitted, a fresh set is created for this call.
    """
    # https://ku.wiktionary.org/wiki/Şablon:ku-tewandin
    # NOTE(review): imported locally, presumably to avoid a circular import
    # with ``form_table`` — confirm before moving it to module level.
    from .form_table import TableHeader

    # The previous default was ``{}``, which is a dict: ``form_set.add``
    # raised AttributeError whenever the default was used.  Use a None
    # sentinel so each call without an explicit set gets its own real set.
    if form_set is None:
        form_set = set()

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Return value is unused; called with ``word_entry`` for whatever side
    # effects ``clean_node`` has on the entry (presumably category
    # collection — verify against ``clean_node``).
    clean_node(wxr, word_entry, expanded_node)
    for table in expanded_node.find_child_recursively(NodeKind.TABLE):
        col_headers = []
        last_row_has_data_cell = False
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            col_index = 0
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                cell_str = clean_node(wxr, None, cell)
                # NOTE(review): empty cells are skipped without advancing
                # ``col_index`` — confirm this matches the table layout.
                if cell_str == "":
                    continue
                colspan = 1
                colspan_str = cell.attrs.get("colspan", "1")
                # Only trust a purely numeric colspan; otherwise keep 1.
                if re.fullmatch(r"\d+", colspan_str):
                    colspan = int(colspan_str)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if row.contain_node(NodeKind.TABLE_CELL):
                        # Header cell in a row that also has data cells:
                        # it labels the row.
                        row_header = cell_str
                    else:
                        # Header-only row: a new header row that follows
                        # data rows replaces the previous column headers.
                        if last_row_has_data_cell:
                            col_headers.clear()
                        col_headers.append(
                            TableHeader(
                                text=cell_str,
                                col_index=col_index,
                                colspan=colspan,
                            )
                        )
                        col_index += colspan
                    last_row_has_data_cell = False
                else:
                    last_row_has_data_cell = True
                    form = Form(form=cell_str, source=source_page)
                    if tab_name != "":
                        form.raw_tags.append(tab_name)
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    for header in col_headers:
                        if (
                            col_index >= header.col_index
                            and col_index < header.col_index + header.colspan
                        ):
                            form.raw_tags.append(header.text)
                    if form.form != wxr.wtp.title and form.form not in form_set:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                        form_set.add(form.form)
                    col_index += colspan
def extract_etîket_tewandin_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    source_page: str,
) -> None:
    """Extract forms from every tab of an ``etîket tewandin`` template.

    Tabs are read from the numbered arguments ``etîket1``/``naverok1``,
    ``etîket2``/``naverok2``, ... until the first missing ``etîketN``.  The
    cleaned ``etîketN`` value is used as the tab name, and every template
    found at the top level of ``naverokN`` is passed to
    ``extract_tewandin_template`` with that tab name.  A single set of seen
    forms is shared by all tabs of this template.

    Args:
        wxr: Extraction context.
        word_entry: Entry passed on to the template extractor.
        t_node: The ``etîket tewandin`` template node.
        source_page: Name of the page the template came from.
    """
    # https://ku.wiktionary.org/wiki/Şablon:etîket_tewandin
    form_set = set()
    for num in count(1):
        tab_name_arg = f"etîket{num}"
        if tab_name_arg not in t_node.template_parameters:
            break
        tab_content_arg = f"naverok{num}"
        if tab_content_arg not in t_node.template_parameters:
            # A label without a matching content argument has nothing to
            # extract; skip it instead of failing with KeyError.
            continue
        tab_name = clean_node(
            wxr, None, t_node.template_parameters[tab_name_arg]
        )
        tab_arg = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                t_node.template_parameters[tab_content_arg]
            )
        )
        for node in tab_arg.find_child(NodeKind.TEMPLATE):
            extract_tewandin_template(
                wxr,
                word_entry,
                node,
                source_page,
                tab_name=tab_name,
                form_set=form_set,
            )