Coverage for src/wiktextract/extractor/ku/translation.py: 54%
104 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1import re
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import Translation, WordEntry
def is_translation_page(title: str) -> bool:
    """Return True when ``title`` matches ``/Werger`` plus optional digits at its end."""
    found = re.search(r"/Werger(?:\d+)?$", title)
    return found is not None
def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    source: str = "",
    tags: list[str] | None = None,
    sense: str = "",
    from_trans_see: bool = False,
) -> None:
    """Collect the translations found under ``level_node`` into ``word_entry``.

    Only the LIST, TEMPLATE, ITALIC and BOLD children of ``level_node`` are
    inspected:

    * a ``werger-ser`` template sets the current sense (argument 1) and the
      sense index (argument 2, when it consists of digits only);
    * every item of a list is passed to ``extract_translation_list_item``;
    * links inside italic/bold text whose cleaned text is a translation page
      title are followed with ``extract_translation_page``;
    * ``werger-bnr`` / ``bnr-werger`` templates are passed to
      ``extract_trans_see_template`` unless ``from_trans_see`` is set.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list receives the results.
        level_node: Section node to scan.
        source: Value stored in the ``source`` field of each translation.
        tags: Tags applied to the translations of this section; ``None``
            means no tags.
        sense: Initial sense text.
        from_trans_see: True when the section was reached through
            ``extract_trans_see_template``.
    """
    # ``None`` instead of ``[]`` as the default: a list default is created
    # once and would be shared by every call that omits ``tags``.
    tags = [] if tags is None else tags
    sense_index = 0
    for node in level_node.find_child(
        NodeKind.LIST | NodeKind.TEMPLATE | NodeKind.ITALIC | NodeKind.BOLD
    ):
        if (
            isinstance(node, TemplateNode)
            and node.template_name == "werger-ser"
            # A non-empty sense handed over by a trans-see template is kept
            # as is; the headers of the referenced page do not replace it.
            and not (sense != "" and from_trans_see)
        ):
            sense = clean_node(wxr, None, node.template_parameters.get(1, ""))
            sense_i_str = clean_node(
                wxr, None, node.template_parameters.get(2, "")
            )
            # Reset to 0 when the header has no numeric index so that the
            # index of an earlier header is not attached to this new sense.
            sense_index = (
                int(sense_i_str) if re.fullmatch(r"\d+", sense_i_str) else 0
            )
        elif node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    list_item,
                    sense,
                    sense_index,
                    source,
                    tags=tags,
                )
        elif node.kind in (NodeKind.ITALIC | NodeKind.BOLD):
            for link_node in node.find_child(NodeKind.LINK):
                link_str = clean_node(wxr, None, link_node)
                if is_translation_page(link_str):
                    extract_translation_page(wxr, word_entry, link_str)
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in ("werger-bnr", "bnr-werger")
            # Do not follow a trans-see template from a page that was itself
            # reached through one.
            and not from_trans_see
        ):
            extract_trans_see_template(wxr, word_entry, node)
def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
    source: str,
    tags: list[str] | None = None,
) -> None:
    """Extract the translations of one list item into ``word_entry``.

    The text before the first ``:`` is taken as the language name. After it,
    ``W`` / ``W+`` / ``W-`` templates are handled by ``extract_w_template``
    and plain links become ``Translation`` entries. A ``Z`` template provides
    the language code, and nested lists are processed recursively with the
    same sense, source and tags.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list receives the results.
        list_item: List item node to scan.
        sense: Sense text stored on each translation.
        sense_index: Sense index stored on each translation.
        source: Value stored in the ``source`` field of each translation.
        tags: Tags applied to every translation of this item; ``None`` means
            no tags.
    """
    # ``None`` instead of ``[]`` as the default: a list default is created
    # once and would be shared by every call that omits ``tags``.
    tags = [] if tags is None else tags
    lang_name = "unknown"
    lang_code = "unknown"
    before_colon = True
    for index, node in enumerate(list_item.children):
        if isinstance(node, str) and ":" in node and lang_name == "unknown":
            # Language name = everything up to the first colon, including
            # any nodes that precede this string.
            lang_name = clean_node(
                wxr,
                None,
                list_item.children[:index] + [node[: node.index(":")]],
            )
            if lang_name == "":
                lang_name = "unknown"
            before_colon = False
        elif isinstance(node, TemplateNode) and node.template_name == "Z":
            lang_code = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
        elif isinstance(node, TemplateNode) and node.template_name in (
            "W",
            "W+",
            "W-",
        ):
            # Forward the section tags here as well, as is done for plain
            # links and nested lists below. A copy is passed because
            # extract_w_template appends gender/number tags to the list.
            extract_w_template(
                wxr,
                word_entry,
                node,
                sense,
                sense_index,
                lang_name,
                source,
                tags=list(tags),
            )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    child_list_item,
                    sense,
                    sense_index,
                    source,
                    tags=tags,
                )
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LINK
            # Links before the colon belong to the language name.
            and not before_colon
        ):
            if lang_code in ["", "unknown"]:
                # No usable code from a ``Z`` template: look it up by name.
                new_code = name_to_code(lang_name, "ku")
                if new_code != "":
                    lang_code = new_code
            tr_data = Translation(
                word=clean_node(wxr, None, node),
                lang=lang_name,
                lang_code=lang_code,
                sense=sense,
                sense_index=sense_index,
                source=source,
                tags=list(tags),
            )
            if tr_data.word != "":
                word_entry.translations.append(tr_data)
def extract_w_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    sense: str,
    sense_index: int,
    lang_name: str,
    source: str,
    tags: list[str] | None = None,
) -> None:
    """Build a ``Translation`` from a ``W`` template and add it to ``word_entry``.

    Template arguments used: 1 is the language code (``"unknown"`` when
    absent), ``cuda`` or else 2 is the word, and 3 and 4 are looked up in a
    table of gender/number tags. The template is also expanded, and the text
    of the first ``span`` whose class contains ``Latn`` is stored as the
    romanization when it is non-empty and differs from the word. Nothing is
    appended when the word is empty.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list receives the result.
        t_node: The template node.
        sense: Sense text stored on the translation.
        sense_index: Sense index stored on the translation.
        lang_name: Language name stored on the translation.
        source: Value stored in the ``source`` field.
        tags: Initial tags of the translation; ``None`` means no tags. The
            list passed in is not modified.
    """
    # https://ku.wiktionary.org/wiki/Şablon:W
    tr_data = Translation(
        lang=lang_name,
        lang_code=clean_node(
            wxr, None, t_node.template_parameters.get(1, "unknown")
        ),
        word=clean_node(
            wxr,
            None,
            t_node.template_parameters.get(
                "cuda", t_node.template_parameters.get(2, "")
            ),
        ),
        source=source,
        # Always hand the model a fresh list: tags are appended below and
        # must never end up in the caller's list or in a shared default.
        tags=[] if tags is None else list(tags),
        sense=sense,
        sense_index=sense_index,
    )
    # Values of template arguments 3 and 4 mapped to the tags they add.
    tag_args: dict[str, tuple[str, ...]] = {
        "n": ("masculine",),
        "m": ("feminine",),
        "f": ("feminine",),
        "nt": ("gender-neutral",),
        "mn": ("feminine", "masculine"),
        "g": ("common-gender",),
        "p": ("plural",),
        "y": ("singular",),
    }
    for tag_arg in (3, 4):
        tag_arg_value = clean_node(
            wxr, None, t_node.template_parameters.get(tag_arg, "")
        )
        tr_data.tags.extend(tag_args.get(tag_arg_value, ()))
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html("span"):
        if "Latn" in span_tag.attrs.get("class", ""):
            roman = clean_node(wxr, None, span_tag)
            if roman not in ["", tr_data.word]:
                tr_data.roman = roman
                break
    if tr_data.word != "":
        word_entry.translations.append(tr_data)
def extract_translation_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract the ``Werger`` section of the page ``page_title``.

    The page is fetched with ``wxr.wtp.get_page(page_title, 0)``. Nothing
    happens when the page or its body is missing, or when it has no section
    titled ``Werger``. Extracted translations use ``page_title`` as source.
    """
    page = wxr.wtp.get_page(page_title, 0)
    if page is not None and page.body is not None:
        parsed_root = wxr.wtp.parse(page.body)
        section = find_subpage_section(wxr, parsed_root, "Werger")
        if section is not None:
            extract_translation_section(
                wxr, word_entry, section, source=page_title
            )
def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Follow a ``werger-bnr`` template to the pages holding the translations.

    Argument 1 is used as the sense. The target page titles are the
    consecutive arguments 2 to 10 when argument 2 is present, otherwise
    argument 1 itself. For each title (with any ``#...`` part removed) the
    ``Werger`` section of that page is extracted with ``from_trans_see=True``
    and the page title as source.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list receives the results.
        t_node: The template node.
    """
    # https://ku.wiktionary.org/wiki/Şablon:werger-bnr
    sense = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    page_titles = []
    if 2 in t_node.template_parameters:
        for index in range(2, 11):
            if index not in t_node.template_parameters:
                break
            page_titles.append(
                clean_node(wxr, None, t_node.template_parameters[index])
            )
    else:
        page_titles.append(sense)
    for page_title in page_titles:
        if "#" in page_title:
            page_title = page_title[: page_title.index("#")]
        page = wxr.wtp.get_page(page_title)
        # Skip only this title when the page or its text is unavailable.
        # Returning here would drop all remaining titles, and parsing a
        # ``None`` body is guarded the same way as in
        # extract_translation_page.
        if page is None or page.body is None:
            continue
        root = wxr.wtp.parse(page.body)
        target_node = find_subpage_section(wxr, root, "Werger")
        if target_node is not None:
            extract_translation_section(
                wxr,
                word_entry,
                target_node,
                source=page_title,
                sense=sense,
                from_trans_see=True,
            )
def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_section: str
) -> WikiNode | None:
    """Return the first level node under ``root`` titled ``target_section``.

    Level nodes are visited through ``root.find_child_recursively`` and the
    cleaned text of their ``largs`` is compared to ``target_section``.
    ``None`` is returned when no title matches.
    """
    matching_sections = (
        candidate
        for candidate in root.find_child_recursively(LEVEL_KIND_FLAGS)
        if clean_node(wxr, None, candidate.largs) == target_section
    )
    # The generator is lazy, so the search stops at the first match.
    return next(matching_sections, None)