Coverage for src/wiktextract/extractor/ku/translation.py: 63%
87 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1import re
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import Translation, WordEntry
def is_translation_page(title: str) -> bool:
    """Tell whether *title* names a translation subpage.

    A title qualifies when it ends in ``/Werger``, optionally followed by
    one or more digits (for example ``av/Werger`` or ``av/Werger2``).
    """
    suffix_match = re.search(r"/Werger(?:\d+)?$", title)
    return suffix_match is not None
def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    source: str = "",
    tags: list[str] = [],
) -> None:
    """Collect translations from the children of a translation section.

    Direct children of ``level_node`` are handled by type:

    * a ``werger-ser`` template sets the current sense (argument 1) and,
      when argument 2 consists only of digits, the current sense index;
    * every item of a list is passed to ``extract_translation_list_item``
      together with the current sense, sense index, ``source`` and ``tags``;
    * links inside italic or bold nodes, and argument 1 of a ``werger-bnr``
      template, are followed with ``extract_translation_page`` when they
      name a translation page (see ``is_translation_page``).

    ``tags`` is only passed along here, never modified.
    """
    current_sense = ""
    current_sense_index = 0
    wanted_kinds = (
        NodeKind.LIST | NodeKind.TEMPLATE | NodeKind.ITALIC | NodeKind.BOLD
    )
    for child in level_node.find_child(wanted_kinds):
        is_template = isinstance(child, TemplateNode)

        if is_template and child.template_name == "werger-ser":
            # Header template: remember the sense for the lists that follow.
            current_sense = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
            index_text = clean_node(
                wxr, None, child.template_parameters.get(2, "")
            )
            if re.fullmatch(r"\d+", index_text):
                current_sense_index = int(index_text)
            continue

        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    item,
                    current_sense,
                    current_sense_index,
                    source,
                    tags=tags,
                )
            continue

        if child.kind in (NodeKind.ITALIC | NodeKind.BOLD):
            # Emphasised text may link to a separate translation page.
            for link in child.find_child(NodeKind.LINK):
                target = clean_node(wxr, None, link)
                if is_translation_page(target):
                    extract_translation_page(wxr, word_entry, target)
            continue

        if is_template and child.template_name == "werger-bnr":
            # Template whose first argument is the translation page title.
            target = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
            if is_translation_page(target):
                extract_translation_page(wxr, word_entry, target)
def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
    source: str,
    tags: list[str] = [],
) -> None:
    """Extract the translations found in one list item.

    The text before the first ``:`` in the item is taken as the language
    name. After that, children are handled by type:

    * a ``Z`` template supplies the language code (argument 1);
    * ``W``, ``W+`` and ``W-`` templates are handled by
      ``extract_w_template``;
    * nested lists are processed recursively, item by item;
    * plain links that appear after the colon become ``Translation``
      entries appended to ``word_entry.translations``.

    Args:
        wxr: Extraction context, passed to ``clean_node`` and helpers.
        word_entry: Entry whose ``translations`` list receives the results.
        list_item: The list item node to scan.
        sense: Sense text attached to every translation created here.
        sense_index: Sense number attached to every translation created here.
        source: Source value attached to every translation created here.
        tags: Tags attached to every translation created here. The list is
            never modified; each consumer receives its own copy.
    """
    lang_name = "unknown"
    lang_code = "unknown"
    before_colon = True
    for index, node in enumerate(list_item.children):
        if isinstance(node, str) and ":" in node and lang_name == "unknown":
            # Everything up to the first colon is the language name; it may
            # span earlier sibling nodes, so clean them together.
            lang_name = (
                clean_node(
                    wxr,
                    None,
                    list_item.children[:index] + [node[: node.index(":")]],
                )
                or "unknown"
            )
            before_colon = False
        elif isinstance(node, TemplateNode) and node.template_name == "Z":
            lang_code = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
        elif isinstance(node, TemplateNode) and node.template_name in [
            "W",
            "W+",
            "W-",
        ]:
            # Forward the tags as well, so template translations carry the
            # same tags as link translations and nested items. A copy is
            # passed so the callee cannot alter the caller's list.
            extract_w_template(
                wxr,
                word_entry,
                node,
                sense,
                sense_index,
                lang_name,
                source,
                tags=list(tags),
            )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    child_list_item,
                    sense,
                    sense_index,
                    source,
                    tags=tags,
                )
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LINK
            and not before_colon
        ):
            if lang_code in ["", "unknown"]:
                # No usable code from a "Z" template: look it up by name.
                new_code = name_to_code(lang_name, "ku")
                if new_code != "":
                    lang_code = new_code
            tr_data = Translation(
                word=clean_node(wxr, None, node),
                lang=lang_name,
                lang_code=lang_code,
                sense=sense,
                sense_index=sense_index,
                source=source,
                tags=list(tags),
            )
            if tr_data.word != "":
                word_entry.translations.append(tr_data)
def extract_w_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    sense: str,
    sense_index: int,
    lang_name: str,
    source: str,
    tags: list[str] = [],
) -> None:
    """Build a translation from a ``W``/``W+``/``W-`` template.

    Template argument 1 is the language code, the ``cuda`` argument (or,
    failing that, argument 2) is the translated word, and arguments 3 and 4
    may hold gender/number abbreviations that are mapped to tags. The
    template is also expanded, and the first ``span`` whose class contains
    ``Latn`` and whose text is non-empty and differs from the word is stored
    as the romanization. The translation is appended to
    ``word_entry.translations`` unless its word is empty.

    Args:
        wxr: Extraction context used for cleaning and expanding nodes.
        word_entry: Entry whose ``translations`` list receives the result.
        t_node: The template node to read.
        sense: Sense text stored on the translation.
        sense_index: Sense number stored on the translation.
        lang_name: Language name stored on the translation.
        source: Source value stored on the translation.
        tags: Initial tags for the translation. The list is copied, so
            neither the caller's list nor the shared default is modified.
    """
    # https://ku.wiktionary.org/wiki/Şablon:W
    tr_data = Translation(
        lang=lang_name,
        lang_code=clean_node(
            wxr, None, t_node.template_parameters.get(1, "unknown")
        ),
        word=clean_node(
            wxr,
            None,
            t_node.template_parameters.get(
                "cuda", t_node.template_parameters.get(2, "")
            ),
        ),
        # Record the sense the same way plain-link translations do.
        sense=sense,
        sense_index=sense_index,
        source=source,
        # Copy: tags are appended below and must not leak into the
        # caller's list or the mutable default.
        tags=list(tags),
    )
    tag_args = {
        "n": ["masculine"],
        "m": ["feminine"],
        "f": ["feminine"],
        "nt": ["gender-neutral"],
        "mn": ["feminine", "masculine"],
        "g": ["common-gender"],
        "p": ["plural"],
        "y": ["singular"],
    }
    for tag_arg in [3, 4]:
        tag_arg_value = clean_node(
            wxr, None, t_node.template_parameters.get(tag_arg, "")
        )
        tr_data.tags.extend(tag_args.get(tag_arg_value, []))
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html("span"):
        if "Latn" in span_tag.attrs.get("class", ""):
            roman = clean_node(wxr, None, span_tag)
            if roman not in ["", tr_data.word]:
                tr_data.roman = roman
                break
    if tr_data.word != "":
        word_entry.translations.append(tr_data)
def extract_translation_page(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Pull translations for ``word_entry`` from a separate page.

    The page ``page_title`` is fetched and parsed. Within each level-2
    section whose heading equals ``word_entry.lang``, every nested section
    headed ``Werger`` is passed to ``extract_translation_section`` with
    ``page_title`` as the source. Nothing happens when the page or its
    body is missing.
    """
    translation_page = wxr.wtp.get_page(page_title, 0)
    if translation_page is None:
        return
    if translation_page.body is None:
        return

    tree = wxr.wtp.parse(translation_page.body)
    for language_section in tree.find_child(NodeKind.LEVEL2):
        heading = clean_node(wxr, None, language_section.largs)
        if heading == word_entry.lang:
            for subsection in language_section.find_child_recursively(
                LEVEL_KIND_FLAGS
            ):
                subsection_title = clean_node(wxr, None, subsection.largs)
                if subsection_title == "Werger":
                    extract_translation_section(
                        wxr, word_entry, subsection, page_title
                    )