Coverage for src/wiktextract/extractor/ms/translation.py: 57%
96 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from wikitextprocessor.parser import (
2 LEVEL_KIND_FLAGS,
3 LevelNode,
4 NodeKind,
5 TemplateNode,
6 WikiNode,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from .models import Translation, WordEntry
12from .tags import translate_raw_tags
def extract_translation_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    source: str = "",
    from_trans_see: bool = False,
) -> None:
    """Gather the translations under ``level_node`` and attach them to entries.

    The direct children of ``level_node`` are scanned in order:

    * a ``ter-atas``/``teratas``/``trans-top`` template replaces the current
      ``sense`` with its cleaned text, except when this call comes from a
      "see" template (``from_trans_see``) and a sense was already supplied;
    * a list node has each of its list items handed to
      ``extract_translation_list_item``;
    * a ``ter-lihat``/``trans-see`` template is followed through
      ``extract_trans_see_template``, unless this call itself comes from
      such a template.

    The collected translations (those with a non-empty ``word``) and the
    categories gathered along the way are then added to:

    * ``base_data`` when ``page_data`` is empty or its last entry has a
      different ``lang_code`` than ``base_data``;
    * otherwise, for a level-3 section, every entry of ``page_data`` whose
      ``lang_code`` equals that of the last entry;
    * otherwise only the last entry of ``page_data``.
    """
    sense_templates = ("ter-atas", "teratas", "trans-top")
    see_templates = ("ter-lihat", "trans-see")
    collected = []
    cats = {}

    for child in level_node.children:
        is_template = isinstance(child, TemplateNode)
        if (
            is_template
            and child.template_name in sense_templates
            and (sense == "" or not from_trans_see)
        ):
            # Passing ``cats`` lets clean_node record categories emitted by
            # the header template.
            sense = clean_node(wxr, cats, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                collected += extract_translation_list_item(
                    wxr, item, sense, source
                )
        elif (
            is_template
            and child.template_name in see_templates
            and not from_trans_see
        ):
            extract_trans_see_template(wxr, page_data, base_data, child)

    # Decide which entries receive the results, then attach them in one place.
    if not page_data or page_data[-1].lang_code != base_data.lang_code:
        targets = [base_data]
    elif level_node.kind == NodeKind.LEVEL3:
        current_lang = page_data[-1].lang_code
        targets = [
            entry for entry in page_data if entry.lang_code == current_lang
        ]
    else:
        targets = [page_data[-1]]

    for entry in targets:
        entry.categories.extend(cats.get("categories", []))
        for translation in collected:
            if translation.word == "":
                continue
            entry.translations.append(translation)
            entry.categories.extend(translation.categories)
def extract_translation_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str, source: str
) -> list[Translation]:
    """Extract the translations contained in one list item.

    Children of ``list_item`` are processed in order:

    * the first plain-text child ending in ``:`` supplies the language name
      (``"unknown"`` is kept if the text is empty after stripping);
    * ``t``/``trad``/``tø``/``t-``/``t+`` templates each produce one
      translation via ``extract_t_template``;
    * qualifier templates (``penerang``, ``qualifier``, ``i``, ``q``,
      ``qual``) add a raw tag to the most recently extracted translation;
    * nested lists are processed recursively, one list item at a time.

    Args:
        wxr: Extraction context passed through to the helpers.
        list_item: The list item node to scan.
        sense: Sense text stored on every translation produced.
        source: Source page title stored on every translation produced.

    Returns:
        The translations found, in document order. Entries whose ``word`` is
        empty are not filtered out here.
    """
    tr_list: list[Translation] = []
    lang_name = "unknown"
    for node in list_item.children:
        if (
            isinstance(node, str)
            and node.strip().endswith(":")
            and lang_name == "unknown"
        ):
            lang_name = node.strip(": ") or "unknown"
        elif isinstance(node, TemplateNode) and node.template_name in [
            "t",
            "trad",
            "tø",
            "t-",
            "t+",
        ]:
            tr_list.append(
                extract_t_template(wxr, node, sense, lang_name, source)
            )
        elif (
            isinstance(node, TemplateNode)
            and node.template_name
            in ["penerang", "qualifier", "i", "q", "qual"]
            and len(tr_list) > 0
        ):
            # A qualifier applies to the translation that precedes it.
            raw_tag = clean_node(wxr, None, node).strip("() ")
            if raw_tag != "":
                tr_list[-1].raw_tags.append(raw_tag)
                translate_raw_tags(tr_list[-1])
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                tr_list.extend(
                    extract_translation_list_item(
                        wxr, child_list_item, sense, source
                    )
                )
    return tr_list
def extract_t_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    lang_name: str,
    source: str,
) -> Translation:
    """Build a ``Translation`` from a single translation template.

    The language code comes from the template's first positional argument
    (``"unknown"`` when it cleans to an empty string). The template is then
    expanded and its ``<span>`` elements are inspected: the first span whose
    ``lang`` attribute equals the language code provides the word, a span
    with class ``gender`` provides raw tags from its ``<abbr>`` children, and
    a span whose class contains ``tr`` provides the romanization.

    When a word was found, raw tags are translated and the link nodes of the
    expanded template are cleaned with the translation as the data target.
    The translation is returned even if no word was found (``word == ""``).
    """
    code_arg = t_node.template_parameters.get(1, "")
    lang_code = clean_node(wxr, None, code_arg) or "unknown"
    translation = Translation(
        word="",
        lang=lang_name,
        lang_code=lang_code,
        sense=sense,
        source=source,
    )

    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    ignored_genders = ("", "?", "jantina tidak diberi")

    for span in expanded.find_html("span"):
        css_class = span.attrs.get("class", "")
        if span.attrs.get("lang") == lang_code and translation.word == "":
            translation.word = clean_node(wxr, None, span)
        elif css_class == "gender":
            for abbr in span.find_html("abbr"):
                gender = clean_node(wxr, None, abbr)
                if gender in ignored_genders:
                    continue
                translation.raw_tags.append(gender)
        elif "tr" in css_class:
            translation.roman = clean_node(wxr, None, span)

    if translation.word == "":
        return translation

    translate_raw_tags(translation)
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, translation, link)
    return translation
def extract_trans_see_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Follow a ``ter-lihat``/``trans-see`` template to the referenced pages.

    The first template argument is used as the sense. Arguments 2 through 10
    (stopping at the first missing index) are the page titles to look up; if
    argument 2 is absent, the first argument doubles as the page title.

    For each title, any ``#fragment`` is dropped, the page is fetched and
    parsed, and its "Terjemahan" section (if present) is processed with
    ``extract_translation_section`` using ``from_trans_see=True``.

    Args:
        wxr: Extraction context used to fetch and parse pages.
        page_data: Entries extracted so far for the current page.
        base_data: Base entry for the current language section.
        t_node: The "see translations" template node.
    """
    # https://ms.wiktionary.org/wiki/Templat:ter-lihat
    params = t_node.template_parameters
    sense = clean_node(wxr, None, params.get(1, ""))
    page_titles = []
    if 2 in params:
        for index in range(2, 11):
            if index not in params:
                break
            page_titles.append(clean_node(wxr, None, params[index]))
    else:
        page_titles.append(clean_node(wxr, None, params.get(1, "")))

    for page_title in page_titles:
        # Only the page name is needed; drop any section anchor.
        page_title = page_title.partition("#")[0]
        page = wxr.wtp.get_page(page_title)
        if page is None:
            # A missing page must not prevent the remaining titles from
            # being processed.
            continue
        root = wxr.wtp.parse(page.body)
        target_node = find_subpage_section(wxr, root, "Terjemahan")
        if target_node is not None:
            extract_translation_section(
                wxr,
                page_data,
                base_data,
                target_node,
                sense=sense,
                source=page_title,
                from_trans_see=True,
            )
def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_section: str
) -> WikiNode | None:
    """Return the first section under ``root`` titled ``target_section``.

    Level nodes are searched recursively; a node matches when its cleaned
    heading arguments equal ``target_section`` exactly. ``None`` is returned
    when no section matches.
    """
    matching_sections = (
        section
        for section in root.find_child_recursively(LEVEL_KIND_FLAGS)
        if clean_node(wxr, None, section.largs) == target_section
    )
    return next(matching_sections, None)