Coverage for src/wiktextract/extractor/ja/translation.py: 89%
119 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from mediawiki_langcodes import name_to_code
2from wikitextprocessor.parser import (
3 LEVEL_KIND_FLAGS,
4 LevelNode,
5 NodeKind,
6 TemplateNode,
7 WikiNode,
8)
10from ...page import clean_node
11from ...wxr_context import WiktextractContext
12from .models import Translation, WordEntry
13from .tags import translate_raw_tags
def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    is_subpage: bool = False,
) -> None:
    """Collect the translations of one section into ``word_entry``.

    Walks the template and list children of ``level_node`` in order:

    * ``trans-top`` replaces the current sense with its first argument,
      unless a non-empty ``sense`` was passed in together with
      ``is_subpage=True``.
    * ``trans-see`` / ``trans-see2`` are handed to
      ``extract_trans_see_template``, but only when ``is_subpage`` is false.
    * each list item of a list is handed to
      ``process_translation_list_item`` with the current sense and an empty
      language name and code.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list receives the results.
        level_node: Section node to scan.
        sense: Initial sense text applied to list items.
        is_subpage: True when the section was loaded from another page by
            ``extract_trans_see_template``.
    """
    wanted_kinds = NodeKind.TEMPLATE | NodeKind.LIST
    for child in level_node.find_child(wanted_kinds):
        if isinstance(child, TemplateNode):
            template_name = child.template_name
            # A sense supplied for a subpage wins over the subpage's own
            # "trans-top" headings.
            if template_name == "trans-top" and (
                sense == "" or not is_subpage
            ):
                sense = clean_node(
                    wxr, None, child.template_parameters.get(1, "")
                )
                continue
            if (
                template_name in ("trans-see", "trans-see2")
                and not is_subpage
            ):
                extract_trans_see_template(wxr, word_entry, child)
                continue
        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                process_translation_list_item(
                    wxr, word_entry, item, sense, "", ""
                )
def process_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense_text: str,
    lang_name: str,
    lang_code: str,
) -> None:
    """Extract the translations of one list item into ``word_entry``.

    The children of ``list_item`` are scanned in order. The first plain-text
    child containing ":" marks the end of the language label: everything
    before that colon is cleaned into a language name and looked up with
    ``name_to_code``. A template seen before the colon is also treated as
    the language label. After the colon, translation templates, ``archar``,
    ``zh-ts`` and plain links each add ``Translation`` objects, and gender /
    number templates add a raw tag to the most recent translation.

    Nested lists are processed recursively with the language resolved for
    this item as the inherited default.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list receives the results.
        list_item: The list item node to process.
        sense_text: Sense stored on every translation created here.
        lang_name: Language name inherited from the parent item ("" if none).
        lang_code: Language code inherited from the parent item ("" if none).
    """
    after_colon = False
    last_tr: Translation | None = None
    for node_index, node in enumerate(list_item.children):
        if isinstance(node, str) and ":" in node and not after_colon:
            after_colon = True
            # The language label is every node before this one plus the
            # text that precedes the colon.
            lang_nodes = list_item.children[:node_index]
            lang_nodes.append(node[: node.index(":")])
            new_lang_name = clean_node(wxr, None, lang_nodes)
            new_lang_code = name_to_code(new_lang_name, "ja")
            # Keep the inherited language when the label is not a known
            # language name, unless nothing was inherited at all.
            if new_lang_code != "" or lang_name == "":
                lang_code = new_lang_code
                lang_name = new_lang_name
        elif isinstance(node, TemplateNode):
            template_name = node.template_name.lower()
            if not after_colon:
                # A template in label position names the language.
                lang_name = clean_node(wxr, None, node)
                if node.template_name == "T":
                    lang_code = node.template_parameters.get(1, "")
                else:
                    lang_code = node.template_name
            elif template_name in (
                "t+",
                "t",
                "t-",
                "l",
                "lang",
                "tø",
                "t+check",
                "t-check",
            ):
                for tr_data in process_t_template(
                    wxr, word_entry, node, sense_text, lang_name, lang_code
                ):
                    last_tr = tr_data
            elif template_name == "archar":
                tr_data = Translation(
                    word=clean_node(wxr, None, node),
                    sense=sense_text,
                    lang_code=lang_code,
                    lang=lang_name,
                )
                word_entry.translations.append(tr_data)
                last_tr = tr_data
            elif (
                template_name
                in (
                    "m",
                    "f",
                    "p",
                    "n",
                    "c",
                    "s",
                    "mf",
                    "mpl",
                    "fpl",
                    "npl",
                    "inv",
                )
                and last_tr is not None
            ):
                # Gender/number marker: attach to the preceding translation.
                last_tr.raw_tags.append(clean_node(wxr, None, node))
                translate_raw_tags(last_tr)
            elif template_name == "zh-ts":
                last_tr = process_zh_ts_template(
                    wxr, word_entry, node, sense_text, lang_name, lang_code
                )
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LINK
            and after_colon
        ):
            tr_word = clean_node(wxr, None, node)
            if tr_word:
                tr_data = Translation(
                    word=tr_word,
                    sense=sense_text,
                    lang_code=lang_code,
                    lang=lang_name,
                )
                word_entry.translations.append(tr_data)
                last_tr = tr_data
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Only visit the direct items of the nested list: the recursive
            # call below descends into deeper lists itself. Walking all
            # descendants here would process deeper items more than once,
            # the extra times with this level's language instead of their
            # parent's.
            for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
                process_translation_list_item(
                    wxr,
                    word_entry,
                    nested_list_item,
                    sense_text,
                    lang_name,
                    lang_code,
                )
145T_TAGS = {
146 "m": "masculine",
147 "f": "feminine",
148 "mf": ["masculine", "feminine"],
149 "n": "neuter",
150 "c": "common",
151 "impf": "imperfective",
152 "pf": "perfective",
153 "s": "singular",
154 "p": "plural",
155}
def process_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    node: TemplateNode,
    sense_text: str,
    lang_name: str,
    lang_code: str,
) -> list[Translation]:
    """Turn one translation template into ``Translation`` objects.

    If the template's second argument contains a ``zh-l`` template, one
    translation is created per item returned by ``extract_zh_l_template``.
    Otherwise a single translation is built from the second argument (or
    the ``alt`` argument when present), with ``tr`` as romanization and the
    3rd/4th positional arguments mapped through ``T_TAGS``.

    Every translation created is appended to ``word_entry.translations``.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list receives the results.
        node: The translation template node.
        sense_text: Sense stored on every translation created here.
        lang_name: Language name stored on the translations.
        lang_code: Language code stored on the translations.

    Returns:
        The translations created by this call; empty when the template
        yields no word.
    """
    # Template documentation: https://ja.wiktionary.org/wiki/テンプレート:t
    second_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(2, ""))
    )
    for t_node in second_arg.find_child(NodeKind.TEMPLATE):
        if t_node.template_name == "zh-l":
            # Function-scope import, kept as in the original; presumably it
            # avoids a circular import with the linkage module.
            from .linkage import extract_zh_l_template

            tr_list = []
            for l_data in extract_zh_l_template(wxr, t_node):
                tr_data = Translation(
                    word=l_data.word,
                    tags=l_data.tags,
                    roman=l_data.roman,
                    # Record the sense here too, as every other code path
                    # that creates a Translation does.
                    sense=sense_text,
                    lang=lang_name,
                    lang_code=lang_code,
                )
                tr_list.append(tr_data)
                word_entry.translations.append(tr_data)
            return tr_list

    tr_word = clean_node(wxr, None, node.template_parameters.get(2, ""))
    if "alt" in node.template_parameters:
        # An explicit display form overrides the linked word.
        tr_word = clean_node(wxr, None, node.template_parameters["alt"])
    roman = clean_node(wxr, None, node.template_parameters.get("tr", ""))

    tags: list[str] = []
    for arg_index in (3, 4):
        if arg_index in node.template_parameters:
            tag_arg = clean_node(
                wxr, None, node.template_parameters.get(arg_index, "")
            )
            # Unknown markers map to an empty list and add nothing.
            tag_value = T_TAGS.get(tag_arg, [])
            if isinstance(tag_value, str):
                tags.append(tag_value)
            elif isinstance(tag_value, list):
                tags.extend(tag_value)

    if not tr_word:
        return []
    tr_data = Translation(
        word=tr_word,
        roman=roman,
        sense=sense_text,
        lang_code=lang_code,
        lang=lang_name,
        tags=tags,
    )
    word_entry.translations.append(tr_data)
    return [tr_data]
def process_zh_ts_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    node: TemplateNode,
    sense_text: str,
    lang_name: str,
    lang_code: str,
) -> Translation | None:
    """Create translations from the two arguments of a ``zh-ts`` template.

    The first positional argument is tagged "Traditional-Chinese" and the
    second "Simplified-Chinese". An argument that cleans to an empty string
    is skipped. Each translation is appended to ``word_entry.translations``.

    Returns:
        The last translation created, or ``None`` if both arguments were
        empty.
    """
    # Template documentation: https://ja.wiktionary.org/wiki/テンプレート:zh-ts
    script_tag_by_position = (
        (1, "Traditional-Chinese"),
        (2, "Simplified-Chinese"),
    )
    latest: Translation | None = None
    for position, script_tag in script_tag_by_position:
        word = clean_node(
            wxr, None, node.template_parameters.get(position, "")
        )
        if word == "":
            continue
        latest = Translation(
            word=word,
            sense=sense_text,
            lang_code=lang_code,
            lang=lang_name,
        )
        latest.tags = [script_tag]
        word_entry.translations.append(latest)
    return latest
def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Follow a ``trans-see`` / ``trans-see2`` template to its target page.

    The target is taken from the second argument, falling back to the first
    argument and then to the current page title. Text after "#" in the
    target is treated as a section id. The target page is loaded and parsed,
    its "翻訳" section is located with ``find_subpage_section``, and that
    section is extracted into ``word_entry`` using the template's first
    argument as the sense.

    Returns without doing anything when the page body or the section cannot
    be found.
    """
    # Handles Template:trans-see and Template:trans-see2 on ja.wiktionary.
    params = t_node.template_parameters
    target_arg = params.get(2, params.get(1, wxr.wtp.title))
    full_target = clean_node(wxr, None, target_arg)
    sense = clean_node(wxr, None, params.get(1, ""))

    # "Page#anchor" -> ("Page", "anchor"); without "#", the anchor is "".
    page_title, _, target_id = full_target.partition("#")

    # Second argument 0 is presumably the main namespace id; confirm against
    # wikitextprocessor's get_page_body.
    page_body = wxr.wtp.get_page_body(page_title, 0)
    if page_body is None:
        return

    root = wxr.wtp.parse(page_body)
    target_node = find_subpage_section(wxr, root, "翻訳", target_id)
    if target_node is None:
        return
    extract_translation_section(
        wxr, word_entry, target_node, sense=sense, is_subpage=True
    )
def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_title: str, target_id: str
) -> WikiNode | None:
    """Find the section under ``root`` matching a title and optional id.

    Level nodes are searched recursively. A node matches when its cleaned
    heading equals ``target_title`` and either ``target_id`` is empty or the
    node contains a ``span`` element whose ``id`` attribute equals
    ``target_id``.

    Returns:
        The first matching level node, or ``None`` if there is none.
    """
    for candidate in root.find_child_recursively(LEVEL_KIND_FLAGS):
        heading = clean_node(wxr, None, candidate.largs)
        if heading != target_title:
            continue
        if target_id == "":
            return candidate
        has_anchor = any(
            span.attrs.get("id", "") == target_id
            for span in candidate.find_html("span")
        )
        if has_anchor:
            return candidate
    return None