Coverage for src/wiktextract/extractor/zh/translation.py: 88%
100 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from typing import Optional, Union
3from mediawiki_langcodes import code_to_name, name_to_code
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .models import Translation, WordEntry
14from .section_titles import TRANSLATIONS_TITLES
15from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
def extract_translation(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    sense: str = "",
    is_subpage: bool = False,
) -> None:
    """Extract translations found under ``level_node`` into ``page_data``.

    Only the direct template and list children of ``level_node`` are
    inspected:

    * ``trans-top``-style templates with a first argument update the
      current ``sense`` used for the translations that follow;
    * ``see translation subpage`` / ``trans-see`` templates load the
      translations from another page (skipped when ``is_subpage`` is true);
    * ``multitrans`` templates have their ``data`` argument re-parsed and
      processed recursively;
    * every list item of a list child is handed to
      ``process_translation_list_item``.

    Args:
        wxr: Extraction context.
        page_data: Word entries of the page; translations are appended to
            the last entry.
        level_node: Section (or parsed root) node to scan.
        sense: Initial sense gloss attached to extracted translations.
        is_subpage: True when ``level_node`` comes from a translation
            subpage, in which case subpage-link templates are not followed.
    """
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if not isinstance(child, TemplateNode):
            # List node: each (possibly nested) item is one language line.
            for list_item in child.find_child_recursively(NodeKind.LIST_ITEM):
                process_translation_list_item(
                    wxr,
                    page_data,
                    list_item,
                    sense,
                )
            continue

        template_name = child.template_name.lower()
        if (
            template_name in {"trans-top", "翻譯-頂", "trans-top-also"}
            and 1 in child.template_parameters
        ):
            sense = clean_node(wxr, None, child.template_parameters.get(1))
        elif (
            template_name in {"see translation subpage", "trans-see"}
            and not is_subpage
        ):
            translation_subpage(wxr, page_data, child)
        elif template_name == "multitrans":
            wikitext = "".join(
                wxr.wtp.node_to_wikitext(c)
                for c in child.template_parameters.get("data", [])
            )
            multitrans = wxr.wtp.parse(wikitext)
            # Propagate ``is_subpage`` so subpage-link templates nested in
            # the multitrans data are still ignored while on a subpage.
            extract_translation(
                wxr, page_data, multitrans, sense, is_subpage=is_subpage
            )
def _store_translation(page_data: list[WordEntry], tr_data: Translation) -> None:
    """Translate the raw tags of ``tr_data`` and append a copy to the last entry."""
    translate_raw_tags(tr_data)
    page_data[-1].translations.append(tr_data.model_copy(deep=True))


def process_translation_list_item(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_item: WikiNode,
    sense: str,
) -> None:
    """Extract the translations of one language line.

    The first non-empty child of ``list_item`` supplies the language name
    (the text before a colon, or the cleaned text of a node).  Each
    following translation template or link starts a new translation; the
    qualifier templates around it add raw tags.  Finished translations are
    appended to ``page_data[-1].translations``.

    Args:
        wxr: Extraction context.
        page_data: Word entries of the page; the last one receives the
            translations.
        list_item: List item node holding one language's translations.
        sense: Sense gloss attached to every translation of this item.
    """
    tr_data = Translation(word="", sense=sense)

    for child_index, child in enumerate(list_item.filter_empty_str_child()):
        if child_index == 0:
            lang_text = ""
            if isinstance(child, str):
                # The language name ends at the first colon; the full-width
                # form is checked before the ASCII one.
                for colon in ("：", ":"):
                    if colon in child:
                        lang_text = child[: child.index(colon)]
                        break
            else:
                lang_text = clean_node(wxr, None, child)
            if len(lang_text) > 0:
                tr_data.lang = lang_text.strip()
                tr_data.lang_code = name_to_code(tr_data.lang, "zh")
        elif isinstance(child, TemplateNode):
            template_name = child.template_name.lower()
            if template_name in {
                "t",
                "t+",
                "tt",
                "tt+",
                "t-check",
                "t+check",
                "l",
            }:
                if len(tr_data.word) > 0:
                    # A new word template closes the previous translation.
                    _store_translation(page_data, tr_data)
                    tr_data = Translation(
                        word="",
                        lang=tr_data.lang,
                        lang_code=tr_data.lang_code,
                        sense=sense,
                    )
                if tr_data.lang_code == "":
                    tr_data.lang_code = clean_node(
                        wxr, None, child.template_parameters.get(1, "")
                    )
                if tr_data.lang == "":
                    tr_data.lang = code_to_name(tr_data.lang_code, "zh")
                tr_data.word = clean_node(
                    wxr, None, child.template_parameters.get(2, "")
                )
                tr_data.roman = clean_node(
                    wxr, None, child.template_parameters.get("tr", "")
                )
                tr_data.alt = clean_node(
                    wxr, None, child.template_parameters.get("alt", "")
                )
                tr_data.lit = clean_node(
                    wxr, None, child.template_parameters.get("lit", "")
                )
                for arg_key, arg_value in child.template_parameters.items():
                    is_tag_arg = (
                        isinstance(arg_key, int) and arg_key >= 3
                    ) or arg_key == "g"  # template "l" uses the "g" arg
                    # Only plain-string arguments can be split into tag codes.
                    if not is_tag_arg or not isinstance(arg_value, str):
                        continue
                    for tag_arg in arg_value.split("-"):
                        if tag_arg in TEMPLATE_TAG_ARGS:
                            tr_data.tags.append(TEMPLATE_TAG_ARGS[tag_arg])
            elif template_name == "t-needed":
                # ignore empty translation
                continue
            elif template_name in ("qualifier", "q"):
                raw_tag = clean_node(wxr, None, child)
                tr_data.raw_tags.append(raw_tag.strip("()"))
            else:
                # zh qualifier templates that use template "注释"
                # https://zh.wiktionary.org/wiki/Template:注释
                raw_tag = clean_node(wxr, None, child)
                if raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    tr_data.raw_tags.append(raw_tag.strip("〈〉"))
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            if len(tr_data.word) > 0:
                # A plain link also starts a new translation.
                _store_translation(page_data, tr_data)
                tr_data = Translation(
                    word="",
                    lang=tr_data.lang,
                    lang_code=tr_data.lang_code,
                    sense=sense,
                )
            tr_data.word = clean_node(wxr, None, child)

    if len(tr_data.word) > 0:
        _store_translation(page_data, tr_data)
def translation_subpage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: TemplateNode,
) -> None:
    """Extract translations from the page referenced by ``template_node``.

    The page title comes from the template's second argument and defaults
    to the current page title; any ``#fragment`` is dropped.  When the
    title equals the current page, the ``/翻譯`` subpage is used instead.
    For the "see translation subpage" template the first argument names
    the section of that page in which to look for the translation section.

    Args:
        wxr: Extraction context.
        page_data: Word entries of the page; the last one receives the
            translations.
        template_node: The subpage-link template node.
    """
    # https://zh.wiktionary.org/wiki/Template:翻譯-見
    # https://zh.wiktionary.org/wiki/Template:See_translation_subpage

    target_section = None
    # Compare case-insensitively, matching how the caller dispatches on
    # the lower-cased template name.
    if template_node.template_name.lower() == "see translation subpage":
        section_arg = template_node.template_parameters.get(1)
        if section_arg is not None:
            # Section titles are compared after cleaning, so clean the
            # argument as well; an empty value means "no specific section".
            target_section = clean_node(wxr, None, section_arg) or None
    page_title = clean_node(
        wxr, None, template_node.template_parameters.get(2, wxr.wtp.title)
    )
    if "#" in page_title:
        page_title = page_title[: page_title.index("#")]

    translation_subpage_title = page_title
    if page_title == wxr.wtp.title:
        translation_subpage_title = f"{page_title}/翻譯"
    subpage = wxr.wtp.get_page(translation_subpage_title)
    if subpage is None:
        return

    root = wxr.wtp.parse(subpage.body, pre_expand=True)
    target_section_node = (
        root
        if target_section is None
        else find_subpage_section(wxr, root, target_section)
    )
    translation_node = find_subpage_section(wxr, target_section_node)
    if translation_node is not None:
        extract_translation(wxr, page_data, translation_node, is_subpage=True)
def find_subpage_section(
    wxr: WiktextractContext,
    node: Union[WikiNode, str],
    target_section: Union[str, None] = None,
) -> Optional[WikiNode]:
    """Return the first matching section heading below ``node``.

    When ``target_section`` is a string, the first descendant heading whose
    cleaned title equals it is returned.  Otherwise the first descendant
    heading whose cleaned title is in ``TRANSLATIONS_TITLES`` is returned.

    Args:
        wxr: Extraction context.
        node: Node to search in; anything that is not a ``WikiNode``
            yields ``None``.
        target_section: Exact section title to look for, or ``None`` to
            look for a translation section.

    Returns:
        The matching heading node, or ``None`` when nothing matches.
    """
    if not isinstance(node, WikiNode):
        return None
    match_by_name = isinstance(target_section, str)
    for level_node in node.find_child_recursively(LEVEL_KIND_FLAGS):
        section_title = clean_node(wxr, None, level_node.largs)
        if match_by_name:
            # Do not stop at an unrelated translation section that comes
            # before the requested one.
            if section_title == target_section:
                return level_node
        elif section_title in TRANSLATIONS_TITLES:
            return level_node
    return None