Coverage for src/wiktextract/extractor/vi/translation.py: 66%
105 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from itertools import count
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .linkage import QUALIFIER_TEMPALTES, extract_qualifier_template
15from .models import Translation, WordEntry
16from .section_titles import TRANSLATION_SECTIONS
17from .tags import translate_raw_tags
def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    from_trans_see: bool = False,
    source: str = "",
):
    """Collect translations from the direct children of a section node.

    Templates are dispatched by name:

    * ``trans-top``: its first positional argument becomes the sense used
      for the lists that follow, and the template is also run through
      ``clean_node`` with ``word_entry``.  This is skipped when the call
      comes from a ``trans-see`` expansion that already supplied a
      non-empty ``sense``.
    * ``trans-see``: handed to ``extract_trans_see_template`` unless this
      call itself originates from a ``trans-see`` template.
    * ``multitrans``: handed to ``extract_multitrans_template``.

    Every list item of a list node is passed to
    ``extract_translation_list_item`` with the current sense and source.
    """
    for child in level_node.children:
        if not isinstance(child, TemplateNode):
            if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                for item in child.find_child(NodeKind.LIST_ITEM):
                    extract_translation_list_item(
                        wxr, word_entry, item, sense, source
                    )
            continue

        name = child.template_name
        # Keep the sense given by "trans-see" instead of overriding it
        # with the gloss of the target page's "trans-top".
        if name == "trans-top" and (sense == "" or not from_trans_see):
            sense = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
            clean_node(wxr, word_entry, child)
        elif name == "trans-see" and not from_trans_see:
            extract_trans_see_template(wxr, word_entry, child)
        elif name == "multitrans":
            extract_multitrans_template(wxr, word_entry, child)
def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    source: str,
):
    """Extract the translations found in one list item.

    The language name is taken from everything before the first ``:``
    seen in a string child (preceding nodes are cleaned and prepended);
    its code is looked up with ``name_to_code(..., "vi")``.  Both fall
    back to ``"unknown"``.  Afterwards:

    * translation templates (``t``, ``t-``, ``t+``, ``t2``, ``t2+``,
      ``tt+``) go to ``extract_t_template``;
    * link nodes become translations once a language name is known;
    * nested lists are processed recursively, item by item;
    * qualifier templates add raw tags to the most recently appended
      translation, if there is one.
    """
    t_template_names = ("t", "t-", "t+", "t2", "t2+", "tt+")
    lang_name = "unknown"
    lang_code = "unknown"

    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            # Only the first colon-bearing string sets the language.
            if ":" in child and lang_name == "unknown":
                prefix = clean_node(
                    wxr, None, list_item.children[:position]
                )
                before_colon = child.partition(":")[0].strip()
                lang_name = (prefix + before_colon) or "unknown"
                if lang_name != "unknown":
                    lang_code = name_to_code(lang_name, "vi") or "unknown"
            continue

        is_template = isinstance(child, TemplateNode)
        kind = child.kind if isinstance(child, WikiNode) else None

        if is_template and child.template_name in t_template_names:
            extract_t_template(
                wxr, word_entry, child, lang_name, sense, source
            )
        elif kind == NodeKind.LINK and lang_name != "unknown":
            word = clean_node(wxr, None, child)
            if word != "":
                word_entry.translations.append(
                    Translation(
                        word=word,
                        lang=lang_name,
                        lang_code=lang_code,
                        sense=sense,
                        source=source,
                    )
                )
        elif kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, nested_item, sense, source
                )
        elif (
            is_template
            and child.template_name in QUALIFIER_TEMPALTES
            and word_entry.translations
        ):
            # A qualifier applies to the translation right before it.
            latest = word_entry.translations[-1]
            latest.raw_tags.extend(extract_qualifier_template(wxr, child))
            translate_raw_tags(latest)
def extract_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    lang_name: str,
    sense: str,
    source: str,
) -> None:
    """Expand a translation template and append its translations.

    The language code is the template's first positional argument
    (``"unknown"`` when empty) and ``lit`` comes from the ``lit``
    argument.  From the expanded HTML:

    * non-empty ``title`` attributes of ``<abbr>`` tags become raw tags;
    * a ``<span>`` whose ``lang`` ends with ``-Latn`` or whose ``class``
      is exactly ``tr`` supplies the romanization; for ``ja`` a value
      containing a comma is split into ``other`` and ``roman``;
    * each ``<span>`` whose ``lang`` equals the language code and that
      has non-empty text produces one ``Translation``, tagged according
      to a ``Hant``/``Hans`` class when present.

    Top-level links of the expansion are finally run through
    ``clean_node`` with ``word_entry``.
    """
    params = t_node.template_parameters
    lang_code = clean_node(wxr, None, params.get(1, "")) or "unknown"

    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # If the expansion still holds a "t"/"t+" template, expand that one.
    for inner in expanded.find_child(NodeKind.TEMPLATE):
        if inner.template_name in ("t", "t+"):
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(inner), expand_all=True
            )

    lit = clean_node(wxr, None, params.get("lit", ""))

    abbr_titles = (
        abbr.attrs.get("title", "")
        for abbr in expanded.find_html_recursively("abbr")
    )
    raw_tags = [title for title in abbr_titles if title != ""]

    roman = ""
    other = ""
    for span in expanded.find_html_recursively("span"):
        holds_roman = (
            span.attrs.get("lang", "").endswith("-Latn")
            or span.attrs.get("class", "") == "tr"
        )
        if not holds_roman:
            continue
        roman = clean_node(wxr, None, span)
        if lang_code == "ja" and "," in roman:
            first_part, _, second_part = roman.partition(",")
            other = first_part.strip()
            roman = second_part.strip()

    for span in expanded.find_html_recursively("span"):
        class_names = span.attrs.get("class", "").split()
        if span.attrs.get("lang") != lang_code:
            continue
        word = clean_node(wxr, None, span)
        if word == "":
            continue
        translation = Translation(
            word=word,
            lang=lang_name,
            lang_code=lang_code,
            sense=sense,
            source=source,
            roman=roman,
            lit=lit,
            raw_tags=raw_tags,
            other=other,
        )
        if "Hant" in class_names:
            translation.tags.append("Traditional-Chinese")
        elif "Hans" in class_names:
            translation.tags.append("Simplified-Chinese")
        translate_raw_tags(translation)
        word_entry.translations.append(translation)

    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)
def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Follow a ``trans-see`` template and extract the linked translations.

    The first positional argument is the sense.  Positional arguments
    from 2 onwards (consecutive) name the pages to read; when there is no
    argument 2, the sense itself is used as the page title.  Any
    ``#fragment`` is removed from a title before the page is looked up.

    For each page that exists, the first section whose title is in
    ``TRANSLATION_SECTIONS`` is passed to ``extract_translation_section``
    with ``from_trans_see=True`` and the page title as ``source``.

    A title that cannot be resolved to a page is skipped; the remaining
    titles are still processed.
    """
    sense = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    page_titles = []
    if 2 in t_node.template_parameters:
        # Collect arguments 2, 3, ... until the first missing index.
        for index in count(2):
            if index not in t_node.template_parameters:
                break
            page_titles.append(
                clean_node(wxr, None, t_node.template_parameters[index])
            )
    else:
        page_titles.append(sense)

    for page_title in page_titles:
        if "#" in page_title:
            page_title = page_title[: page_title.index("#")]
        page = wxr.wtp.get_page(page_title)
        if page is None:
            # Skip only this title: returning here would discard every
            # later page listed in the same template.
            continue
        root = wxr.wtp.parse(page.body, pre_expand=True)
        target_node = find_subpage_section(wxr, root, TRANSLATION_SECTIONS)
        if target_node is not None:
            extract_translation_section(
                wxr,
                word_entry,
                target_node,
                sense=sense,
                from_trans_see=True,
                source=page_title,
            )
def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_sections: set[str]
) -> WikiNode | None:
    """Return the first section node whose title is in ``target_sections``.

    Level nodes below ``root`` are visited in the order produced by
    ``find_child_recursively(LEVEL_KIND_FLAGS)``; a node's title is its
    ``largs`` cleaned with ``clean_node``.  ``None`` is returned when no
    title matches.
    """
    matching_sections = (
        section
        for section in root.find_child_recursively(LEVEL_KIND_FLAGS)
        if clean_node(wxr, None, section.largs) in target_sections
    )
    # The generator is lazy, so the search stops at the first match.
    return next(matching_sections, None)
def extract_multitrans_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract translations wrapped in a ``multitrans`` template.

    The template's ``data`` argument (empty string when absent) is
    converted back to wikitext, parsed, and the parsed root is processed
    by ``extract_translation_section`` with default sense and source.
    """
    data_argument = t_node.template_parameters.get("data", "")
    data_wikitext = wxr.wtp.node_to_wikitext(data_argument)
    parsed_root = wxr.wtp.parse(data_wikitext)
    extract_translation_section(wxr, word_entry, parsed_root)