Coverage for src/wiktextract/extractor/vi/translation.py: 65%
119 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1from itertools import count
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .linkage import QUALIFIER_TEMPALTES, extract_qualifier_template
15from .models import Translation, WordEntry
16from .section_titles import TRANSLATION_SECTIONS
17from .tags import translate_raw_tags
def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    from_trans_see: bool = False,
    source: str = "",
):
    """Collect translations found directly under `level_node`.

    Direct children are handled by type:

    * "trans-top" template: its first argument becomes the current sense,
      except when a non-empty sense was handed in together with
      `from_trans_see=True`.
    * "trans-see" template: delegated to `extract_trans_see_template`,
      but only when this call did not itself come from a "trans-see".
    * "multitrans" template: delegated to `extract_multitrans_template`.
    * list node: every list item goes to `extract_translation_list_item`
      with the current sense and `source`.
    * bold node: its cleaned text becomes the current sense.

    Results are appended to `word_entry` by the helpers; nothing is
    returned.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            template_name = child.template_name
            if template_name == "trans-top":
                # Keep a sense inherited from "trans-see"; otherwise read
                # the gloss from the template's first argument.
                if sense == "" or not from_trans_see:
                    sense = clean_node(
                        wxr, None, child.template_parameters.get(1, "")
                    )
                    # Second pass with `word_entry` as the target;
                    # presumably records side data such as categories -
                    # TODO confirm against clean_node.
                    clean_node(wxr, word_entry, child)
            elif template_name == "trans-see":
                if not from_trans_see:
                    extract_trans_see_template(wxr, word_entry, child)
            elif template_name == "multitrans":
                extract_multitrans_template(wxr, word_entry, child)
            continue

        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, item, sense, source
                )
        elif child.kind == NodeKind.BOLD:
            sense = clean_node(wxr, None, child)
# Template names listed in the wiki category "Thể loại:Bản mẫu ngữ pháp"
# ("Category:Grammar templates").
ABBR_TAG_TEMPLATES = {
    "f",
    "fm",
    "g",
    "inv",
    "m",
    "mf",
    "mn",
    "n",
    "p",
}
def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    source: str,
):
    """Extract the translations written in one list item.

    The language name is everything in the item before the first ":"
    found in a plain-text child; its code is looked up with
    `name_to_code(..., "vi")`. Both stay "unknown" when nothing usable is
    found. After that:

    * translation templates ("t", "t-", "t+", "t2", "t2+", "tt+") are
      handed to `extract_t_template`;
    * plain links become `Translation` entries once a language name is
      known;
    * nested lists are processed recursively with the same sense/source;
    * qualifier and abbreviation-tag templates add raw tags to the most
      recently appended translation, if there is one.
    """
    t_template_names = ("t", "t-", "t+", "t2", "t2+", "tt+")
    lang_name = lang_code = "unknown"

    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            # Only the first text chunk containing ":" names the language.
            if lang_name == "unknown" and ":" in child:
                leading = clean_node(
                    wxr, None, list_item.children[:position]
                )
                lang_name = (
                    leading + child.partition(":")[0].strip()
                ) or "unknown"
                if lang_name != "unknown":
                    lang_code = name_to_code(lang_name, "vi") or "unknown"
            continue

        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in t_template_names:
                extract_t_template(
                    wxr, word_entry, child, lang_name, sense, source
                )
            elif word_entry.translations:
                # Tag templates describe the translation that precedes
                # them; qualifier templates take priority over
                # abbreviation templates.
                if name in QUALIFIER_TEMPALTES:
                    new_tags = extract_qualifier_template(wxr, child)
                elif name in ABBR_TAG_TEMPLATES:
                    new_tags = extract_abbr_tag_template(wxr, child)
                else:
                    continue
                latest = word_entry.translations[-1]
                latest.raw_tags.extend(new_tags)
                translate_raw_tags(latest)
            continue

        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LINK:
            if lang_name == "unknown":
                continue
            word = clean_node(wxr, None, child)
            if word != "":
                word_entry.translations.append(
                    Translation(
                        word=word,
                        lang=lang_name,
                        lang_code=lang_code,
                        sense=sense,
                        source=source,
                    )
                )
        elif child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, nested_item, sense, source
                )
def extract_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    lang_name: str,
    sense: str,
    source: str,
) -> None:
    """Expand one translation template and append what it yields.

    The language code is the template's first argument ("unknown" when it
    cleans to an empty string). The template is expanded and parsed; if
    the expansion still contains a "t" or "t+" template at its top level,
    that inner template is expanded and used instead.

    From the expanded tree:

    * `title` attributes of `<abbr>` tags become raw tags;
    * a `<span>` whose `lang` ends in "-Latn" or whose `class` is exactly
      "tr" supplies `roman` (the last such span wins); for language code
      "ja" a "," splits it into `other` and `roman`;
    * every `<span>` whose `lang` equals the language code and whose text
      is non-empty yields a `Translation`, tagged "Traditional-Chinese"
      or "Simplified-Chinese" when its classes include "Hant" / "Hans".

    Top-level links of the expansion are finally passed to `clean_node`
    with `word_entry` as the target.
    """
    params = t_node.template_parameters
    lang_code = clean_node(wxr, None, params.get(1, "")) or "unknown"

    first_pass = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    expanded = first_pass
    # Iterate the first expansion only; `expanded` may be rebound to the
    # expansion of a nested "t"/"t+" template while doing so.
    for inner in first_pass.find_child(NodeKind.TEMPLATE):
        if inner.template_name in ("t", "t+"):
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(inner), expand_all=True
            )

    lit = clean_node(wxr, None, params.get("lit", ""))

    raw_tags = [
        title
        for abbr in expanded.find_html_recursively("abbr")
        if (title := abbr.attrs.get("title", "")) != ""
    ]

    roman = ""
    other = ""
    for span in expanded.find_html_recursively("span"):
        has_latn_lang = span.attrs.get("lang", "").endswith("-Latn")
        if not (has_latn_lang or span.attrs.get("class", "") == "tr"):
            continue
        roman = clean_node(wxr, None, span)
        if lang_code == "ja" and "," in roman:
            before, _, after = roman.partition(",")
            other = before.strip()
            roman = after.strip()

    script_tags = (
        ("Hant", "Traditional-Chinese"),
        ("Hans", "Simplified-Chinese"),
    )
    for span in expanded.find_html_recursively("span"):
        classes = span.attrs.get("class", "").split()
        if span.attrs.get("lang") != lang_code:
            continue
        word = clean_node(wxr, None, span)
        if word == "":
            continue
        tr_data = Translation(
            word=word,
            lang=lang_name,
            lang_code=lang_code,
            sense=sense,
            source=source,
            roman=roman,
            lit=lit,
            raw_tags=raw_tags,
            other=other,
        )
        # At most one script tag is added; "Hant" is checked first.
        for script_class, script_tag in script_tags:
            if script_class in classes:
                tr_data.tags.append(script_tag)
                break
        translate_raw_tags(tr_data)
        word_entry.translations.append(tr_data)

    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)
def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Pull translations from the pages a "trans-see" template points to.

    The first template argument is the sense. Arguments 2, 3, ... (read
    consecutively until the first missing index) are the page titles to
    consult; when argument 2 is absent the sense text itself is used as
    the only page title. Anything from "#" onwards in a title is dropped
    before the page lookup.

    For each page that can be loaded, the first section whose title is in
    `TRANSLATION_SECTIONS` is processed by `extract_translation_section`
    with `from_trans_see=True`, the sense, and the page title as `source`.

    A title whose page cannot be loaded is skipped on its own; the
    remaining titles are still processed.
    """
    params = t_node.template_parameters
    sense = clean_node(wxr, None, params.get(1, ""))

    page_titles = []
    for index in count(2):
        if index not in params:
            break
        page_titles.append(clean_node(wxr, None, params[index]))
    if not page_titles:
        # No explicit target pages: the sense doubles as the page title.
        page_titles.append(sense)

    for page_title in page_titles:
        # Drop a "#section" suffix; only the page name is looked up.
        page_title = page_title.partition("#")[0]
        page = wxr.wtp.get_page(page_title)
        # Skip just this title instead of abandoning the whole template.
        # NOTE(review): `body` is assumed to be None for pages without
        # text - confirm against wikitextprocessor's Page type.
        if page is None or page.body is None:
            continue
        root = wxr.wtp.parse(page.body, pre_expand=True)
        target_node = find_subpage_section(wxr, root, TRANSLATION_SECTIONS)
        if target_node is None:
            continue
        extract_translation_section(
            wxr,
            word_entry,
            target_node,
            sense=sense,
            from_trans_see=True,
            source=page_title,
        )
def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_sections: set[str]
) -> WikiNode | None:
    """Return the first section under `root` titled like a target section.

    Level nodes are visited in the order `find_child_recursively` yields
    them; each node's cleaned `largs` text is compared against
    `target_sections`. Returns `None` when no section matches.
    """
    matching_sections = (
        section
        for section in root.find_child_recursively(LEVEL_KIND_FLAGS)
        if clean_node(wxr, None, section.largs) in target_sections
    )
    # The generator is lazy, so the search stops at the first match.
    return next(matching_sections, None)
def extract_multitrans_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract translations from a "multitrans" template's `data` argument.

    The `data` argument (empty string when absent) is converted back to
    wikitext, parsed, and the resulting tree is processed like a regular
    translation section with default sense and source.
    """
    data_arg = t_node.template_parameters.get("data", "")
    data_wikitext = wxr.wtp.node_to_wikitext(data_arg)
    data_root = wxr.wtp.parse(data_wikitext)
    extract_translation_section(wxr, word_entry, data_root)
def extract_abbr_tag_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[str]:
    """Return the non-empty `<abbr title=...>` texts a template expands to.

    The template is fully expanded and parsed; the `title` attribute of
    every `<abbr>` tag found anywhere in the result is cleaned with
    `clean_node`, and empty results are dropped. Order follows
    `find_html_recursively`.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cleaned_titles = (
        clean_node(wxr, None, abbr.attrs.get("title", ""))
        for abbr in expanded.find_html_recursively("abbr")
    )
    return [title for title in cleaned_titles if title != ""]