Coverage for src/wiktextract/extractor/th/translation.py: 57%
97 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from itertools import count
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import Translation, WordEntry
15from .section_titles import TRANSLATION_SECTIONS
16from .tags import translate_raw_tags
def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    from_trans_see: bool = False,
    source: str = "",
) -> None:
    """Walk the direct children of a translation section and collect data.

    Three kinds of children are handled:

    * ``trans-top`` templates update the current ``sense`` from their first
      parameter and are passed through ``clean_node`` with ``word_entry``.
    * lists have each item handed to ``extract_translation_list_item``.
    * ``trans-see`` templates are followed via
      ``extract_trans_see_template``, unless this call itself came from a
      ``trans-see`` (``from_trans_see``), which prevents further following.

    ``source`` is forwarded unchanged to the list-item extractor.
    """
    for child in level_node.children:
        is_template = isinstance(child, TemplateNode)
        # A non-empty sense handed over by a "trans-see" caller must not be
        # overwritten by the "trans-top" headers of the referenced page.
        may_update_sense = sense == "" or not from_trans_see
        if (
            is_template
            and child.template_name == "trans-top"
            and may_update_sense
        ):
            sense = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
            clean_node(wxr, word_entry, child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, item, sense, source
                )
        elif (
            is_template
            and child.template_name == "trans-see"
            and not from_trans_see
        ):
            extract_trans_see_template(wxr, word_entry, child)
def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    source: str,
) -> None:
    """Extract translations from a single list item.

    The language name is taken from everything up to the first ``:`` found
    in a plain-text child; its code is looked up with ``name_to_code``.
    Afterwards, per child:

    * ``t`` / ``t+`` / ``t-simple`` templates go to ``extract_t_template``;
    * plain links become ``Translation`` entries (only once a language name
      is known);
    * nested lists are processed recursively;
    * links inside italic text whose cleaned text ends with
      ``/คำแปลภาษาอื่น`` are followed with ``extract_translation_subpage``.
    """
    t_template_names = ("t", "t+", "t-simple")
    lang_name = "unknown"
    lang_code = "unknown"

    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            # Only the first text chunk containing a colon names the language.
            if ":" in child and lang_name == "unknown":
                leading_text = clean_node(
                    wxr, None, list_item.children[:position]
                )
                before_colon = child.partition(":")[0].strip()
                lang_name = (leading_text + before_colon) or "unknown"
                if lang_name != "unknown":
                    lang_code = name_to_code(lang_name, "th")
                    if lang_code == "":
                        lang_code = "unknown"
            continue

        if (
            isinstance(child, TemplateNode)
            and child.template_name in t_template_names
        ):
            extract_t_template(
                wxr, word_entry, child, lang_name, sense, source
            )
            continue

        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LINK and lang_name != "unknown":
            linked_word = clean_node(wxr, None, child)
            if linked_word != "":
                translation = Translation(
                    word=linked_word,
                    lang=lang_name,
                    lang_code=lang_code,
                    sense=sense,
                    source=source,
                )
                word_entry.translations.append(translation)
        elif child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, nested_item, sense, source
                )
        elif child.kind == NodeKind.ITALIC:
            for link in child.find_child(NodeKind.LINK):
                target = clean_node(wxr, None, link)
                if target.endswith("/คำแปลภาษาอื่น"):
                    extract_translation_subpage(wxr, word_entry, target)
def extract_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    lang_name: str,
    sense: str,
    source: str,
) -> None:
    """Build one ``Translation`` from a ``t``-style template.

    The language code is the template's first parameter ("unknown" when it
    cleans to an empty string). The template is expanded and its HTML is
    inspected:

    * the first ``<span>`` whose ``lang`` attribute equals the language code
      supplies the word;
    * any other ``<span>`` whose ``class`` contains ``Latn`` supplies the
      romanization (a later match overwrites an earlier one);
    * every ``<abbr>`` contributes a raw tag.

    The ``lit`` parameter fills the literal meaning. The translation is
    stored only when a word was found; in that case top-level links of the
    expansion are also run through ``clean_node`` with ``word_entry``.
    """
    params = t_node.template_parameters
    lang_code = clean_node(wxr, None, params.get(1, "")) or "unknown"
    translation = Translation(
        word="",
        lang=lang_name,
        lang_code=lang_code,
        sense=sense,
        source=source,
    )

    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    for span in expanded.find_html_recursively("span"):
        if translation.word == "" and span.attrs.get("lang") == lang_code:
            translation.word = clean_node(wxr, None, span)
        elif "Latn" in span.attrs.get("class", ""):
            translation.roman = clean_node(wxr, None, span)

    translation.lit = clean_node(wxr, None, params.get("lit", ""))
    translation.raw_tags.extend(
        clean_node(wxr, None, abbr)
        for abbr in expanded.find_html_recursively("abbr")
    )

    if translation.word == "":
        return

    translate_raw_tags(translation)
    word_entry.translations.append(translation)
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)
def extract_translation_subpage(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract translations from the page named ``page_title``.

    The page is fetched with ``wxr.wtp.get_page(page_title, 0)``. Nothing
    happens when the page or its body is missing, or when the page has no
    section whose title is in ``TRANSLATION_SECTIONS``. Extracted
    translations get ``page_title`` as their ``source``.
    """
    page = wxr.wtp.get_page(page_title, 0)
    if page is not None and page.body is not None:
        section = find_subpage_section(
            wxr, wxr.wtp.parse(page.body), TRANSLATION_SECTIONS
        )
        if section is not None:
            extract_translation_section(
                wxr, word_entry, section, source=page_title
            )
def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Follow a ``trans-see`` template and pull in the referenced translations.

    Parameter 1 of the template is the sense. Parameters 2, 3, ... (as long
    as they are consecutive) are the titles of the pages to read; when
    parameter 2 is absent the sense itself is used as the only page title.
    Any ``#fragment`` suffix is dropped from a title before lookup.

    For every title, the first section whose heading is listed in
    ``TRANSLATION_SECTIONS`` is extracted with ``from_trans_see=True`` and
    the page title as ``source``.

    A title that is empty, or whose page or page body is missing, is
    skipped and the remaining titles are still processed.
    """
    params = t_node.template_parameters
    sense = clean_node(wxr, None, params.get(1, ""))

    page_titles = []
    if 2 in params:
        # Collect consecutive positional parameters starting at 2.
        for index in count(2):
            if index not in params:
                break
            page_titles.append(clean_node(wxr, None, params[index]))
    else:
        page_titles.append(sense)

    for page_title in page_titles:
        # Drop a "#section" anchor; only the page name is looked up.
        page_title = page_title.partition("#")[0]
        if page_title == "":
            continue
        page = wxr.wtp.get_page(page_title)
        # Skip just this title: one missing page must not discard the
        # translations reachable through the other listed pages.
        if page is None or page.body is None:
            continue
        root = wxr.wtp.parse(page.body)
        target_node = find_subpage_section(wxr, root, TRANSLATION_SECTIONS)
        if target_node is not None:
            extract_translation_section(
                wxr,
                word_entry,
                target_node,
                sense=sense,
                from_trans_see=True,
                source=page_title,
            )
def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_sections: tuple[str, ...]
) -> WikiNode | None:
    """Return the first heading node under ``root`` titled as requested.

    Heading nodes are visited in the order produced by
    ``root.find_child_recursively(LEVEL_KIND_FLAGS)``; a node matches when
    its cleaned title is contained in ``target_sections``. ``None`` is
    returned when no heading matches.
    """
    matching_sections = (
        heading
        for heading in root.find_child_recursively(LEVEL_KIND_FLAGS)
        if clean_node(wxr, None, heading.largs) in target_sections
    )
    return next(matching_sections, None)