Coverage for src/wiktextract/extractor/th/translation.py: 68%
76 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from mediawiki_langcodes import name_to_code
2from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
4from ...page import clean_node
5from ...wxr_context import WiktextractContext
6from .models import Translation, WordEntry
7from .tags import translate_raw_tags
def extract_translation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect translations from the direct children of a section node.

    The children are scanned in order.  Each ``trans-top`` template sets
    the sense (taken from its first positional argument) that is attached
    to the list items that follow it, until the next ``trans-top``
    replaces it.  Every list item of every list child is passed to
    ``extract_translation_list_item``, which appends to
    ``word_entry.translations``.
    """
    current_sense = ""
    for child in level_node.children:
        is_trans_top = (
            isinstance(child, TemplateNode)
            and child.template_name == "trans-top"
        )
        if is_trans_top:
            current_sense = clean_node(
                wxr, None, child.template_parameters.get(1, "")
            )
            # Second pass with the entry as target; presumably this lets
            # clean_node record page-level data (e.g. categories) emitted
            # by the template -- confirm against clean_node.
            clean_node(wxr, word_entry, child)
            continue

        if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            extract_translation_list_item(
                wxr, word_entry, item, current_sense
            )
def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
) -> None:
    """Extract the translations found in one translation list item.

    The language name is everything in the item that precedes the first
    ``:`` seen in a plain-text child; it is resolved to a language code
    with ``name_to_code(..., "th")``.  Both values stay ``"unknown"``
    when they cannot be determined.  After that:

    * ``t`` / ``t+`` / ``t-simple`` templates go to ``extract_t_template``;
    * plain links become ``Translation`` entries, but only once a
      language name is known;
    * nested lists are processed recursively with the same ``sense``;
    * links inside italic text whose cleaned text ends with
      ``/คำแปลภาษาอื่น`` are followed via ``extract_translation_page``.
    """
    lang_name = "unknown"
    lang_code = "unknown"
    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            # Only the first text chunk containing a colon while the name
            # is still unresolved can define the language name.
            if ":" not in child or lang_name != "unknown":
                continue
            leading_text = clean_node(
                wxr, None, list_item.children[:position]
            )
            before_colon = child.partition(":")[0].strip()
            lang_name = leading_text + before_colon
            if lang_name == "":
                lang_name = "unknown"
            if lang_name == "unknown":
                continue
            resolved_code = name_to_code(lang_name, "th")
            lang_code = "unknown" if resolved_code == "" else resolved_code
            continue

        if isinstance(child, TemplateNode) and child.template_name in (
            "t",
            "t+",
            "t-simple",
        ):
            extract_t_template(wxr, word_entry, child, lang_name, sense)
            continue

        if not isinstance(child, WikiNode):
            continue

        child_kind = child.kind
        if child_kind == NodeKind.LINK:
            # Bare links are only trusted once the language is known.
            if lang_name == "unknown":
                continue
            linked_word = clean_node(wxr, None, child)
            if linked_word == "":
                continue
            translation = Translation(
                word=linked_word,
                lang=lang_name,
                lang_code=lang_code,
                sense=sense,
            )
            word_entry.translations.append(translation)
        elif child_kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, nested_item, sense
                )
        elif child_kind == NodeKind.ITALIC:
            for link in child.find_child(NodeKind.LINK):
                target_title = clean_node(wxr, None, link)
                if target_title.endswith("/คำแปลภาษาอื่น"):
                    extract_translation_page(wxr, word_entry, target_title)
def extract_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    lang_name: str,
    sense: str,
) -> None:
    """Build a ``Translation`` from a ``t``-style template node.

    The language code is the template's first positional argument
    (``"unknown"`` when it cleans to an empty string).  The template is
    converted back to wikitext, re-parsed with ``expand_all=True``, and
    the expanded tree is inspected:

    * the first ``<span>`` whose ``lang`` attribute equals the language
      code provides the translated word;
    * any other ``<span>`` whose ``class`` contains ``Latn`` sets the
      romanization (a later match overwrites an earlier one);
    * the text of every ``<abbr>`` is added to ``raw_tags``.

    ``lit`` comes from the template's ``lit`` argument.  The translation
    is appended to ``word_entry.translations`` only when a word was
    found; otherwise nothing on ``word_entry`` is touched.
    """
    first_arg = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    lang_code = first_arg if first_arg != "" else "unknown"
    translation = Translation(
        word="", lang=lang_name, lang_code=lang_code, sense=sense
    )

    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    for span in expanded.find_html_recursively("span"):
        has_target_lang = span.attrs.get("lang") == lang_code
        if has_target_lang and translation.word == "":
            translation.word = clean_node(wxr, None, span)
            continue
        if "Latn" in span.attrs.get("class", ""):
            translation.roman = clean_node(wxr, None, span)

    lit_arg = t_node.template_parameters.get("lit", "")
    translation.lit = clean_node(wxr, None, lit_arg)

    translation.raw_tags.extend(
        clean_node(wxr, None, abbr)
        for abbr in expanded.find_html_recursively("abbr")
    )

    if translation.word == "":
        return

    translate_raw_tags(translation)
    word_entry.translations.append(translation)
    # Clean the top-level links of the expansion against the entry;
    # presumably this is how category links produced by the template get
    # recorded -- confirm against clean_node.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)
def extract_translation_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
) -> None:
    """Pull translations for ``word_entry`` from a separate page.

    The page named ``page_title`` is fetched and parsed.  Only the
    level-2 section whose heading (with a leading ``ภาษา`` removed)
    equals ``word_entry.lang`` is considered, and within it only the
    level-3 sections whose heading equals ``word_entry.pos_title``.
    Every level-4 section under those is processed by
    ``extract_translation_section``.  Nothing happens when the page or
    its body is missing.
    """
    # NOTE(review): the second argument 0 is presumably the namespace id
    # of the main namespace -- confirm against Wtp.get_page.
    page = wxr.wtp.get_page(page_title, 0)
    body = None if page is None else page.body
    if body is None:
        return

    tree = wxr.wtp.parse(body)
    for lang_section in tree.find_child(NodeKind.LEVEL2):
        lang_heading = clean_node(wxr, None, lang_section.largs)
        if lang_heading.removeprefix("ภาษา") == word_entry.lang:
            for pos_section in lang_section.find_child(NodeKind.LEVEL3):
                pos_heading = clean_node(wxr, None, pos_section.largs)
                if pos_heading == word_entry.pos_title:
                    for tr_section in pos_section.find_child(
                        NodeKind.LEVEL4
                    ):
                        extract_translation_section(
                            wxr, word_entry, tr_section
                        )