Coverage for src/wiktextract/extractor/th/translation.py: 57%

97 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1from itertools import count 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .models import Translation, WordEntry 

15from .section_titles import TRANSLATION_SECTIONS 

16from .tags import translate_raw_tags 

17 

18 

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    from_trans_see: bool = False,
    source: str = "",
) -> None:
    """Walk the direct children of a translation section.

    Three kinds of children are acted upon:

    * a "trans-top" template updates the current ``sense`` from its first
      positional parameter;
    * a list node has each of its list items handed to
      ``extract_translation_list_item`` together with the current sense;
    * a "trans-see" template is handed to ``extract_trans_see_template``.

    Args:
        wxr: Extraction context passed through to every helper.
        word_entry: Entry whose ``translations`` list the helpers append to.
        level_node: Section node whose children are scanned.
        sense: Initial sense text attached to extracted translations.
        from_trans_see: True when this call was triggered by a "trans-see"
            template; nested "trans-see" templates are then ignored.
        source: Value stored as ``source`` on extracted translations.
    """
    for child in level_node.children:
        is_template = isinstance(child, TemplateNode)
        # Must be recomputed on every pass: `sense` can change below, and
        # once a "trans-see" call holds a non-empty sense, later
        # "trans-top" templates no longer replace it.
        sense_is_pinned = from_trans_see and sense != ""

        if (
            is_template
            and child.template_name == "trans-top"
            and not sense_is_pinned
        ):
            sense = clean_node(wxr, None, child.template_parameters.get(1, ""))
            # Second call passes word_entry instead of None; presumably so
            # clean_node can record page data (e.g. categories) on the
            # entry -- confirm against clean_node.
            clean_node(wxr, word_entry, child)
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, item, sense, source
                )
            continue

        if (
            is_template
            and child.template_name == "trans-see"
            and not from_trans_see
        ):
            extract_trans_see_template(wxr, word_entry, child)

46 

47 

def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    source: str,
) -> None:
    """Extract translations from one list item of a translation list.

    The language name is everything in the list item that precedes the
    first ":" found in a plain-text child. Children are then handled by
    type: "t", "t+" and "t-simple" templates, bare links, nested lists
    (processed recursively) and italic nodes linking to a translation
    subpage.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list is appended to.
        list_item: The list item node to scan.
        sense: Sense text stored on every extracted translation.
        source: Source value stored on every extracted translation.
    """
    translation_templates = ("t", "t+", "t-simple")
    lang_name = "unknown"
    lang_code = "unknown"

    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            # Only the first text chunk containing a colon names the
            # language; later chunks are ignored once a name is known.
            if ":" in child and lang_name == "unknown":
                leading_text = clean_node(
                    wxr, None, list_item.children[:position]
                )
                before_colon = child.split(":", 1)[0].strip()
                lang_name = (leading_text + before_colon) or "unknown"
                if lang_name != "unknown":
                    lang_code = name_to_code(lang_name, "th")
                    if lang_code == "":
                        lang_code = "unknown"
        elif (
            isinstance(child, TemplateNode)
            and child.template_name in translation_templates
        ):
            extract_t_template(
                wxr, word_entry, child, lang_name, sense, source
            )
        elif isinstance(child, WikiNode):
            if child.kind == NodeKind.LINK and lang_name != "unknown":
                # A bare link after a known language name is a translation.
                linked_word = clean_node(wxr, None, child)
                if linked_word != "":
                    word_entry.translations.append(
                        Translation(
                            word=linked_word,
                            lang=lang_name,
                            lang_code=lang_code,
                            sense=sense,
                            source=source,
                        )
                    )
            elif child.kind == NodeKind.LIST:
                for nested_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_translation_list_item(
                        wxr, word_entry, nested_item, sense, source
                    )
            elif child.kind == NodeKind.ITALIC:
                # Italic text may link to a dedicated translation subpage.
                for link in child.find_child(NodeKind.LINK):
                    target_title = clean_node(wxr, None, link)
                    if target_title.endswith("/คำแปลภาษาอื่น"):
                        extract_translation_subpage(
                            wxr, word_entry, target_title
                        )

101 

102 

def extract_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    lang_name: str,
    sense: str,
    source: str,
) -> None:
    """Build a ``Translation`` from a translation template node.

    The template is expanded and its HTML is inspected: the first ``span``
    whose ``lang`` attribute equals the template's language code supplies
    the word, a ``span`` whose class contains "Latn" supplies the
    romanization, and every ``abbr`` element contributes a raw tag. The
    translation is appended to ``word_entry.translations`` only when a
    word was found.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the translation.
        t_node: The translation template node.
        lang_name: Language name stored on the translation.
        sense: Sense text stored on the translation.
        source: Source value stored on the translation.
    """
    params = t_node.template_parameters
    # The first positional parameter is the language code; fall back to
    # "unknown" when it is empty.
    lang_code = clean_node(wxr, None, params.get(1, "")) or "unknown"
    translation = Translation(
        word="", lang=lang_name, lang_code=lang_code, sense=sense, source=source
    )
    template_text = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_text, expand_all=True)

    for span in expanded.find_html_recursively("span"):
        if translation.word == "" and span.attrs.get("lang") == lang_code:
            translation.word = clean_node(wxr, None, span)
            continue
        if "Latn" in span.attrs.get("class", ""):
            translation.roman = clean_node(wxr, None, span)

    translation.lit = clean_node(wxr, None, params.get("lit", ""))
    translation.raw_tags.extend(
        clean_node(wxr, None, abbr)
        for abbr in expanded.find_html_recursively("abbr")
    )

    if translation.word == "":
        return

    translate_raw_tags(translation)
    word_entry.translations.append(translation)
    # Top-level links of the expansion are cleaned against word_entry;
    # presumably this records category links on the entry -- confirm
    # against clean_node.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)

139 

140 

def extract_translation_subpage(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract translations stored on a separate page.

    Loads ``page_title`` (namespace id 0), locates the first section whose
    title is listed in ``TRANSLATION_SECTIONS`` and processes it with
    ``extract_translation_section``, using the page title as ``source``.
    Does nothing when the page or its body is missing, or when no such
    section exists.
    """
    subpage = wxr.wtp.get_page(page_title, 0)
    body = None if subpage is None else subpage.body
    if body is None:
        return

    section = find_subpage_section(
        wxr, wxr.wtp.parse(body), TRANSLATION_SECTIONS
    )
    if section is None:
        return
    extract_translation_section(wxr, word_entry, section, source=page_title)

153 

154 

def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Follow a "trans-see" template to the pages it points at.

    The first positional parameter is the sense. Positional parameters
    2, 3, ... (consecutive, stopping at the first gap) are the page titles
    to visit; when parameter 2 is absent the sense itself is used as the
    single page title. For each title, any "#fragment" is dropped, the
    page is loaded, and the first section whose title is listed in
    ``TRANSLATION_SECTIONS`` is processed with
    ``extract_translation_section``.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the extracted translations.
        t_node: The "trans-see" template node.
    """
    params = t_node.template_parameters
    sense = clean_node(wxr, None, params.get(1, ""))

    page_titles = []
    for index in count(2):
        if index not in params:
            break
        page_titles.append(clean_node(wxr, None, params[index]))
    if not page_titles:
        # No explicit target page: the sense doubles as the page title.
        page_titles.append(sense)

    for page_title in page_titles:
        # Keep only the page name; a "#section" anchor is not part of it.
        page_title = page_title.partition("#")[0]
        page = wxr.wtp.get_page(page_title)
        # Skip this title only. A missing page must not stop the
        # remaining titles from being processed, and a page without a
        # body cannot be parsed.
        if page is None or page.body is None:
            continue
        root = wxr.wtp.parse(page.body)
        target_node = find_subpage_section(wxr, root, TRANSLATION_SECTIONS)
        if target_node is not None:
            extract_translation_section(
                wxr,
                word_entry,
                target_node,
                sense=sense,
                from_trans_see=True,
                source=page_title,
            )

186 

187 

def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_sections: tuple[str, ...]
) -> WikiNode | None:
    """Return the first section under ``root`` with a matching title.

    Level nodes are visited in the order produced by
    ``root.find_child_recursively(LEVEL_KIND_FLAGS)``; each node's cleaned
    ``largs`` is compared against ``target_sections``. Returns ``None``
    when no title matches.
    """
    matching_sections = (
        candidate
        for candidate in root.find_child_recursively(LEVEL_KIND_FLAGS)
        if clean_node(wxr, None, candidate.largs) in target_sections
    )
    return next(matching_sections, None)