Coverage for src/wiktextract/extractor/vi/translation.py: 65%

119 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1from itertools import count 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .linkage import QUALIFIER_TEMPALTES, extract_qualifier_template 

15from .models import Translation, WordEntry 

16from .section_titles import TRANSLATION_SECTIONS 

17from .tags import translate_raw_tags 

18 

19 

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    from_trans_see: bool = False,
    source: str = "",
):
    """Walk the children of a translation section and collect translations.

    Template children are dispatched on their name: "trans-top" supplies
    the current sense gloss, while "trans-see" and "multitrans" are handed
    to their dedicated extractors.  List children are processed item by
    item, and a bold child replaces the current sense gloss.

    ``sense`` is the initial gloss, ``from_trans_see`` marks a call made
    while following a "trans-see" template, and ``source`` is forwarded
    unchanged to the list item extractor.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            template_name = child.template_name
            if template_name == "trans-top":
                # Keep a non-empty gloss that came with a "trans-see" call.
                if sense == "" or not from_trans_see:
                    sense = clean_node(
                        wxr, None, child.template_parameters.get(1, "")
                    )
                    clean_node(wxr, word_entry, child)
            elif template_name == "trans-see":
                # Not followed when this call itself came from "trans-see".
                if not from_trans_see:
                    extract_trans_see_template(wxr, word_entry, child)
            elif template_name == "multitrans":
                extract_multitrans_template(wxr, word_entry, child)
            continue

        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, item, sense, source
                )
        elif child.kind == NodeKind.BOLD:
            sense = clean_node(wxr, None, child)

48 

49 

# Template names handled as abbreviated grammar tags; the original note
# points to the wiki category "Thể loại:Bản mẫu ngữ pháp"
# (Category: Grammar templates).
ABBR_TAG_TEMPLATES = {"f", "fm", "g", "inv", "m", "mf", "mn", "n", "p"}

52 

53 

def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    source: str,
):
    """Extract the translations found in one list item.

    The language name is the text before the first ":" seen in a string
    child, prefixed by the cleaned text of all preceding children.
    Translation templates go to ``extract_t_template``; plain links
    become ``Translation`` entries once a language name is known; nested
    lists are processed recursively; qualifier and abbreviation templates
    add raw tags to the most recently appended translation.
    """
    lang_name = "unknown"
    lang_code = "unknown"
    t_template_names = ("t", "t-", "t+", "t2", "t2+", "tt+")

    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            # Only the first string containing ":" sets the language.
            if ":" in child and lang_name == "unknown":
                leading_text = clean_node(
                    wxr, None, list_item.children[:position]
                )
                before_colon = child[: child.index(":")].strip()
                lang_name = (leading_text + before_colon) or "unknown"
                if lang_name != "unknown":
                    lang_code = name_to_code(lang_name, "vi") or "unknown"
            continue

        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in t_template_names:
                extract_t_template(
                    wxr, word_entry, child, lang_name, sense, source
                )
            elif len(word_entry.translations) > 0:
                # Tag templates annotate the last translation added so far.
                if name in QUALIFIER_TEMPALTES:
                    new_tags = extract_qualifier_template(wxr, child)
                elif name in ABBR_TAG_TEMPLATES:
                    new_tags = extract_abbr_tag_template(wxr, child)
                else:
                    continue
                latest = word_entry.translations[-1]
                latest.raw_tags.extend(new_tags)
                translate_raw_tags(latest)
            continue

        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LINK:
            if lang_name != "unknown":
                word = clean_node(wxr, None, child)
                if word != "":
                    word_entry.translations.append(
                        Translation(
                            word=word,
                            lang=lang_name,
                            lang_code=lang_code,
                            sense=sense,
                            source=source,
                        )
                    )
        elif child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, nested_item, sense, source
                )

119 

120 

def extract_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    lang_name: str,
    sense: str,
    source: str,
) -> None:
    """Expand one translation template and append the translations it yields.

    The expanded template is scanned three times: ``<abbr>`` titles become
    raw tags, a ``<span>`` whose ``lang`` ends in "-Latn" or whose class is
    exactly "tr" supplies the romanization, and every ``<span>`` whose
    ``lang`` equals the template's first argument becomes a ``Translation``
    appended to ``word_entry.translations``.

    ``lang_name``, ``sense`` and ``source`` are copied unchanged into each
    ``Translation``.
    """
    # The language code is the first positional argument of the template.
    lang_code = (
        clean_node(wxr, None, t_node.template_parameters.get(1, ""))
        or "unknown"
    )
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # If the expansion still holds a "t"/"t+" template, expand that one and
    # work on its result instead.
    for e_node in expanded_node.find_child(NodeKind.TEMPLATE):
        if e_node.template_name in ["t", "t+"]:
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(e_node), expand_all=True
            )
    lit = clean_node(wxr, None, t_node.template_parameters.get("lit", ""))
    raw_tags = []
    roman = ""
    other = ""
    # Non-empty <abbr title="..."> values are collected as raw tags.
    for abbr_tag in expanded_node.find_html_recursively("abbr"):
        gender = abbr_tag.attrs.get("title", "")
        if gender != "":
            raw_tags.append(gender)
    # First pass over spans: find the romanization.
    for span_tag in expanded_node.find_html_recursively("span"):
        if (
            span_tag.attrs.get("lang", "").endswith("-Latn")
            or span_tag.attrs.get("class", "") == "tr"
        ):
            roman = clean_node(wxr, None, span_tag)
            # NOTE(review): nesting rebuilt from a flattened listing;
            # confirm this split belongs under the romanization match.
            # For "ja", text before the first comma is stored in `other`.
            if lang_code == "ja" and "," in roman:
                other, roman = roman.split(",", maxsplit=1)
                other = other.strip()
                roman = roman.strip()
    # Second pass over spans: each span in the target language is a word.
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "").split()
        if span_tag.attrs.get("lang") == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                tr_data = Translation(
                    word=word,
                    lang=lang_name,
                    lang_code=lang_code,
                    sense=sense,
                    source=source,
                    roman=roman,
                    lit=lit,
                    raw_tags=raw_tags,
                    other=other,
                )
                # Script classes on the span mark the Chinese variant.
                if "Hant" in span_class:
                    tr_data.tags.append("Traditional-Chinese")
                elif "Hans" in span_class:
                    tr_data.tags.append("Simplified-Chinese")
                translate_raw_tags(tr_data)
                word_entry.translations.append(tr_data)

    # Links are cleaned against word_entry with the result discarded;
    # presumably this records category links on the entry (TODO confirm).
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link_node)

184 

185 

def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Follow a "trans-see" template and extract the referenced translations.

    The first argument is the sense gloss.  Arguments 2, 3, ... (read until
    the first missing index) name the pages to look at; when argument 2 is
    absent the gloss itself is used as the only page title.  For every page
    that exists, the first section whose title is in
    ``TRANSLATION_SECTIONS`` is passed to ``extract_translation_section``
    with ``from_trans_see=True`` and the page title as ``source``.

    A page that cannot be loaded is skipped, so the remaining titles are
    still processed.
    """
    sense = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    page_titles = []
    if 2 in t_node.template_parameters:
        for index in count(2):
            if index not in t_node.template_parameters:
                break
            page_titles.append(
                clean_node(wxr, None, t_node.template_parameters[index])
            )
    else:
        page_titles.append(sense)

    for page_title in page_titles:
        # Drop a "#fragment" suffix; only the page name is looked up.
        page_title = page_title.partition("#")[0]
        page = wxr.wtp.get_page(page_title)
        if page is None:
            # One missing page must not discard the titles that follow.
            continue
        root = wxr.wtp.parse(page.body, pre_expand=True)
        target_node = find_subpage_section(wxr, root, TRANSLATION_SECTIONS)
        if target_node is not None:
            extract_translation_section(
                wxr,
                word_entry,
                target_node,
                sense=sense,
                from_trans_see=True,
                source=page_title,
            )

217 

218 

def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_sections: set[str]
) -> WikiNode | None:
    """Return the first level node under ``root`` with a wanted title.

    Level nodes are visited in the order ``find_child_recursively`` yields
    them; a node matches when the cleaned text of its ``largs`` is in
    ``target_sections``.  ``None`` is returned when nothing matches.
    """
    matching_nodes = (
        candidate
        for candidate in root.find_child_recursively(LEVEL_KIND_FLAGS)
        if clean_node(wxr, None, candidate.largs) in target_sections
    )
    # The generator is lazy, so the search stops at the first match.
    return next(matching_nodes, None)

227 

228 

def extract_multitrans_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract translations held in a "multitrans" template.

    The template's ``data`` argument (empty string when absent) is turned
    back into wikitext, parsed, and the parsed tree is processed with
    ``extract_translation_section`` using its default options.
    """
    data_argument = t_node.template_parameters.get("data", "")
    data_wikitext = wxr.wtp.node_to_wikitext(data_argument)
    parsed_data = wxr.wtp.parse(data_wikitext)
    extract_translation_section(wxr, word_entry, parsed_data)

236 

237 

def extract_abbr_tag_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[str]:
    """Return the raw tags carried by an abbreviation template.

    The template is fully expanded, and the cleaned ``title`` attribute of
    every ``<abbr>`` element in the result is collected; empty titles are
    left out.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    cleaned_titles = (
        clean_node(wxr, None, abbr.attrs.get("title", ""))
        for abbr in expanded.find_html_recursively("abbr")
    )
    return [title for title in cleaned_titles if title != ""]