Coverage for src/wiktextract/extractor/zh/translation.py: 87%

119 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1from mediawiki_langcodes import code_to_name, name_to_code 

2from wikitextprocessor.parser import ( 

3 LEVEL_KIND_FLAGS, 

4 LevelNode, 

5 NodeKind, 

6 TemplateNode, 

7 WikiNode, 

8) 

9 

10from ...page import clean_node 

11from ...wxr_context import WiktextractContext 

12from .models import Translation, WordEntry 

13from .section_titles import TRANSLATIONS_TITLES 

14from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags 

15 

16 

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    is_subpage: bool = False,
    source: str = "",
) -> None:
    """Collect translations from the direct children of a section node.

    Only template and list children of `level_node` are inspected:

    * "trans-top"-style templates update the current `sense` from their
      first positional argument, unless a sense was already supplied for a
      subpage.
    * "see translation subpage" and "trans-see" templates pull translations
      from other pages; they are ignored while already reading a subpage.
    * "multitrans" templates have their "data" argument re-parsed as
      wikitext and processed recursively.
    * every list item found below a list child is handed to
      `process_translation_list_item`.

    Results are appended to `word_entry.translations`; nothing is returned.
    """
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if not isinstance(child, TemplateNode):
            # List node: each (possibly nested) item holds one language.
            for list_item in child.find_child_recursively(NodeKind.LIST_ITEM):
                process_translation_list_item(
                    wxr, word_entry, list_item, sense, source
                )
            continue

        template_name = child.template_name.lower()
        if (
            template_name in ("trans-top", "翻譯-頂", "trans-top-also")
            and 1 in child.template_parameters
            # Keep the sense passed in by the caller when reading a subpage.
            and not (sense != "" and is_subpage)
        ):
            sense = clean_node(wxr, None, child.template_parameters.get(1))
        elif template_name == "see translation subpage" and not is_subpage:
            extract_see_trans_subpage_template(wxr, word_entry, child)
        elif (
            template_name in ("trans-see", "翻译-见", "翻譯-見")
            and not is_subpage
        ):
            extract_trans_see_template(wxr, word_entry, child)
        elif template_name == "multitrans":
            wikitext = "".join(
                wxr.wtp.node_to_wikitext(c)
                for c in child.template_parameters.get("data", [])
            )
            multitrans = wxr.wtp.parse(wikitext)
            # Forward `is_subpage` as well: the nested wikitext belongs to
            # the same page, so the same subpage rules must apply to it.
            extract_translation_section(
                wxr,
                word_entry,
                multitrans,
                sense=sense,
                is_subpage=is_subpage,
                source=source,
            )

55 

56 

def _append_translation(word_entry: WordEntry, tr_data: Translation) -> None:
    """Run `translate_raw_tags` on `tr_data` and store a deep copy of it."""
    translate_raw_tags(tr_data)
    word_entry.translations.append(tr_data.model_copy(deep=True))


def process_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    source: str,
) -> None:
    """Extract the translations found in one list item.

    The first non-empty child is read as the language name (the text before
    a colon when it is a plain string, otherwise the cleaned node text).
    Each following translation template ("t", "t+", "tt", "tt+", "t-check",
    "t+check", "l") or link node starts a new translation; qualifier
    templates add raw tags to the translation currently being built.
    Finished translations are appended to `word_entry.translations`.
    """
    # Written as an escape so the fullwidth colon cannot be silently folded
    # into the ASCII one by text normalisation.
    fullwidth_colon = "\uff1a"
    translation_templates = {"t", "t+", "tt", "tt+", "t-check", "t+check", "l"}

    def new_translation(lang: str, lang_code: str) -> Translation:
        return Translation(
            word="", lang=lang, lang_code=lang_code, sense=sense, source=source
        )

    tr_data = new_translation("unknown", "unknown")

    for child_index, child in enumerate(list_item.filter_empty_str_child()):
        if child_index == 0:
            lang_text = ""
            if isinstance(child, str):
                # Prefer the fullwidth colon, fall back to the ASCII one.
                for colon in (fullwidth_colon, ":"):
                    if colon in child:
                        lang_text = child[: child.index(colon)]
                        break
            else:
                lang_text = clean_node(wxr, None, child)
            if len(lang_text) > 0:
                tr_data.lang = lang_text.strip()
                tr_data.lang_code = name_to_code(tr_data.lang, "zh")
        elif isinstance(child, TemplateNode):
            template_name = child.template_name.lower()
            if template_name in translation_templates:
                if len(tr_data.word) > 0:
                    # A previous word is complete: store it and start a new
                    # translation for the same language.
                    _append_translation(word_entry, tr_data)
                    tr_data = new_translation(tr_data.lang, tr_data.lang_code)
                if tr_data.lang_code == "":
                    tr_data.lang_code = child.template_parameters.get(1, "")
                if tr_data.lang == "":
                    tr_data.lang = code_to_name(tr_data.lang_code, "zh")
                tr_data.word = clean_node(
                    wxr, None, child.template_parameters.get(2, "")
                )
                tr_data.roman = clean_node(
                    wxr, None, child.template_parameters.get("tr", "")
                )
                tr_data.alt = clean_node(
                    wxr, None, child.template_parameters.get("alt", "")
                )
                tr_data.lit = clean_node(
                    wxr, None, child.template_parameters.get("lit", "")
                )
                for arg_key, arg_value in child.template_parameters.items():
                    is_tag_arg = (
                        isinstance(arg_key, int) and arg_key >= 3
                    ) or arg_key == "g"  # template "l" uses the "g" arg
                    # Skip arguments that are not plain strings (e.g. ones
                    # holding parsed nodes); they cannot be split.
                    if not is_tag_arg or not isinstance(arg_value, str):
                        continue
                    for tag_arg in arg_value.split("-"):
                        if tag_arg not in TEMPLATE_TAG_ARGS:
                            continue
                        tag = TEMPLATE_TAG_ARGS[tag_arg]
                        if isinstance(tag, str):
                            tr_data.tags.append(tag)
                        elif isinstance(tag, list):
                            tr_data.tags.extend(tag)
            elif template_name == "t-needed":
                # ignore empty translation
                continue
            elif template_name in ("qualifier", "q"):
                raw_tag = clean_node(wxr, None, child)
                tr_data.raw_tags.append(raw_tag.strip("()"))
            else:
                # zh qualifier templates that use template "注释"
                # https://zh.wiktionary.org/wiki/Template:注释
                raw_tag = clean_node(wxr, None, child)
                if raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    tr_data.raw_tags.append(raw_tag.strip("〈〉"))
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            if len(tr_data.word) > 0:
                _append_translation(word_entry, tr_data)
                tr_data = new_translation(tr_data.lang, tr_data.lang_code)
            tr_data.word = clean_node(wxr, None, child)

    if len(tr_data.word) > 0:
        _append_translation(word_entry, tr_data)

158 

159 

def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Follow a "trans-see" template and extract the linked translations.

    The first positional argument is used as the sense. Arguments 2 to 10
    are read as page titles until the first missing index; when argument 2
    is absent the first argument doubles as the page title. For every title
    the page is loaded, its translation section is located and passed to
    `extract_translation_section` with `is_subpage=True`.
    """
    # https://zh.wiktionary.org/wiki/Template:翻譯-見
    params = t_node.template_parameters
    sense = clean_node(wxr, None, params.get(1, ""))

    page_titles = []
    for index in range(2, 11):
        if index not in params:
            break
        page_titles.append(clean_node(wxr, None, params[index]))
    if not page_titles:
        # No explicit target page: the sense text is also the page title.
        page_titles.append(sense)

    for page_title in page_titles:
        if "#" in page_title:
            # Drop the section anchor, only the page name can be looked up.
            page_title = page_title[: page_title.index("#")]
        page = wxr.wtp.get_page(page_title)
        if page is None or page.body is None:
            # A missing (or body-less) page must not stop the remaining
            # titles from being processed.
            continue
        root = wxr.wtp.parse(page.body)
        target_node = find_subpage_section(wxr, root, TRANSLATIONS_TITLES)
        if target_node is not None:
            extract_translation_section(
                wxr,
                word_entry,
                target_node,
                sense=sense,
                is_subpage=True,
                source=page_title,
            )

194 

195 

def extract_see_trans_subpage_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Follow a "see translation subpage" template.

    The first positional argument names the section to look for on the
    subpage. The second, when present, is the subpage title (any "#anchor"
    is dropped); otherwise "<current title>/翻譯" is used. If the named
    section contains a translation section, that nested section is
    extracted; otherwise the named section itself is extracted. Nothing
    happens when the page or the section cannot be found.
    """
    # https://zh.wiktionary.org/wiki/Template:See_translation_subpage
    params = t_node.template_parameters
    target_pos = clean_node(wxr, None, params.get(1, ""))
    if 2 in params:
        subpage_title = clean_node(wxr, None, params.get(2, ""))
        if "#" in subpage_title:
            subpage_title = subpage_title[: subpage_title.index("#")]
    else:
        subpage_title = f"{wxr.wtp.title}/翻譯"

    page = wxr.wtp.get_page(subpage_title)
    if page is None or page.body is None:
        return
    root = wxr.wtp.parse(page.body)

    # Wrap the title in a set: passing the bare string would make the
    # membership test in `find_subpage_section` a substring match.
    target_section = find_subpage_section(wxr, root, {target_pos})
    if target_section is None:
        return

    nested_section = find_subpage_section(
        wxr, target_section, TRANSLATIONS_TITLES
    )
    if nested_section is not None:
        target_section = nested_section

    extract_translation_section(
        wxr,
        word_entry,
        target_section,
        is_subpage=True,
        source=subpage_title,
    )

229 

230 

def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_sections: set[str]
) -> WikiNode | None:
    """Return the first heading node below `root` titled like a target.

    Heading nodes are visited in the order produced by
    `root.find_child_recursively`; a node matches when its cleaned title
    is contained in `target_sections`. `None` is returned when no heading
    matches.
    """
    matching_nodes = (
        heading
        for heading in root.find_child_recursively(LEVEL_KIND_FLAGS)
        if clean_node(wxr, None, heading.largs) in target_sections
    )
    return next(matching_nodes, None)