Coverage for src/wiktextract/extractor/zh/translation.py: 89%

102 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1from mediawiki_langcodes import code_to_name, name_to_code 

2from wikitextprocessor.parser import ( 

3 LEVEL_KIND_FLAGS, 

4 NodeKind, 

5 TemplateNode, 

6 WikiNode, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from .models import Translation, WordEntry 

12from .section_titles import TRANSLATIONS_TITLES 

13from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags 

14 

15 

def extract_translation(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    sense: str = "",
    is_subpage: bool = False,
) -> None:
    """Collect translations found directly under ``level_node``.

    Walks the template and list children of ``level_node``:

    * a "trans-top"-style template with a first argument sets the sense
      used for the translations that follow it;
    * a "see translation subpage"/"trans-see" template is followed via
      ``translation_subpage`` unless ``is_subpage`` is true;
    * a "multitrans" template has its ``data`` argument re-parsed and
      processed recursively;
    * every list item below a list child is handed to
      ``process_translation_list_item``.

    Results are appended to ``page_data[-1].translations`` by the helpers
    called from here; nothing is returned.
    """
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if isinstance(child, TemplateNode):
            template_name = child.template_name.lower()
            if (
                template_name in {"trans-top", "翻譯-頂", "trans-top-also"}
                and 1 in child.template_parameters
            ):
                sense = clean_node(wxr, None, child.template_parameters.get(1))
            elif (
                template_name in {"see translation subpage", "trans-see"}
                and not is_subpage
            ):
                translation_subpage(wxr, page_data, child)
            elif template_name == "multitrans":
                wikitext = "".join(
                    wxr.wtp.node_to_wikitext(c)
                    for c in child.template_parameters.get("data", [])
                )
                multitrans = wxr.wtp.parse(wikitext)
                # Keep the subpage flag for the nested content so that a
                # subpage link wrapped in "multitrans" is not followed
                # again while a subpage is already being processed.
                extract_translation(
                    wxr, page_data, multitrans, sense, is_subpage
                )
        else:
            for list_item in child.find_child_recursively(NodeKind.LIST_ITEM):
                process_translation_list_item(
                    wxr,
                    page_data,
                    list_item,
                    sense,
                )

51 

52 

def _flush_translation(
    page_data: list[WordEntry],
    tr_data: Translation,
    sense: str,
) -> Translation:
    """Store ``tr_data`` if it has a word; return the record to fill next."""
    if not tr_data.word:
        return tr_data
    # Apply the same raw-tag post-processing to every stored translation,
    # not only to the last one of the list item.
    translate_raw_tags(tr_data)
    page_data[-1].translations.append(tr_data.model_copy(deep=True))
    return Translation(
        word="",
        lang=tr_data.lang,
        lang_code=tr_data.lang_code,
        sense=sense,
    )


def process_translation_list_item(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_item: WikiNode,
    sense: str,
) -> None:
    """Extract the translations contained in one list item.

    The first non-empty child supplies the language name: the text before
    the colon when it is a string, otherwise the cleaned node text. Each
    following translation template ("t", "t+", "tt", "tt+", "t-check",
    "t+check", "l") or link node starts a new translation; qualifier
    templates add raw tags to the translation currently being built.
    Finished translations are appended to ``page_data[-1].translations``.
    """
    tr_data = Translation(
        word="", sense=sense, lang="unknown", lang_code="unknown"
    )

    for child_index, child in enumerate(list_item.filter_empty_str_child()):
        if child_index == 0:
            lang_text = ""
            if isinstance(child, str):
                # NOTE(review): both colon checks were identical here, which
                # made the second one unreachable; assumed the first was
                # meant to be the full-width colon - confirm.
                for colon in ("：", ":"):
                    if colon in child:
                        lang_text = child[: child.index(colon)]
                        break
            else:
                lang_text = clean_node(wxr, None, child)
            if lang_text:
                tr_data.lang = lang_text.strip()
                tr_data.lang_code = name_to_code(tr_data.lang, "zh")
        elif isinstance(child, TemplateNode):
            template_name = child.template_name.lower()
            if template_name in {
                "t",
                "t+",
                "tt",
                "tt+",
                "t-check",
                "t+check",
                "l",
            }:
                tr_data = _flush_translation(page_data, tr_data, sense)
                # Fall back to the template's own language code when the
                # language name could not be mapped to a code.
                if tr_data.lang_code == "":
                    tr_data.lang_code = child.template_parameters.get(1, "")
                if tr_data.lang == "":
                    tr_data.lang = code_to_name(tr_data.lang_code, "zh")
                tr_data.word = clean_node(
                    wxr, None, child.template_parameters.get(2, "")
                )
                tr_data.roman = clean_node(
                    wxr, None, child.template_parameters.get("tr", "")
                )
                tr_data.alt = clean_node(
                    wxr, None, child.template_parameters.get("alt", "")
                )
                tr_data.lit = clean_node(
                    wxr, None, child.template_parameters.get("lit", "")
                )
                for arg_key, arg_value in child.template_parameters.items():
                    is_tag_arg = (
                        isinstance(arg_key, int) and arg_key >= 3
                    ) or arg_key == "g"  # template "l" uses the "g" arg
                    # Arguments holding nodes instead of plain text cannot
                    # be split into tag codes; skip them.
                    if not is_tag_arg or not isinstance(arg_value, str):
                        continue
                    for tag_arg in arg_value.split("-"):
                        if tag_arg in TEMPLATE_TAG_ARGS:
                            tag = TEMPLATE_TAG_ARGS[tag_arg]
                            if isinstance(tag, str):
                                tr_data.tags.append(tag)
                            elif isinstance(tag, list):
                                tr_data.tags.extend(tag)
            elif template_name == "t-needed":
                # ignore empty translation
                continue
            elif template_name in ("qualifier", "q"):
                raw_tag = clean_node(wxr, None, child)
                tr_data.raw_tags.append(raw_tag.strip("()"))
            else:
                # zh qualifier templates that use template "注释"
                # https://zh.wiktionary.org/wiki/Template:注释
                raw_tag = clean_node(wxr, None, child)
                if raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    tr_data.raw_tags.append(raw_tag.strip("〈〉"))
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            tr_data = _flush_translation(page_data, tr_data, sense)
            tr_data.word = clean_node(wxr, None, child)

    # Store whatever translation is still pending at the end of the item.
    _flush_translation(page_data, tr_data, sense)

151 

152 

def translation_subpage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: TemplateNode,
) -> None:
    """Extract translations from the page a subpage template points to.

    The page title is taken from the template's second argument (any
    ``#section`` suffix is removed) and defaults to the current page
    title; when it equals the current title, the ``/翻譯`` subpage of the
    current page is loaded instead. For "see translation subpage" the
    first argument names the section to look for. If no matching section
    is found, the whole subpage is processed. Returns without doing
    anything when the page does not exist.
    """
    # https://zh.wiktionary.org/wiki/Template:翻譯-見
    # https://zh.wiktionary.org/wiki/Template:See_translation_subpage

    target_section = None
    # The caller dispatches on the lower-cased template name, so compare
    # the same way here; otherwise "See translation subpage" would reach
    # this function but have its section argument ignored.
    if template_node.template_name.lower() == "see translation subpage":
        target_section = template_node.template_parameters.get(1)
    page_title = clean_node(
        wxr, None, template_node.template_parameters.get(2, wxr.wtp.title)
    )
    if "#" in page_title:
        page_title = page_title[: page_title.index("#")]

    translation_subpage_title = page_title
    if page_title == wxr.wtp.title:
        translation_subpage_title = f"{page_title}/翻譯"
    subpage = wxr.wtp.get_page(translation_subpage_title)
    if subpage is None:
        return

    root = wxr.wtp.parse(subpage.body)
    # Fall back to the whole parsed page when no section matches.
    target_section_node = (
        find_subpage_section(wxr, root, target_section) or root
    )
    if target_section_node is not None:
        extract_translation(
            wxr, page_data, target_section_node, is_subpage=True
        )

186 

187 

def find_subpage_section(
    wxr: WiktextractContext,
    node: WikiNode | str,
    target_section: str | None = None,
) -> WikiNode | None:
    """Return the first matching section node below ``node``, if any.

    Level nodes are visited in the order ``find_child_recursively`` yields
    them. A node matches when its cleaned title equals ``target_section``
    (only checked when that is a string) or is one of
    ``TRANSLATIONS_TITLES``. Returns ``None`` when ``node`` is not a
    ``WikiNode`` or nothing matches.
    """
    if not isinstance(node, WikiNode):
        return None

    has_target = isinstance(target_section, str)
    for section in node.find_child_recursively(LEVEL_KIND_FLAGS):
        title = clean_node(wxr, None, section.largs)
        if (
            has_target and title == target_section
        ) or title in TRANSLATIONS_TITLES:
            return section
    return None