Coverage for src/wiktextract/extractor/zh/translation.py: 88%

100 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from typing import Optional, Union 

2 

3from mediawiki_langcodes import code_to_name, name_to_code 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .models import Translation, WordEntry 

14from .section_titles import TRANSLATIONS_TITLES 

15from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags 

16 

17 

def extract_translation(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    sense: str = "",
    is_subpage: bool = False,
) -> None:
    """Walk a translation section and collect its translations.

    Only direct template and list children of ``level_node`` are visited:

    * ``trans-top`` / ``翻譯-頂`` / ``trans-top-also`` with a first
      positional argument set the sense used for the entries that follow.
    * ``see translation subpage`` / ``trans-see`` are resolved through
      ``translation_subpage`` unless ``is_subpage`` is true.
    * ``multitrans`` has its ``data`` argument converted back to wikitext,
      re-parsed and processed recursively.
    * Every list item found (recursively) inside a list child is handed to
      ``process_translation_list_item``.

    Args:
        wxr: Extraction context; ``wxr.wtp`` is used to re-parse wikitext.
        page_data: Word entries of the page; translations are added by the
            helpers this function calls.
        level_node: Node whose children are scanned.
        sense: Sense text applied until a ``trans-top``-like template
            replaces it.
        is_subpage: When true, subpage-reference templates are not followed.
    """
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if isinstance(child, TemplateNode):
            template_name = child.template_name.lower()
            if (
                template_name in {"trans-top", "翻譯-頂", "trans-top-also"}
                and 1 in child.template_parameters
            ):
                sense = clean_node(wxr, None, child.template_parameters.get(1))
            elif (
                template_name in {"see translation subpage", "trans-see"}
                and not is_subpage
            ):
                translation_subpage(wxr, page_data, child)
            elif template_name == "multitrans":
                wikitext = "".join(
                    wxr.wtp.node_to_wikitext(c)
                    for c in child.template_parameters.get("data", [])
                )
                multitrans = wxr.wtp.parse(wikitext)
                # Forward ``is_subpage`` so that a "multitrans" block inside
                # a subpage cannot follow subpage references again.
                extract_translation(
                    wxr,
                    page_data,
                    multitrans,
                    sense,
                    is_subpage=is_subpage,
                )
        else:
            for list_item in child.find_child_recursively(NodeKind.LIST_ITEM):
                process_translation_list_item(
                    wxr,
                    page_data,
                    list_item,
                    sense,
                )

53 

54 

def process_translation_list_item(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_item: WikiNode,
    sense: str,
) -> None:
    """Turn one translation list item into ``Translation`` entries.

    The first non-empty child provides the language name: for a string it is
    the text before the first colon, otherwise the cleaned text of the node.
    Each following translation template (``t``, ``t+``, ``tt``, ``tt+``,
    ``t-check``, ``t+check``, ``l``) or link node starts a new entry; the
    entry being built is appended to ``page_data[-1].translations`` before
    the next one starts and once more after the last child.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        page_data: Word entries of the page; the last one receives the
            translations.
        list_item: The list item node to process.
        sense: Sense text stored on every entry created here.
    """
    tr_data = Translation(word="", sense=sense)

    for child_index, child in enumerate(list_item.filter_empty_str_child()):
        if child_index == 0:
            lang_text = ""
            if isinstance(child, str):
                # The language label may end in a fullwidth or an ASCII
                # colon; the fullwidth form is tried first.
                for colon in ("：", ":"):
                    if colon in child:
                        lang_text = child[: child.index(colon)]
                        break
            else:
                lang_text = clean_node(wxr, None, child)
            if len(lang_text) > 0:
                tr_data.lang = lang_text.strip()
                tr_data.lang_code = name_to_code(tr_data.lang, "zh")
        elif isinstance(child, TemplateNode):
            template_name = child.template_name.lower()
            if template_name in {
                "t",
                "t+",
                "tt",
                "tt+",
                "t-check",
                "t+check",
                "l",
            }:
                if len(tr_data.word) > 0:
                    # NOTE(review): entries flushed here do not go through
                    # translate_raw_tags, unlike the final entry below;
                    # confirm whether that is intended.
                    page_data[-1].translations.append(
                        tr_data.model_copy(deep=True)
                    )
                    tr_data = Translation(
                        word="",
                        lang=tr_data.lang,
                        lang_code=tr_data.lang_code,
                        sense=sense,
                    )
                # Fall back to the template's own language argument when the
                # list item label did not yield a code or a name.
                if tr_data.lang_code == "":
                    tr_data.lang_code = child.template_parameters.get(1, "")
                if tr_data.lang == "":
                    tr_data.lang = code_to_name(tr_data.lang_code, "zh")
                tr_data.word = clean_node(
                    wxr, None, child.template_parameters.get(2, "")
                )
                tr_data.roman = clean_node(
                    wxr, None, child.template_parameters.get("tr", "")
                )
                tr_data.alt = clean_node(
                    wxr, None, child.template_parameters.get("alt", "")
                )
                tr_data.lit = clean_node(
                    wxr, None, child.template_parameters.get("lit", "")
                )
                for arg_key, arg_value in child.template_parameters.items():
                    if (
                        isinstance(arg_key, int) and arg_key >= 3
                    ) or arg_key == "g":  # template "l" uses the "g" arg
                        if not isinstance(arg_value, str):
                            # Non-string argument values have no ``split``;
                            # skip them instead of raising.
                            continue
                        for tag_arg in arg_value.split("-"):
                            if tag_arg in TEMPLATE_TAG_ARGS:
                                tr_data.tags.append(TEMPLATE_TAG_ARGS[tag_arg])

            elif template_name == "t-needed":
                # ignore empty translation
                continue
            elif template_name in ("qualifier", "q"):
                raw_tag = clean_node(wxr, None, child)
                tr_data.raw_tags.append(raw_tag.strip("()"))
            else:
                # zh qualifier templates that use template "注释"
                # https://zh.wiktionary.org/wiki/Template:注释
                raw_tag = clean_node(wxr, None, child)
                if raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    tr_data.raw_tags.append(raw_tag.strip("〈〉"))
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            if len(tr_data.word) > 0:
                page_data[-1].translations.append(tr_data.model_copy(deep=True))
                tr_data = Translation(
                    word="",
                    lang=tr_data.lang,
                    lang_code=tr_data.lang_code,
                    sense=sense,
                )
            tr_data.word = clean_node(wxr, None, child)

    if len(tr_data.word) > 0:
        translate_raw_tags(tr_data)
        page_data[-1].translations.append(tr_data.model_copy(deep=True))

147 

148 

def translation_subpage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: TemplateNode,
) -> None:
    """Extract translations from the page a subpage template points to.

    The page title is taken from the template's second positional argument
    (default: the current page title) with any ``#fragment`` removed. When it
    equals the current title, ``<title>/翻譯`` is loaded instead. For the
    "see translation subpage" template the first positional argument names
    the section to search in; otherwise the whole page is searched. Nothing
    is added when the page or a translation section cannot be found.

    Args:
        wxr: Extraction context; ``wxr.wtp`` loads and parses the page.
        page_data: Word entries of the page being extracted.
        template_node: The subpage-reference template.
    """
    # https://zh.wiktionary.org/wiki/Template:翻譯-見
    # https://zh.wiktionary.org/wiki/Template:See_translation_subpage

    target_section = None
    # Compare case-insensitively: the caller dispatches on the lowercased
    # template name, so a capitalised name must be recognised here as well.
    if template_node.template_name.lower() == "see translation subpage":
        target_section = template_node.template_parameters.get(1)
    page_title = clean_node(
        wxr, None, template_node.template_parameters.get(2, wxr.wtp.title)
    )
    if "#" in page_title:
        page_title = page_title[: page_title.index("#")]

    translation_subpage_title = page_title
    if page_title == wxr.wtp.title:
        translation_subpage_title = f"{page_title}/翻譯"
    subpage = wxr.wtp.get_page(translation_subpage_title)
    if subpage is None:
        return

    root = wxr.wtp.parse(subpage.body, pre_expand=True)
    target_section_node = (
        root
        if target_section is None
        else find_subpage_section(wxr, root, target_section)
    )
    translation_node = find_subpage_section(wxr, target_section_node)
    if translation_node is not None:
        # is_subpage=True stops subpage references from being followed again.
        extract_translation(wxr, page_data, translation_node, is_subpage=True)

183 

184 

def find_subpage_section(
    wxr: WiktextractContext,
    node: Union[WikiNode, str],
    target_section: Union[str, None] = None,
) -> Optional[WikiNode]:
    """Return the first matching level node found below ``node``.

    Level nodes are visited in the order produced by
    ``find_child_recursively``. A node matches when its cleaned title equals
    ``target_section`` (only if that is a string) or is one of
    ``TRANSLATIONS_TITLES``. ``None`` is returned when ``node`` is not a
    ``WikiNode`` or when nothing matches.
    """
    if isinstance(node, WikiNode):
        has_named_target = isinstance(target_section, str)
        for candidate in node.find_child_recursively(LEVEL_KIND_FLAGS):
            title = clean_node(wxr, None, candidate.largs)
            if (
                has_named_target and title == target_section
            ) or title in TRANSLATIONS_TITLES:
                return candidate
    return None