Coverage for src/wiktextract/extractor/ja/translation.py: 92%

83 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from typing import Optional 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Translation, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect the translations of one section into ``word_entry``.

    The template and list children of ``level_node`` are visited in
    document order. A ``trans-top`` template sets the sense text (its first
    positional argument, cleaned) that applies to the lists following it;
    every item of every list is handed to ``process_translation_list_item``
    with an empty starting language name and code.
    """
    current_sense = ""
    wanted_kinds = NodeKind.TEMPLATE | NodeKind.LIST
    for child in level_node.find_child(wanted_kinds):
        is_trans_top = (
            isinstance(child, TemplateNode)
            and child.template_name == "trans-top"
        )
        if is_trans_top:
            gloss_arg = child.template_parameters.get(1, "")
            current_sense = clean_node(wxr, None, gloss_arg)
            continue
        if child.kind != NodeKind.LIST:
            # Any other template: nothing to extract.
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            process_translation_list_item(
                wxr, word_entry, item, current_sense, "", ""
            )

28 

29 

# Templates that wrap one translated word and are parsed by
# ``process_t_template``.
_WORD_TEMPLATES = frozenset({"t+", "t", "t-", "l", "lang"})

# Templates whose expanded text is stored as a raw tag on the translation
# that precedes them in the same list item.
_RAW_TAG_TEMPLATES = frozenset(
    {"m", "f", "p", "n", "c", "s", "mf", "mpl", "fpl", "npl", "inv"}
)


def process_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense_text: str,
    lang_name: str,
    lang_code: str,
) -> None:
    """Extract the translations found in one list item.

    The item is read left to right. Everything before the first ``:`` in a
    plain-text child is the language name; a template met before that colon
    also sets the language (name from its expanded text, code from its
    first argument for ``T`` or from the template name otherwise). After
    the colon, translation templates, ``zh-ts``, ``archar`` and wiki links
    each append ``Translation`` objects to ``word_entry.translations``.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry that receives the translations.
        list_item: The list item node to scan.
        sense_text: Sense gloss attached to every translation created here.
        lang_name: Language name inherited from the parent item; kept when
            this item's own name cannot be resolved to a code.
        lang_code: Language code inherited from the parent item.
    """
    seen_colon = False
    last_tr: Optional[Translation] = None
    for node_index, node in enumerate(list_item.children):
        if isinstance(node, str) and ":" in node and not seen_colon:
            seen_colon = True
            # Language name = all nodes before the colon plus the text
            # preceding it inside this string.
            lang_nodes = list_item.children[:node_index]
            lang_nodes.append(node[: node.index(":")])
            new_lang_name = clean_node(wxr, None, lang_nodes)
            new_lang_code = name_to_code(new_lang_name, "ja")
            # Keep the inherited language when the new name has no known
            # code, unless nothing was inherited at all.
            if new_lang_code != "" or lang_name == "":
                lang_code = new_lang_code
                lang_name = new_lang_name
        elif isinstance(node, TemplateNode):
            template_name = node.template_name.lower()
            if not seen_colon:
                lang_name = clean_node(wxr, None, node)
                # The comparison with "T" is deliberately case-sensitive.
                if node.template_name == "T":
                    lang_code = node.template_parameters.get(1, "")
                else:
                    lang_code = node.template_name
            elif template_name in _WORD_TEMPLATES:
                last_tr = process_t_template(
                    wxr, word_entry, node, sense_text, lang_name, lang_code
                )
            elif template_name == "archar":
                tr_data = Translation(
                    word=clean_node(wxr, None, node),
                    sense=sense_text,
                    lang_code=lang_code,
                    lang=lang_name,
                )
                word_entry.translations.append(tr_data)
                last_tr = tr_data
            elif template_name in _RAW_TAG_TEMPLATES and last_tr is not None:
                last_tr.raw_tags.append(clean_node(wxr, None, node))
                translate_raw_tags(last_tr)
            elif template_name == "zh-ts":
                last_tr = process_zh_ts_template(
                    wxr, word_entry, node, sense_text, lang_name, lang_code
                )
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LINK
            and seen_colon
        ):
            tr_word = clean_node(wxr, None, node)
            if tr_word:
                tr_data = Translation(
                    word=tr_word,
                    sense=sense_text,
                    lang_code=lang_code,
                    lang=lang_name,
                )
                word_entry.translations.append(tr_data)
                last_tr = tr_data
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Visit only the direct items of the nested list. The recursive
            # call descends into deeper lists itself, so walking all
            # descendants here would process items nested three or more
            # levels deep twice, once with the wrong inherited language.
            for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
                process_translation_list_item(
                    wxr,
                    word_entry,
                    nested_list_item,
                    sense_text,
                    lang_name,
                    lang_code,
                )

120 

121 

# Short codes looked up by ``process_t_template`` for positional arguments
# 3 and 4 of a translation template. Each value is either a single tag or a
# list of tags; codes missing from this table contribute no tag.
T_TAGS = {
    # gender
    "m": "masculine",
    "f": "feminine",
    "mf": ["masculine", "feminine"],
    "n": "neuter",
    "c": "common",
    # aspect
    "impf": "imperfective",
    "pf": "perfective",
    # number
    "s": "singular",
    "p": "plural",
}

133 

134 

def process_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    node: TemplateNode,
    sense_text: str,
    lang_name: str,
    lang_code: str,
) -> Optional[Translation]:
    """Build a translation from one translation template.

    Template documentation: https://ja.wiktionary.org/wiki/テンプレート:t

    The word is positional argument 2, replaced by ``alt`` when that
    argument cleans to non-empty text. ``tr`` supplies the romanization, and
    positional arguments 3 and 4 are mapped to tags through ``T_TAGS``.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry whose ``translations`` list receives the result.
        node: The template node to read.
        sense_text: Sense gloss stored on the translation.
        lang_name: Language name stored on the translation.
        lang_code: Language code stored on the translation.

    Returns:
        The appended ``Translation``, or ``None`` when the template yields
        no word.
    """
    params = node.template_parameters
    tr_word = clean_node(wxr, None, params.get(2, ""))
    if "alt" in params:
        alt_word = clean_node(wxr, None, params["alt"])
        # A blank ``alt=`` must not wipe out the real word from argument 2;
        # doing so would silently drop the whole translation.
        if alt_word:
            tr_word = alt_word
    roman = clean_node(wxr, None, params.get("tr", ""))

    tags = []
    for arg_index in (3, 4):
        if arg_index not in params:
            continue
        tag_arg = clean_node(wxr, None, params[arg_index])
        tag_value = T_TAGS.get(tag_arg, [])
        if isinstance(tag_value, str):
            tags.append(tag_value)
        elif isinstance(tag_value, list):
            tags.extend(tag_value)

    if not tr_word:
        return None
    tr_data = Translation(
        word=tr_word,
        roman=roman,
        sense=sense_text,
        lang_code=lang_code,
        lang=lang_name,
        tags=tags,
    )
    word_entry.translations.append(tr_data)
    return tr_data

171 

172 

def process_zh_ts_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    node: TemplateNode,
    sense_text: str,
    lang_name: str,
    lang_code: str,
) -> Optional[Translation]:
    """Build translations from a ``zh-ts`` template.

    Template documentation: https://ja.wiktionary.org/wiki/テンプレート:zh-ts

    Positional argument 1 is tagged "Traditional Chinese" and argument 2
    "Simplified Chinese". Each non-empty argument becomes its own
    ``Translation`` appended to ``word_entry.translations``.

    Returns:
        The last translation created, or ``None`` when both arguments are
        empty.
    """
    script_tags = ((1, "Traditional Chinese"), (2, "Simplified Chinese"))
    latest: Optional[Translation] = None
    for position, script_tag in script_tags:
        raw_form = node.template_parameters.get(position, "")
        form = clean_node(wxr, None, raw_form)
        if form == "":
            continue
        latest = Translation(
            word=form,
            sense=sense_text,
            lang_code=lang_code,
            lang=lang_name,
        )
        latest.tags = [script_tag]
        word_entry.translations.append(latest)
    return latest
196 return tr_data