Coverage for src/wiktextract/extractor/ja/translation.py: 89%

119 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1from mediawiki_langcodes import name_to_code 

2from wikitextprocessor.parser import ( 

3 LEVEL_KIND_FLAGS, 

4 LevelNode, 

5 NodeKind, 

6 TemplateNode, 

7 WikiNode, 

8) 

9 

10from ...page import clean_node 

11from ...wxr_context import WiktextractContext 

12from .models import Translation, WordEntry 

13from .tags import translate_raw_tags 

14 

15 

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    is_subpage: bool = False,
) -> None:
    """Extract translations from a translation section into `word_entry`.

    Walks the template and list children of `level_node`:

    - "trans-top" sets the sense applied to the lists that follow it,
    - "trans-see"/"trans-see2" pull translations in from another page
      (skipped when this section is itself such a referenced page),
    - list items are handed to `process_translation_list_item()`.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose `translations` list receives the results.
        level_node: Section node that contains the translation lists.
        sense: Sense text assigned to translations found before any
            "trans-top"; when non-empty and `is_subpage` is true it is
            kept for the whole section.
        is_subpage: True when the section belongs to a page reached
            through a "trans-see" template.
    """
    # Decide once, from the caller's arguments, whether "trans-top" may
    # replace the sense.  Re-testing the loop-updated `sense` instead would
    # lock in the first "trans-top" of a referenced page that was entered
    # without a sense, and mislabel every later translation table.
    keep_caller_sense = is_subpage and sense != ""
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if (
            isinstance(node, TemplateNode)
            and node.template_name == "trans-top"
            and not keep_caller_sense
        ):
            sense = clean_node(wxr, None, node.template_parameters.get(1, ""))
        elif (
            isinstance(node, TemplateNode)
            and node.template_name in ("trans-see", "trans-see2")
            and not is_subpage
        ):
            extract_trans_see_template(wxr, word_entry, node)
        elif node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                process_translation_list_item(
                    wxr, word_entry, list_item, sense, "", ""
                )

41 

42 

# Templates whose translation is extracted by `process_t_template()`.
_TRANSLATION_TEMPLATES = frozenset(
    {"t+", "t", "t-", "l", "lang", "tø", "t+check", "t-check"}
)
# Templates whose expanded text is added as a raw tag to the translation
# that precedes them.
_RAW_TAG_TEMPLATES = frozenset(
    {"m", "f", "p", "n", "c", "s", "mf", "mpl", "fpl", "npl", "inv"}
)


def process_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense_text: str,
    lang_name: str,
    lang_code: str,
) -> None:
    """Extract the translations of one list item into `word_entry`.

    Everything before the first ":" in the item is read as the language
    name; templates and links after it are read as translations.  Nested
    lists are processed recursively and inherit the language of this item.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose `translations` list receives the results.
        list_item: The list item node to process.
        sense_text: Sense stored on every translation created here.
        lang_name: Language name inherited from the parent item ("" at
            the top level).
        lang_code: Language code inherited from the parent item ("" at
            the top level).
    """
    after_colon = False
    # Most recent translation; raw-tag templates attach to it.
    last_tr: Translation | None = None
    for node_index, node in enumerate(list_item.children):
        if isinstance(node, str) and ":" in node and not after_colon:
            after_colon = True
            # Language name = all nodes before this string plus the part
            # of the string in front of the colon.
            lang_nodes = list_item.children[:node_index]
            lang_nodes.append(node[: node.index(":")])
            new_lang_name = clean_node(wxr, None, lang_nodes)
            new_lang_code = name_to_code(new_lang_name, "ja")
            # Keep the inherited language when this item's label is not
            # a recognised language name.
            if new_lang_code != "" or lang_name == "":
                lang_code = new_lang_code
                lang_name = new_lang_name
        elif isinstance(node, TemplateNode):
            template_name = node.template_name.lower()
            if not after_colon:
                # A template before the colon names the language.
                lang_name = clean_node(wxr, None, node)
                if node.template_name == "T":
                    lang_code = node.template_parameters.get(1, "")
                else:
                    lang_code = node.template_name
            elif template_name in _TRANSLATION_TEMPLATES:
                for tr_data in process_t_template(
                    wxr, word_entry, node, sense_text, lang_name, lang_code
                ):
                    last_tr = tr_data
            elif template_name == "archar":
                tr_data = Translation(
                    word=clean_node(wxr, None, node),
                    sense=sense_text,
                    lang_code=lang_code,
                    lang=lang_name,
                )
                word_entry.translations.append(tr_data)
                last_tr = tr_data
            elif template_name in _RAW_TAG_TEMPLATES and last_tr is not None:
                last_tr.raw_tags.append(clean_node(wxr, None, node))
                translate_raw_tags(last_tr)
            elif template_name == "zh-ts":
                last_tr = process_zh_ts_template(
                    wxr, word_entry, node, sense_text, lang_name, lang_code
                )
        elif (
            isinstance(node, WikiNode)
            and node.kind == NodeKind.LINK
            and after_colon
        ):
            tr_word = clean_node(wxr, None, node)
            if len(tr_word) > 0:
                tr_data = Translation(
                    word=tr_word,
                    sense=sense_text,
                    lang_code=lang_code,
                    lang=lang_name,
                )
                word_entry.translations.append(tr_data)
                last_tr = tr_data
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Visit only the direct items of the nested list: the recursive
            # call handles deeper lists itself.  Searching recursively here
            # as well would process deeper items more than once and emit
            # duplicate translations with the wrong inherited language.
            for nested_list_item in node.find_child(NodeKind.LIST_ITEM):
                process_translation_list_item(
                    wxr,
                    word_entry,
                    nested_list_item,
                    sense_text,
                    lang_name,
                    lang_code,
                )

143 

144 

145T_TAGS = { 

146 "m": "masculine", 

147 "f": "feminine", 

148 "mf": ["masculine", "feminine"], 

149 "n": "neuter", 

150 "c": "common", 

151 "impf": "imperfective", 

152 "pf": "perfective", 

153 "s": "singular", 

154 "p": "plural", 

155} 

156 

157 

def process_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    node: TemplateNode,
    sense_text: str,
    lang_name: str,
    lang_code: str,
) -> list[Translation]:
    """Extract translations from a "t"-style translation template.

    When the second argument contains a "zh-l" template, one translation
    is created per item returned by `extract_zh_l_template()`.  Otherwise
    a single translation is built from the "alt" argument (or, without
    it, the second argument), with the "tr" argument as romanization and
    arguments 3 and 4 as tag codes looked up in `T_TAGS`.

    Every translation created is appended to `word_entry.translations`.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose `translations` list receives the results.
        node: The translation template node.
        sense_text: Sense stored on every translation created here.
        lang_name: Language name stored on the translations.
        lang_code: Language code stored on the translations.

    Returns:
        The translations created, or an empty list when the template
        yields no word.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:t
    params = node.template_parameters
    second_arg = wxr.wtp.parse(wxr.wtp.node_to_wikitext(params.get(2, "")))
    for t_node in second_arg.find_child(NodeKind.TEMPLATE):
        if t_node.template_name == "zh-l":
            from .linkage import extract_zh_l_template

            tr_list = []
            for l_data in extract_zh_l_template(wxr, t_node):
                tr_data = Translation(
                    word=l_data.word,
                    tags=l_data.tags,
                    roman=l_data.roman,
                    # Keep the sense, as every other translation built
                    # in this module does.
                    sense=sense_text,
                    lang=lang_name,
                    lang_code=lang_code,
                )
                tr_list.append(tr_data)
                word_entry.translations.append(tr_data)
            return tr_list

    # "alt" overrides the second argument as the displayed word.
    word_arg = params["alt"] if "alt" in params else params.get(2, "")
    tr_word = clean_node(wxr, None, word_arg)
    roman = clean_node(wxr, None, params.get("tr", ""))
    tags: list[str] = []
    for arg_index in (3, 4):
        if arg_index not in params:
            continue
        tag_arg = clean_node(wxr, None, params[arg_index])
        # Unknown codes map to an empty list and add nothing.
        tag_value = T_TAGS.get(tag_arg, [])
        if isinstance(tag_value, str):
            tags.append(tag_value)
        elif isinstance(tag_value, list):
            tags.extend(tag_value)
    if len(tr_word) > 0:
        tr_data = Translation(
            word=tr_word,
            roman=roman,
            sense=sense_text,
            lang_code=lang_code,
            lang=lang_name,
            tags=tags,
        )
        word_entry.translations.append(tr_data)
        return [tr_data]
    return []

214 

215 

def process_zh_ts_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    node: TemplateNode,
    sense_text: str,
    lang_name: str,
    lang_code: str,
) -> Translation | None:
    """Extract the forms given to a "zh-ts" template.

    Argument 1 is tagged "Traditional-Chinese" and argument 2
    "Simplified-Chinese"; each non-empty one becomes a translation
    appended to `word_entry.translations`.

    Returns:
        The last translation created, or None when both arguments are
        empty.
    """
    # https://ja.wiktionary.org/wiki/テンプレート:zh-ts
    script_by_position = (
        (1, "Traditional-Chinese"),
        (2, "Simplified-Chinese"),
    )
    latest = None
    for position, script_tag in script_by_position:
        form = clean_node(
            wxr, None, node.template_parameters.get(position, "")
        )
        if form == "":
            continue
        latest = Translation(
            word=form,
            sense=sense_text,
            lang_code=lang_code,
            lang=lang_name,
        )
        latest.tags = [script_tag]
        word_entry.translations.append(latest)
    return latest

240 

241 

def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Follow a "trans-see"/"trans-see2" template to its target page.

    The target is taken from argument 2, falling back to argument 1 and
    then to the current page title; text after "#" is used as the id of
    the wanted section.  Argument 1 is also used as the sense.  The
    "翻訳" section of the target page is then extracted into `word_entry`
    with `is_subpage=True`.  Nothing happens when the page or the section
    cannot be found.
    """
    params = t_node.template_parameters
    link_target = params.get(2, params.get(1, wxr.wtp.title))
    # Split "page#id"; without "#" the id stays empty.
    page_title, _, target_id = clean_node(wxr, None, link_target).partition(
        "#"
    )
    sense = clean_node(wxr, None, params.get(1, ""))

    page_body = wxr.wtp.get_page_body(page_title, 0)
    if page_body is None:
        return
    target_node = find_subpage_section(
        wxr, wxr.wtp.parse(page_body), "翻訳", target_id
    )
    if target_node is None:
        return
    extract_translation_section(
        wxr, word_entry, target_node, sense=sense, is_subpage=True
    )

268 

269 

def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_title: str, target_id: str
) -> WikiNode | None:
    """Return the first matching section under `root`, or None.

    A section matches when its cleaned title equals `target_title` and
    either `target_id` is empty or one of the "span" elements returned by
    the section's `find_html()` has that `id` attribute.
    """
    for section in root.find_child_recursively(LEVEL_KIND_FLAGS):
        if clean_node(wxr, None, section.largs) != target_title:
            continue
        if target_id == "":
            return section
        if any(
            span.attrs.get("id", "") == target_id
            for span in section.find_html("span")
        ):
            return section
    return None