Coverage for src/wiktextract/extractor/vi/translation.py: 66%

105 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1from itertools import count 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .linkage import QUALIFIER_TEMPALTES, extract_qualifier_template 

15from .models import Translation, WordEntry 

16from .section_titles import TRANSLATION_SECTIONS 

17from .tags import translate_raw_tags 

18 

19 

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    from_trans_see: bool = False,
    source: str = "",
):
    """Walk the direct children of a translation section and collect data.

    Template children are dispatched by name:

    * ``trans-top`` replaces the current ``sense`` with its first argument,
      unless a non-empty ``sense`` was supplied by a ``trans-see`` caller.
    * ``trans-see`` is followed to the referenced page(s), but only when
      this call is not itself the result of following a ``trans-see``.
    * ``multitrans`` has its ``data`` argument processed as a nested
      translation section.

    List children have every list item handed to
    ``extract_translation_list_item`` together with the current ``sense``
    and ``source``.  All other children are ignored.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            # De Morgan form of ``not (sense != "" and from_trans_see)``:
            # a sense passed down from ``trans-see`` is never overwritten.
            if name == "trans-top" and (sense == "" or not from_trans_see):
                gloss_arg = child.template_parameters.get(1, "")
                sense = clean_node(wxr, None, gloss_arg)
                # Second pass with the entry attached; presumably lets
                # clean_node record side data (e.g. categories) -- confirm.
                clean_node(wxr, word_entry, child)
            elif name == "trans-see" and not from_trans_see:
                extract_trans_see_template(wxr, word_entry, child)
            elif name == "multitrans":
                extract_multitrans_template(wxr, word_entry, child)
            continue

        if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            extract_translation_list_item(
                wxr, word_entry, item, sense, source
            )

46 

47 

def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    source: str,
):
    """Extract the translations found in one list item.

    The language name is taken from everything that precedes the first
    ``:`` in the item (earlier child nodes cleaned to text, plus the text
    before the colon in the string node that contains it).  The language
    code is then looked up from that name with ``name_to_code``.

    After that, each child is handled by type:

    * translation templates (``t``, ``t-``, ``t+``, ``t2``, ``t2+``,
      ``tt+``) go to ``extract_t_template``;
    * a plain link becomes a ``Translation`` once a language name is known;
    * a nested list is processed recursively, item by item;
    * a qualifier template adds raw tags to the most recently appended
      translation, if there is one.
    """
    t_template_names = ("t", "t-", "t+", "t2", "t2+", "tt+")
    lang_name = "unknown"
    lang_code = "unknown"

    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            # Only the first string holding a colon names the language.
            if ":" not in child or lang_name != "unknown":
                continue
            leading_text = clean_node(
                wxr, None, list_item.children[:position]
            )
            before_colon = child.partition(":")[0].strip()
            lang_name = (leading_text + before_colon) or "unknown"
            if lang_name != "unknown":
                lang_code = name_to_code(lang_name, "vi") or "unknown"
        elif isinstance(child, TemplateNode):
            if child.template_name in t_template_names:
                extract_t_template(
                    wxr, word_entry, child, lang_name, sense, source
                )
            elif (
                child.template_name in QUALIFIER_TEMPALTES
                and word_entry.translations
            ):
                # Qualifiers describe the translation that was just added.
                latest = word_entry.translations[-1]
                latest.raw_tags.extend(extract_qualifier_template(wxr, child))
                translate_raw_tags(latest)
        elif isinstance(child, WikiNode):
            if child.kind == NodeKind.LINK and lang_name != "unknown":
                word = clean_node(wxr, None, child)
                if word != "":
                    word_entry.translations.append(
                        Translation(
                            word=word,
                            lang=lang_name,
                            lang_code=lang_code,
                            sense=sense,
                            source=source,
                        )
                    )
            elif child.kind == NodeKind.LIST:
                for sub_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_translation_list_item(
                        wxr, word_entry, sub_item, sense, source
                    )

104 

105 

def extract_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    lang_name: str,
    sense: str,
    source: str,
) -> None:
    """Expand one translation template and append what it yields.

    The language code is the template's first argument (``"unknown"`` when
    empty).  The template is expanded and the resulting HTML is scanned:

    * ``<abbr title=...>`` titles become raw tags;
    * a ``<span>`` whose ``lang`` ends in ``-Latn`` or whose ``class`` is
      exactly ``tr`` supplies the romanization (the last such span wins);
      for Japanese, text before the first comma is stored as ``other``;
    * every ``<span>`` whose ``lang`` equals the language code and whose
      text is non-empty produces a ``Translation``.

    Finally each top-level link in the expansion is passed to
    ``clean_node`` together with ``word_entry``.
    """
    code_arg = t_node.template_parameters.get(1, "")
    lang_code = clean_node(wxr, None, code_arg) or "unknown"

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # If the first expansion still contains a "t"/"t+" template, expand
    # that one instead.  The loop keeps walking the children of the FIRST
    # expansion even after ``expanded_node`` is rebound.
    for inner in expanded_node.find_child(NodeKind.TEMPLATE):
        if inner.template_name in ("t", "t+"):
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(inner), expand_all=True
            )

    lit = clean_node(wxr, None, t_node.template_parameters.get("lit", ""))

    raw_tags = [
        title
        for abbr in expanded_node.find_html_recursively("abbr")
        if (title := abbr.attrs.get("title", "")) != ""
    ]

    roman = ""
    other = ""
    for span in expanded_node.find_html_recursively("span"):
        is_latin = span.attrs.get("lang", "").endswith("-Latn")
        if not (is_latin or span.attrs.get("class", "") == "tr"):
            continue
        roman = clean_node(wxr, None, span)
        if lang_code == "ja" and "," in roman:
            # Split at the first comma only: "<other>, <roman>".
            head, _, tail = roman.partition(",")
            other = head.strip()
            roman = tail.strip()

    for span in expanded_node.find_html_recursively("span"):
        css_classes = span.attrs.get("class", "").split()
        if span.attrs.get("lang") != lang_code:
            continue
        word = clean_node(wxr, None, span)
        if word == "":
            continue
        tr_data = Translation(
            word=word,
            lang=lang_name,
            lang_code=lang_code,
            sense=sense,
            source=source,
            roman=roman,
            lit=lit,
            raw_tags=raw_tags,
            other=other,
        )
        # Script classes on the span distinguish the two Chinese scripts.
        if "Hant" in css_classes:
            tr_data.tags.append("Traditional-Chinese")
        elif "Hans" in css_classes:
            tr_data.tags.append("Simplified-Chinese")
        translate_raw_tags(tr_data)
        word_entry.translations.append(tr_data)

    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)

169 

170 

def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Follow a ``trans-see`` template and pull in the linked translations.

    The first template argument is the sense.  Arguments 2, 3, ... (read
    consecutively until the first missing index) are the page titles to
    look at; when argument 2 is absent the sense text itself is used as
    the single page title.  Any ``#fragment`` is stripped from a title
    before the page is looked up.

    For each page that exists, the first section whose title is in
    ``TRANSLATION_SECTIONS`` is extracted with ``from_trans_see=True`` and
    the page title recorded as the translation ``source``.

    A title whose page cannot be found is skipped; the remaining titles
    are still processed.
    """
    params = t_node.template_parameters
    sense = clean_node(wxr, None, params.get(1, ""))

    page_titles = []
    if 2 in params:
        for index in count(2):
            if index not in params:
                break
            page_titles.append(clean_node(wxr, None, params[index]))
    else:
        page_titles.append(sense)

    for page_title in page_titles:
        if "#" in page_title:
            page_title = page_title[: page_title.index("#")]
        page = wxr.wtp.get_page(page_title)
        if page is None:
            # Skip only this title: one missing page must not discard the
            # translations available on the other referenced pages.
            continue
        root = wxr.wtp.parse(page.body, pre_expand=True)
        target_node = find_subpage_section(wxr, root, TRANSLATION_SECTIONS)
        if target_node is not None:
            extract_translation_section(
                wxr,
                word_entry,
                target_node,
                sense=sense,
                from_trans_see=True,
                source=page_title,
            )

202 

203 

def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_sections: set[str]
) -> WikiNode | None:
    """Return the first heading node under ``root`` with a wanted title.

    Heading nodes are visited in the order produced by
    ``root.find_child_recursively(LEVEL_KIND_FLAGS)``; a node matches when
    the cleaned text of its ``largs`` is a member of ``target_sections``.
    ``None`` is returned when no heading matches.
    """
    # Lazy generator: titles are cleaned only until the first match.
    matching_nodes = (
        level_node
        for level_node in root.find_child_recursively(LEVEL_KIND_FLAGS)
        if clean_node(wxr, None, level_node.largs) in target_sections
    )
    return next(matching_nodes, None)

212 

213 

def extract_multitrans_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Process the ``data`` argument of a ``multitrans`` template.

    The argument (empty string when absent) is converted back to wikitext,
    parsed, and the resulting tree is handed to
    ``extract_translation_section`` with that function's default
    ``sense``, ``from_trans_see`` and ``source`` values.
    """
    data_arg = t_node.template_parameters.get("data", "")
    data_wikitext = wxr.wtp.node_to_wikitext(data_arg)
    data_root = wxr.wtp.parse(data_wikitext)
    extract_translation_section(wxr, word_entry, data_root)

220 extract_translation_section(wxr, word_entry, arg)