Coverage for src/wiktextract/extractor/vi/translation.py: 65%

1from itertools import count

3from mediawiki_langcodes import name_to_code

4from wikitextprocessor.parser import (

5 LEVEL_KIND_FLAGS,

6 LevelNode,

7 NodeKind,

8 TemplateNode,

9 WikiNode,

10)

12from ...page import clean_node

13from ...wxr_context import WiktextractContext

14from .linkage import QUALIFIER_TEMPLATES, extract_qualifier_template

15from .models import Translation, WordEntry

16from .section_titles import TRANSLATION_SECTIONS

17from .tags import translate_raw_tags

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    sense: str = "",
    from_trans_see: bool = False,
    source: str = "",
):
    """Collect translations from the direct children of ``level_node``.

    Children are handled by type:

    * ``trans-top`` template: its first parameter becomes the current sense
      and the template is passed to ``clean_node`` together with
      ``word_entry``.
    * ``trans-see`` template: handed to ``extract_trans_see_template``.
    * ``multitrans`` template: handed to ``extract_multitrans_template``.
    * list node: each list item goes to ``extract_translation_list_item``
      with the current sense.
    * bold node: its cleaned text becomes the current sense.

    Args:
        wxr: Extraction context.
        word_entry: Entry passed on to the helper extractors.
        level_node: Node whose children are scanned.
        sense: Initial sense text for the translations found.
        from_trans_see: True when this call was made while following a
            ``trans-see`` template. Nested ``trans-see`` templates are then
            ignored, and a non-empty ``sense`` is not replaced by
            ``trans-top``.
        source: Forwarded unchanged to ``extract_translation_list_item``.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            template_name = child.template_name
            if template_name == "trans-top":
                # A non-empty sense handed down while following "trans-see"
                # wins over the one declared by the section header template.
                caller_sense_locked = from_trans_see and sense != ""
                if not caller_sense_locked:
                    sense = clean_node(
                        wxr, None, child.template_parameters.get(1, "")
                    )
                    clean_node(wxr, word_entry, child)
            elif template_name == "trans-see":
                # Do not follow a "trans-see" found on an already
                # followed page.
                if not from_trans_see:
                    extract_trans_see_template(wxr, word_entry, child)
            elif template_name == "multitrans":
                extract_multitrans_template(wxr, word_entry, child)
            continue

        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, item, sense, source
                )
        elif child.kind == NodeKind.BOLD:
            sense = clean_node(wxr, None, child)

# Category:Grammar templates (original title: "Thể loại:Bản mẫu ngữ pháp")
ABBR_TAG_TEMPLATES = {
    "f",
    "fm",
    "g",
    "inv",
    "m",
    "mf",
    "mn",
    "n",
    "p",
}

def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    source: str,
):
    """Extract the translations written in one list item.

    The language name is taken from the text in front of the first ``:``
    found in a string child (together with the cleaned text of any children
    before it) and converted to a code with ``name_to_code``. Afterwards:

    * ``t``-family templates are passed to ``extract_t_template``;
    * link nodes become a ``Translation`` once a language name is known;
    * nested lists are processed recursively, item by item;
    * qualifier and abbreviation tag templates add raw tags to the most
      recently appended translation of ``word_entry``.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list is extended.
        list_item: List item node to scan.
        sense: Sense text stored on each new translation.
        source: Source value stored on each new translation.
    """
    t_template_names = {"t", "t-", "t+", "t2", "t2+", "tt+"}
    lang_name = "unknown"
    lang_code = "unknown"

    for position, child in enumerate(list_item.children):
        if isinstance(child, str):
            # Only the first string containing a colon names the language.
            if lang_name == "unknown" and ":" in child:
                leading_text = clean_node(
                    wxr, None, list_item.children[:position]
                )
                before_colon = child.partition(":")[0].strip()
                lang_name = (leading_text + before_colon) or "unknown"
                if lang_name != "unknown":
                    lang_code = name_to_code(lang_name, "vi") or "unknown"
            continue

        if isinstance(child, TemplateNode):
            template_name = child.template_name
            if template_name in t_template_names:
                extract_t_template(
                    wxr, word_entry, child, lang_name, sense, source
                )
                continue

            # Tag templates; qualifier templates are checked first.
            if template_name in QUALIFIER_TEMPLATES:
                tag_extractor = extract_qualifier_template
            elif template_name in ABBR_TAG_TEMPLATES:
                tag_extractor = extract_abbr_tag_template
            else:
                tag_extractor = None

            # Tags describe the previous translation, so one must exist.
            if tag_extractor is not None and word_entry.translations:
                last_translation = word_entry.translations[-1]
                last_translation.raw_tags.extend(tag_extractor(wxr, child))
                translate_raw_tags(last_translation)
            continue

        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LINK and lang_name != "unknown":
            word = clean_node(wxr, None, child)
            if word != "":
                word_entry.translations.append(
                    Translation(
                        word=word,
                        lang=lang_name,
                        lang_code=lang_code,
                        sense=sense,
                        source=source,
                    )
                )
        elif child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, nested_item, sense, source
                )

119

120

def extract_t_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    lang_name: str,
    sense: str,
    source: str,
) -> None:
    """Expand a ``t``-family template and append the translations it holds.

    The language code comes from the template's first parameter
    ("unknown" when empty). The template is expanded and the resulting HTML
    is scanned for:

    * ``<abbr title=...>`` elements, whose titles become raw tags;
    * ``<span>`` elements with a ``lang`` ending in ``-Latn`` or with class
      ``tr``, whose text becomes the romanization (the last one wins);
    * ``<span>`` elements whose ``lang`` equals the language code, each
      non-empty one producing a ``Translation``.

    Top-level link nodes of the expansion are finally passed to
    ``clean_node`` together with ``word_entry``.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``translations`` list is extended.
        t_node: The translation template node.
        lang_name: Language name stored on each translation.
        sense: Sense text stored on each translation.
        source: Source value stored on each translation.
    """
    lang_code = (
        clean_node(wxr, None, t_node.template_parameters.get(1, ""))
        or "unknown"
    )

    first_expansion = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    expanded_node = first_expansion
    # If the first expansion still has top-level "t"/"t+" templates, expand
    # again; the last such template provides the tree that is scanned.
    for inner_template in first_expansion.find_child(NodeKind.TEMPLATE):
        if inner_template.template_name in ("t", "t+"):
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(inner_template), expand_all=True
            )

    lit = clean_node(wxr, None, t_node.template_parameters.get("lit", ""))

    raw_tags = [
        title
        for abbr in expanded_node.find_html_recursively("abbr")
        if (title := abbr.attrs.get("title", "")) != ""
    ]

    roman = ""
    for span in expanded_node.find_html_recursively("span"):
        is_latin_script = span.attrs.get("lang", "").endswith("-Latn")
        if is_latin_script or span.attrs.get("class", "") == "tr":
            roman = clean_node(wxr, None, span)

    other = ""
    if lang_code == "ja" and "," in roman:
        # For "ja", the text before the first comma goes to `other` and the
        # rest stays as the romanization.
        other, _, roman = roman.partition(",")
        other = other.strip()
        roman = roman.strip()

    for span in expanded_node.find_html_recursively("span"):
        css_classes = span.attrs.get("class", "").split()
        if span.attrs.get("lang") != lang_code:
            continue
        word = clean_node(wxr, None, span)
        if word == "":
            continue
        translation = Translation(
            word=word,
            lang=lang_name,
            lang_code=lang_code,
            sense=sense,
            source=source,
            roman=roman,
            lit=lit,
            raw_tags=raw_tags,
            other=other,
        )
        if "Hant" in css_classes:
            translation.tags.append("Traditional-Chinese")
        elif "Hans" in css_classes:
            translation.tags.append("Simplified-Chinese")
        translate_raw_tags(translation)
        word_entry.translations.append(translation)

    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)

184

185

def extract_trans_see_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Follow a ``trans-see`` template and extract the referenced sections.

    The first template parameter is the sense. Page titles are read from the
    consecutive positional parameters starting at 2; when parameter 2 is
    absent, the sense itself is used as the only page title. Any ``#...``
    fragment is removed from a title before the page is looked up. For each
    page found, the first section whose title is in ``TRANSLATION_SECTIONS``
    is passed to ``extract_translation_section`` with
    ``from_trans_see=True`` and the page title as ``source``.

    A title whose page cannot be loaded is skipped; the remaining titles are
    still processed.

    Args:
        wxr: Extraction context; ``wxr.wtp`` is used to load and parse pages.
        word_entry: Entry that receives the extracted translations.
        t_node: The ``trans-see`` template node.
    """
    params = t_node.template_parameters
    sense = clean_node(wxr, None, params.get(1, ""))

    page_titles = []
    if 2 in params:
        # Read positional parameters 2, 3, ... until the first gap.
        for index in count(2):
            if index not in params:
                break
            page_titles.append(clean_node(wxr, None, params[index]))
    else:
        page_titles.append(sense)

    for page_title in page_titles:
        # Drop a "#section" fragment; only the page name is looked up.
        page_title = page_title.partition("#")[0]
        page = wxr.wtp.get_page(page_title)
        # Skip this title only: returning here would discard every
        # remaining page listed in the template.
        # NOTE(review): the body check assumes `Page.body` may be None
        # (e.g. for pages without text) - confirm against wikitextprocessor.
        if page is None or page.body is None:
            continue
        root = wxr.wtp.parse(page.body, pre_expand=True)
        target_node = find_subpage_section(wxr, root, TRANSLATION_SECTIONS)
        if target_node is not None:
            extract_translation_section(
                wxr,
                word_entry,
                target_node,
                sense=sense,
                from_trans_see=True,
                source=page_title,
            )

217

218

def find_subpage_section(
    wxr: WiktextractContext, root: WikiNode, target_sections: set[str]
) -> WikiNode | None:
    """Return the first level node under ``root`` with a wanted title.

    Level nodes are visited in the order produced by
    ``root.find_child_recursively(LEVEL_KIND_FLAGS)``; a node matches when
    the cleaned text of its ``largs`` is in ``target_sections``.

    Args:
        wxr: Extraction context used by ``clean_node``.
        root: Parsed tree to search.
        target_sections: Accepted section titles.

    Returns:
        The first matching level node, or ``None`` when nothing matches.
    """
    candidates = root.find_child_recursively(LEVEL_KIND_FLAGS)
    return next(
        (
            candidate
            for candidate in candidates
            if clean_node(wxr, None, candidate.largs) in target_sections
        ),
        None,
    )

227

228

def extract_multitrans_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract translations from the ``data`` parameter of ``multitrans``.

    The ``data`` argument (empty string when missing) is converted back to
    wikitext, parsed, and the parsed tree is processed like a translation
    section.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the extracted translations.
        t_node: The ``multitrans`` template node.
    """
    data_arg = t_node.template_parameters.get("data", "")
    data_wikitext = wxr.wtp.node_to_wikitext(data_arg)
    data_root = wxr.wtp.parse(data_wikitext)
    extract_translation_section(wxr, word_entry, data_root)

236

237

def extract_abbr_tag_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[str]:
    """Return the non-empty ``title`` texts of ``<abbr>`` tags in a template.

    The template is fully expanded and every ``<abbr>`` element found in the
    expansion contributes the cleaned text of its ``title`` attribute, in
    document order. Empty titles are dropped.

    Args:
        wxr: Extraction context.
        t_node: Template node to expand.

    Returns:
        List of raw tag strings; empty when no usable ``<abbr>`` is found.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    # Lazy generator: each title is cleaned as its element is reached.
    cleaned_titles = (
        clean_node(wxr, None, abbr.attrs.get("title", ""))
        for abbr in expanded.find_html_recursively("abbr")
    )
    return [title for title in cleaned_titles if title != ""]

Coverage for src/wiktextract/extractor/vi/translation.py: 65%

119 statements