Coverage for src/wiktextract/extractor/pt/translation.py: 85%

124 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import Translation, WordEntry 

8 

9 

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    title_text: str,
) -> None:
    """Walk a translation-like section and store its entries on *word_entry*.

    ``title_text`` picks the destination attribute: ``"Cognatos"`` maps to
    ``cognates``, ``"Descendentes"`` to ``descendants`` and every other title
    to ``translations``.

    Only template and list children of ``level_node`` are visited. A
    ``tradini`` template updates the sense gloss/index that is passed along
    with every list item processed after it.
    """
    if title_text == "Cognatos":
        destination = "cognates"
    elif title_text == "Descendentes":
        destination = "descendants"
    else:
        destination = "translations"

    current_sense = ""
    current_sense_index = 0
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if child.kind == NodeKind.TEMPLATE:
            # Other templates at section level are ignored.
            if child.template_name == "tradini":
                current_sense, current_sense_index = extract_tradini_template(
                    wxr, child
                )
        elif child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    item,
                    current_sense,
                    current_sense_index,
                    destination,
                )

40 

41 

def extract_tradini_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[str, int]:
    """Read the sense gloss and sense number from a ``tradini`` template.

    The first positional argument is cleaned to plain text. When it starts
    with ``"De <digits>"`` the digits become the sense index and the rest of
    the text, with surrounding parentheses and spaces stripped, becomes the
    sense gloss. Otherwise the whole text is the gloss and the index is ``0``.

    Returns:
        A ``(sense, sense_index)`` pair; ``sense_index`` is an ``int``.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:tradini
    first_arg_str = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    m = re.match(r"De (\d+)", first_arg_str)
    if m is None:
        # No "De N" prefix: the whole argument is the gloss.
        return first_arg_str, 0
    sense_index = int(m.group(1))
    sense = first_arg_str[m.end() :].strip("() ")
    return sense, sense_index

56 

57 

def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
    target_field: str,
) -> None:
    """Extract the entries of one list item and append them to *word_entry*.

    Children of ``list_item`` are handled by type:

    * link: text containing ``/traduções`` or ``/tradução`` triggers
      extraction of that subpage; otherwise the first such link text is
      remembered as the language name;
    * ``trad``, ``trad-``, ``t`` and ``xlatio`` templates: converted to
      entries by their dedicated extractors;
    * string containing a parenthesised part: used as the romanization of
      every entry collected so far in this item;
    * italic node: its text is added as a raw tag on the latest entry;
    * nested list: each of its items is processed recursively.

    The collected entries are appended to the ``target_field`` attribute of
    ``word_entry`` after all children have been visited.
    """
    # Templates whose extractor takes exactly (wxr, node, sense, sense_index).
    simple_extractors = {
        "trad": extract_trad_template,
        "trad-": extract_trad_minus_template,
        "t": extract_t_template,
    }
    collected = []
    language = "unknown"
    for child in list_item.children:
        if isinstance(child, str):
            if re.search(r"\(.+\)", child) is not None:
                romanization = child.strip("() \n")
                for entry in collected:
                    entry.roman = romanization
            continue
        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LINK:
            link_text = clean_node(wxr, None, child)
            if "/traduções" in link_text or "/tradução" in link_text:
                extract_translation_subpage(wxr, word_entry, link_text)
            elif language == "unknown":
                language = link_text
        elif isinstance(child, TemplateNode):
            name = child.template_name
            if name in simple_extractors:
                collected.extend(
                    simple_extractors[name](wxr, child, sense, sense_index)
                )
            elif name == "xlatio":
                # xlatio gets its language name from the previous entry in
                # this item, or from the link text seen so far.
                inherited_lang = collected[-1].lang if collected else language
                collected.extend(
                    extract_xlatio_template(
                        wxr, child, sense, sense_index, inherited_lang
                    )
                )
        elif child.kind == NodeKind.ITALIC:
            # Italic text before any entry has nothing to attach to.
            if collected:
                tag_text = clean_node(wxr, None, child)
                if tag_text != "":
                    collected[-1].raw_tags.append(tag_text)
        elif child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr,
                    word_entry,
                    nested_item,
                    sense,
                    sense_index,
                    target_field,
                )

    getattr(word_entry, target_field).extend(collected)

127 

128 

def extract_trad_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
) -> list[Translation]:
    """Build one ``Translation`` per word argument of a ``trad`` template.

    Argument 1 is the language code and the named argument ``t`` is the
    romanization shared by every entry. The language name is the text of the
    first link in the expanded template, or ``"unknown"`` if there is none.
    Positional arguments 2 through 11 are read in order until the first
    missing one; arguments that clean to an empty string are skipped.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:trad
    params = t_node.template_parameters
    romanization = clean_node(wxr, None, params.get("t", ""))
    code = clean_node(wxr, None, params.get(1, ""))

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    first_link = next(iter(expanded.find_child(NodeKind.LINK)), None)
    language = (
        clean_node(wxr, None, first_link)
        if first_link is not None
        else "unknown"
    )

    results = []
    for position in range(2, 12):
        if position not in params:
            break
        word = clean_node(wxr, None, params[position])
        if word == "":
            continue
        results.append(
            Translation(
                word=word,
                lang=language,
                lang_code=code,
                roman=romanization,
                sense=sense,
                sense_index=sense_index,
            )
        )
    return results

159 

160 

def extract_trad_minus_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
) -> list[Translation]:
    """Convert a ``trad-`` template into at most one ``Translation``.

    Argument 1 is the language code, argument 2 the translated word and
    argument 3 the romanization (surrounding parentheses and spaces are
    stripped). The language name is the text of the first link in the
    expanded template, or ``"unknown"`` if there is none. An empty list is
    returned when the word is empty.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:trad-
    params = t_node.template_parameters
    code = clean_node(wxr, None, params.get(1, ""))

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    first_link = next(iter(expanded.find_child(NodeKind.LINK)), None)
    language = (
        clean_node(wxr, None, first_link)
        if first_link is not None
        else "unknown"
    )

    word = clean_node(wxr, None, params.get(2, ""))
    romanization = clean_node(wxr, None, params.get(3, "")).strip("() ")
    entry = Translation(
        word=word,
        lang=language,
        lang_code=code,
        roman=romanization,
        sense=sense,
        sense_index=sense_index,
    )
    return [entry] if entry.word != "" else []

187 

188 

# Maps the one-letter gender codes used in translation templates to tag names.
TRANSLATION_GENDER_TAGS: dict[str, str] = dict(
    c="common",
    f="feminine",
    m="masculine",
    n="neuter",
)

195 

196 

def extract_t_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
) -> list[Translation]:
    """Convert a ``t`` template into at most one ``Translation``.

    Argument 1 is the language code, argument 2 the translated word,
    argument 3 a gender code looked up in ``TRANSLATION_GENDER_TAGS`` and
    argument 4 the romanization (surrounding parentheses and spaces are
    stripped). The language name is the text of the first link in the
    expanded template, or ``"unknown"`` if there is none. An empty list is
    returned when the word is empty.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:t
    params = t_node.template_parameters
    code = clean_node(wxr, None, params.get(1, ""))

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    first_link = next(iter(expanded.find_child(NodeKind.LINK)), None)
    language = (
        clean_node(wxr, None, first_link)
        if first_link is not None
        else "unknown"
    )

    word = clean_node(wxr, None, params.get(2, ""))
    romanization = clean_node(wxr, None, params.get(4, "")).strip("() ")
    entry = Translation(
        word=word,
        lang=language,
        lang_code=code,
        roman=romanization,
        sense=sense,
        sense_index=sense_index,
    )

    gender_code = clean_node(wxr, None, params.get(3, ""))
    gender_tag = TRANSLATION_GENDER_TAGS.get(gender_code)
    if gender_tag is not None:
        entry.tags.append(gender_tag)

    return [entry] if entry.word != "" else []

226 

227 

def extract_xlatio_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    sense_index: int,
    lang_name: str,
) -> list[Translation]:
    """Convert an ``xlatio`` template into at most one ``Translation``.

    Argument 1 is the language code and argument 2 the translated word; the
    language name is supplied by the caller. Argument 3 is treated as a
    gender tag when, with dots stripped, it is a key of
    ``TRANSLATION_GENDER_TAGS``; otherwise it is stored as the romanization
    with surrounding parentheses and spaces stripped. An empty list is
    returned when the word is empty.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:xlatio
    params = t_node.template_parameters
    code = clean_node(wxr, None, params.get(1, ""))
    entry = Translation(
        word=clean_node(wxr, None, params.get(2, "")),
        lang=lang_name,
        lang_code=code,
        sense=sense,
        sense_index=sense_index,
    )

    extra = clean_node(wxr, None, params.get(3, ""))
    gender_code = extra.strip(".")
    if gender_code in TRANSLATION_GENDER_TAGS:
        entry.tags.append(TRANSLATION_GENDER_TAGS[gender_code])
    else:
        entry.roman = extra.strip("() ")

    return [entry] if entry.word != "" else []

253 

254 

def extract_translation_subpage(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Extract translations from the separate page named *page_title*.

    The page is fetched with ``wxr.wtp.get_page(page_title, 0)``; the ``0``
    is presumably the main-namespace id (confirm against wikitextprocessor).
    Nothing happens when the page or its body is missing. Otherwise the body
    is parsed and processed as a section titled ``"Tradução"``, so its
    entries go to the ``translations`` field.
    """
    subpage = wxr.wtp.get_page(page_title, 0)
    if subpage is None or subpage.body is None:
        return
    parsed_root = wxr.wtp.parse(subpage.body)
    extract_translation_section(wxr, word_entry, parsed_root, "Tradução")

261 extract_translation_section(wxr, word_entry, root, "Tradução")