Coverage for src/wiktextract/extractor/pt/translation.py: 85%

1import re

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from .models import Translation, WordEntry

def extract_translation_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    title_text: str,
) -> None:
    """Collect translation-like entries from the children of ``level_node``.

    The section title selects which list attribute of ``word_entry``
    receives the results: "Cognatos" -> ``cognates``, "Descendentes" ->
    ``descendants``, any other title -> ``translations``.

    Only direct template and list children are visited.  A "tradini"
    template updates the current gloss and sense number; every list item of
    a following list is handed to ``extract_translation_list_item`` together
    with those values.
    """
    if title_text == "Cognatos":
        target_field = "cognates"
    elif title_text == "Descendentes":
        target_field = "descendants"
    else:
        target_field = "translations"

    # Values taken from the most recent "tradini" template; they stay in
    # effect for all lists until another "tradini" replaces them.
    sense = ""
    sense_index = 0
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if child.kind == NodeKind.TEMPLATE:
            if child.template_name == "tradini":
                sense, sense_index = extract_tradini_template(wxr, child)
        elif child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_translation_list_item(
                    wxr, word_entry, item, sense, sense_index, target_field
                )

def extract_tradini_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[str, int]:
    """Read the gloss text and sense number from a "tradini" template.

    The first positional argument is cleaned to plain text.  When that text
    starts with "De <digits>", the digits become the sense index and the
    remainder, stripped of surrounding parentheses and spaces, becomes the
    gloss.  Otherwise the whole text is the gloss and the index is 0.

    Returns:
        A ``(sense, sense_index)`` pair; ``sense_index`` is an ``int``.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:tradini
    first_arg_str = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    m = re.match(r"De (\d+)", first_arg_str)
    if m is None:
        # No leading sense number: keep the full text as the gloss.
        return first_arg_str, 0
    return first_arg_str[m.end() :].strip("() "), int(m.group(1))

def extract_translation_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
    target_field: str,
) -> None:
    """Parse one list item and append its entries to ``word_entry``.

    Children are processed in order:

    * plain text containing a parenthesised part is stripped of parentheses,
      spaces and newlines and stored as ``roman`` on every entry collected
      so far from this item;
    * "trad", "trad-", "t" and "xlatio" templates are turned into
      ``Translation`` objects by their dedicated extractors;
    * a link whose text contains "/traduções" or "/tradução" is treated as
      a subpage title and processed via ``extract_translation_subpage``;
      otherwise the first link text becomes the language name;
    * italic text is appended to ``raw_tags`` of the latest entry;
    * nested lists are processed recursively with the same sense data.

    The collected entries are appended to the ``target_field`` list
    attribute of ``word_entry``.
    """
    found = []
    lang_name = "unknown"
    for child in list_item.children:
        if isinstance(child, str):
            if re.search(r"\(.+\)", child) is not None:
                roman = child.strip("() \n")
                for tr_data in found:
                    tr_data.roman = roman
        elif isinstance(child, TemplateNode):
            name = child.template_name
            if name == "trad":
                found.extend(
                    extract_trad_template(wxr, child, sense, sense_index)
                )
            elif name == "trad-":
                found.extend(
                    extract_trad_minus_template(wxr, child, sense, sense_index)
                )
            elif name == "t":
                found.extend(extract_t_template(wxr, child, sense, sense_index))
            elif name == "xlatio":
                # "xlatio" receives its language name from the caller: reuse
                # the previous entry's language when there is one, else the
                # language link text seen earlier in this item.
                xlatio_lang = found[-1].lang if found else lang_name
                found.extend(
                    extract_xlatio_template(
                        wxr, child, sense, sense_index, xlatio_lang
                    )
                )
        elif isinstance(child, WikiNode):
            if child.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, child)
                if "/traduções" in link_str or "/tradução" in link_str:
                    extract_translation_subpage(wxr, word_entry, link_str)
                elif lang_name == "unknown":
                    lang_name = link_str
            elif child.kind == NodeKind.ITALIC:
                if found:
                    raw_tag = clean_node(wxr, None, child)
                    if raw_tag != "":
                        found[-1].raw_tags.append(raw_tag)
            elif child.kind == NodeKind.LIST:
                for nested_item in child.find_child(NodeKind.LIST_ITEM):
                    extract_translation_list_item(
                        wxr,
                        word_entry,
                        nested_item,
                        sense,
                        sense_index,
                        target_field,
                    )

    getattr(word_entry, target_field).extend(found)

127

128

def extract_trad_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
) -> list[Translation]:
    """Build one ``Translation`` per word argument of a "trad" template.

    Argument 1 is the language code and the named argument "t" is the
    romanization shared by every word.  Words are read from positional
    arguments 2 through 11, stopping at the first missing position; words
    that clean to an empty string are skipped.  The language name is the
    text of the first link in the expanded template, or "unknown" when the
    expansion contains no link.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:trad
    params = t_node.template_parameters
    roman = clean_node(wxr, None, params.get("t", ""))
    lang_code = clean_node(wxr, None, params.get(1, ""))

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    first_link = next(iter(expanded.find_child(NodeKind.LINK)), None)
    lang_name = "unknown"
    if first_link is not None:
        lang_name = clean_node(wxr, None, first_link)

    results = []
    for position in range(2, 12):
        if position not in params:
            break
        word = clean_node(wxr, None, params[position])
        if word == "":
            continue
        results.append(
            Translation(
                word=word,
                lang=lang_name,
                lang_code=lang_code,
                roman=roman,
                sense=sense,
                sense_index=sense_index,
            )
        )
    return results

159

160

def extract_trad_minus_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
) -> list[Translation]:
    """Build at most one ``Translation`` from a "trad-" template.

    Argument 1 is the language code, argument 2 the word and argument 3 the
    romanization (surrounding parentheses and spaces removed).  The language
    name is the text of the first link in the expanded template, or
    "unknown" when there is none.  An empty list is returned when the word
    is empty.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:trad-
    params = t_node.template_parameters
    lang_code = clean_node(wxr, None, params.get(1, ""))

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    first_link = next(iter(expanded.find_child(NodeKind.LINK)), None)
    lang_name = "unknown"
    if first_link is not None:
        lang_name = clean_node(wxr, None, first_link)

    word = clean_node(wxr, None, params.get(2, ""))
    roman = clean_node(wxr, None, params.get(3, "")).strip("() ")
    tr_data = Translation(
        word=word,
        lang=lang_name,
        lang_code=lang_code,
        roman=roman,
        sense=sense,
        sense_index=sense_index,
    )
    return [tr_data] if tr_data.word != "" else []

187

188

# Gender abbreviations accepted in translation templates, mapped to the tag
# that is stored on the resulting Translation.
TRANSLATION_GENDER_TAGS: dict[str, str] = dict(
    c="common",
    f="feminine",
    m="masculine",
    n="neuter",
)

195

196

def extract_t_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str, sense_index: int
) -> list[Translation]:
    """Build at most one ``Translation`` from a "t" template.

    Argument 1 is the language code, argument 2 the word, argument 3 a
    gender abbreviation and argument 4 the romanization (surrounding
    parentheses and spaces removed).  A gender listed in
    ``TRANSLATION_GENDER_TAGS`` is added to the entry's tags.  The language
    name is the text of the first link in the expanded template, or
    "unknown" when there is none.  An empty list is returned when the word
    is empty.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:t
    params = t_node.template_parameters
    lang_code = clean_node(wxr, None, params.get(1, ""))

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    first_link = next(iter(expanded.find_child(NodeKind.LINK)), None)
    lang_name = "unknown"
    if first_link is not None:
        lang_name = clean_node(wxr, None, first_link)

    word = clean_node(wxr, None, params.get(2, ""))
    roman = clean_node(wxr, None, params.get(4, "")).strip("() ")
    tr_data = Translation(
        word=word,
        lang=lang_name,
        lang_code=lang_code,
        roman=roman,
        sense=sense,
        sense_index=sense_index,
    )

    gender_arg = clean_node(wxr, None, params.get(3, ""))
    if gender_arg in TRANSLATION_GENDER_TAGS:
        tr_data.tags.append(TRANSLATION_GENDER_TAGS[gender_arg])

    return [tr_data] if tr_data.word != "" else []

226

227

def extract_xlatio_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    sense_index: int,
    lang_name: str,
) -> list[Translation]:
    """Build at most one ``Translation`` from an "xlatio" template.

    Argument 1 is the language code and argument 2 the word; the language
    name is supplied by the caller.  Argument 3 is read as a gender
    abbreviation when, after removing dots from both ends, it is a key of
    ``TRANSLATION_GENDER_TAGS``; otherwise it is stored as the romanization
    with surrounding parentheses and spaces removed.  An empty list is
    returned when the word is empty.
    """
    # https://pt.wiktionary.org/wiki/Predefinição:xlatio
    params = t_node.template_parameters
    lang_code = clean_node(wxr, None, params.get(1, ""))
    word = clean_node(wxr, None, params.get(2, ""))
    tr_data = Translation(
        word=word,
        lang=lang_name,
        lang_code=lang_code,
        sense=sense,
        sense_index=sense_index,
    )

    third_arg = clean_node(wxr, None, params.get(3, ""))
    gender_key = third_arg.strip(".")
    if gender_key in TRANSLATION_GENDER_TAGS:
        tr_data.tags.append(TRANSLATION_GENDER_TAGS[gender_key])
    else:
        tr_data.roman = third_arg.strip("() ")

    return [tr_data] if tr_data.word != "" else []

253

254

def extract_translation_subpage(
    wxr: WiktextractContext, word_entry: WordEntry, page_title: str
) -> None:
    """Parse a translation subpage and add its entries to ``word_entry``.

    The page is fetched with ``wxr.wtp.get_page(page_title, 0)``
    (NOTE(review): the ``0`` is presumably the main namespace id - confirm).
    Nothing happens when the page or its body is missing; otherwise the body
    is parsed and processed as a section titled "Tradução".
    """
    page = wxr.wtp.get_page(page_title, 0)
    if page is None or page.body is None:
        return
    root = wxr.wtp.parse(page.body)
    extract_translation_section(wxr, word_entry, root, "Tradução")