Coverage for src/wiktextract/extractor/pt/linkage.py: 83%

141 statements  

coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

  1  import re
  2
  3  from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
  4
  5  from ...page import clean_node
  6  from ...wxr_context import WiktextractContext
  7  from .models import Linkage, WordEntry
  8  from .section_titles import LINKAGE_SECTIONS
  9  from .tags import translate_raw_tags
 10
 11

 12  def extract_expression_section(
 13      wxr: WiktextractContext,
 14      word_entry: WordEntry,
 15      level_node: LevelNode,
 16  ) -> None:
 17      for list_node in level_node.find_child(NodeKind.LIST):
 18          for list_item in list_node.find_child(NodeKind.LIST_ITEM):
 19              extract_expression_list_item(wxr, word_entry, list_item)
 20
 21

 22  def extract_expression_list_item(
 23      wxr: WiktextractContext,
 24      word_entry: WordEntry,
 25      list_item: WikiNode,
 26  ) -> None:
 27      from .pos import extract_gloss_list_item
 28
 29      expression_data = Linkage(word="")
 30      sense_nodes = []
 31      for node in list_item.children:
 32          if isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
 33              expression_data.word = clean_node(wxr, None, node)
 34          elif isinstance(node, str) and ":" in node:
 35              node = node.lstrip(": ")
 36              if node != "":
 37                  sense_nodes.append(node)
 38          elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
 39              sense_nodes.append(node)
 40
 41      sense_str = clean_node(
 42          wxr,
 43          None,
 44          [
 45              n
 46              for n in sense_nodes
 47              if not (
 48                  isinstance(n, TemplateNode) and n.template_name == "escopo2"
 49              )
 50          ],
 51      )
 52      if sense_str != "":
 53          gloss_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
 54          gloss_list_item.children = sense_nodes
 55          for child_list in list_item.find_child(NodeKind.LIST):
 56              gloss_list_item.children.append(child_list)
 57          extract_gloss_list_item(wxr, expression_data, gloss_list_item)
 58      else:
 59          for child_list in list_item.find_child(NodeKind.LIST):
 60              for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
 61                  extract_gloss_list_item(wxr, expression_data, child_list_item)
 62
 63      if expression_data.word != "":    [63 ↛ exit: line 63 didn't return from function 'extract_expression_list_item' because the condition on line 63 was always true]
 64          word_entry.expressions.append(expression_data)
 65
 66

 67  def extract_linkage_section(
 68      wxr: WiktextractContext,
 69      word_entry: WordEntry,
 70      level_node: LevelNode,
 71      linkage_type: str,
 72      sense: str,
 73      sense_index: int,
 74      source: str,
 75      tags: list[str],
 76  ) -> None:
 77      for node in level_node.children:
 78          if isinstance(node, TemplateNode) and node.template_name == "fraseini":
 79              sense, sense_index = extract_fraseini_template(wxr, node)
 80          elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
 81              for list_item in node.find_child(NodeKind.LIST_ITEM):
 82                  extract_linkage_list_item(
 83                      wxr,
 84                      word_entry,
 85                      list_item,
 86                      linkage_type,
 87                      sense,
 88                      sense_index,
 89                      source,
 90                      tags,
 91                  )
 92
 93

 94  def extract_fraseini_template(
 95      wxr: WiktextractContext, t_node: TemplateNode
 96  ) -> tuple[str, int]:
 97      sense = ""
 98      sense_index = 0
 99      first_arg = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
100      m = re.search(r"\((\d+)\)$", first_arg)
101      if m is not None:    [101 ↛ 102: line 101 didn't jump to line 102 because the condition on line 101 was never true]
102          sense_index = int(m.group(1))
103          sense = first_arg[: m.start()].strip()
104      elif (m := re.match(r"De (\d+)", first_arg)) is not None:
105          sense_index = int(m.group(1))
106          sense = first_arg[m.end() :].strip("() \n")
107      else:
108          sense = first_arg
109      return sense, sense_index
110
111
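Note on the two {{fraseini}} branches above: the first regex expects the sense index as a trailing "(N)", the second as a leading "De N" prefix, and anything else is kept verbatim as the sense; per the partial-branch annotation, the "(N)" form on line 101 was never exercised in this run. Below is a minimal sketch using a hypothetical helper split_fraseini_arg and hypothetical template arguments (neither is taken from this report) showing what each branch would return:

import re

def split_fraseini_arg(first_arg: str) -> tuple[str, int]:
    # Mirrors the branching on lines 100-108 above.
    m = re.search(r"\((\d+)\)$", first_arg)
    if m is not None:
        return first_arg[: m.start()].strip(), int(m.group(1))
    m = re.match(r"De (\d+)", first_arg)
    if m is not None:
        return first_arg[m.end():].strip("() \n"), int(m.group(1))
    return first_arg, 0

print(split_fraseini_arg("cumprimento (2)"))       # ('cumprimento', 2)
print(split_fraseini_arg("De 1 (saudação)"))       # ('saudação', 1)
print(split_fraseini_arg("expressão idiomática"))  # ('expressão idiomática', 0)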

112  def extract_linkage_list_item(
113      wxr: WiktextractContext,
114      word_entry: WordEntry,
115      list_item: WikiNode,
116      linkage_type: str,
117      sense: str,
118      sense_index: int,
119      source: str,
120      tags: list[str],
121  ) -> None:
122      linkage_words = []
123      raw_tags = []
124      for node in list_item.children:
125          if isinstance(node, TemplateNode):
126              match node.template_name:
127                  case "link preto":
128                      word = clean_node(
129                          wxr, None, node.template_parameters.get(1, "")
130                      )
131                      if word != "":    [131 ↛ 124: line 131 didn't jump to line 124 because the condition on line 131 was always true]
132                          linkage_words.append(word)
133                  case "escopo2":    [133 ↛ 124: line 133 didn't jump to line 124 because the pattern on line 133 always matched]
134                      from .pos import extract_escopo2_template
135
136                      raw_tags.extend(extract_escopo2_template(wxr, node))
137          elif isinstance(node, WikiNode):
138              match node.kind:
139                  case NodeKind.LINK:
140                      word = clean_node(wxr, None, node)
141                      if word.startswith("Wikisaurus:"):
142                          extract_wikisaurus_page(
143                              wxr,
144                              word_entry,
145                              word,
146                              linkage_type,
147                              sense,
148                              sense_index,
149                              tags,
150                          )
151                      elif word != "":    [151 ↛ 124: line 151 didn't jump to line 124 because the condition on line 151 was always true]
152                          linkage_words.append(word)
153                  case NodeKind.BOLD:
154                      bold_str = clean_node(wxr, None, node)
155                      if re.fullmatch(r"\d+", bold_str):    [155 ↛ 124: line 155 didn't jump to line 124 because the condition on line 155 was always true]
156                          sense_index = int(bold_str)
157                  case NodeKind.ITALIC:
158                      raw_tag = clean_node(wxr, None, node)
159                      if raw_tag.startswith("Wikisaurus:"):    [159 ↛ 160: line 159 didn't jump to line 160 because the condition on line 159 was never true]
160                          extract_wikisaurus_page(
161                              wxr,
162                              word_entry,
163                              raw_tag,
164                              linkage_type,
165                              sense,
166                              sense_index,
167                              tags,
168                          )
169                      elif raw_tag != "":    [169 ↛ 124: line 169 didn't jump to line 124 because the condition on line 169 was always true]
170                          raw_tags.append(raw_tag)
171                  case NodeKind.LIST:    [171 ↛ 124: line 171 didn't jump to line 124 because the pattern on line 171 always matched]
172                      for child_list_item in node.find_child(NodeKind.LIST_ITEM):
173                          extract_linkage_list_item(
174                              wxr,
175                              word_entry,
176                              child_list_item,
177                              linkage_type,
178                              sense,
179                              sense_index,
180                              source,
181                              tags,
182                          )
183          elif isinstance(node, str):    [183 ↛ 124: line 183 didn't jump to line 124 because the condition on line 183 was always true]
184              m = re.search(r"\((.+)\)", node)
185              if m is not None:
186                  sense = m.group(1)
187
188      for word in linkage_words:
189          linkage = Linkage(
190              word=word,
191              sense=sense,
192              sense_index=sense_index,
193              raw_tags=raw_tags,
194              source=source,
195              tags=tags,
196          )
197          translate_raw_tags(linkage)
198          getattr(word_entry, linkage_type).append(linkage)
199
200
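The bold and string branches of extract_linkage_list_item (lines 153-156 and 183-186) let a list item override the running sense index and sense label before the collected words are turned into Linkage objects. A small sketch with hypothetical list-item fragments (not taken from this report):

import re

# A bold node such as '''2''' sets the sense index.
bold_str = "2"
sense_index = 0
if re.fullmatch(r"\d+", bold_str):
    sense_index = int(bold_str)   # -> 2

# A plain-text tail such as " (sentido figurado)" sets the sense label.
tail = " (sentido figurado)"
m = re.search(r"\((.+)\)", tail)
if m is not None:
    sense = m.group(1)            # -> 'sentido figurado'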

201  def extract_wikisaurus_page(
202      wxr: WiktextractContext,
203      word_entry: WordEntry,
204      page_title: str,
205      linkage_type: str,
206      sense: str,
207      sense_index: int,
208      tags: list[str],
209  ) -> None:
210      page = wxr.wtp.get_page(page_title, 0)
211      if page is None or page.body is None:    [211 ↛ 213: line 211 didn't jump to line 213 because the condition on line 211 was always true]
212          return
213      root = wxr.wtp.parse(page.body)
214      for level1_node in root.find_child(NodeKind.LEVEL1):
215          lang_name = clean_node(wxr, None, level1_node.largs)
216          if lang_name != word_entry.lang:
217              continue
218          for level2_node in level1_node.find_child(NodeKind.LEVEL2):
219              pos_title = clean_node(wxr, None, level2_node.largs)
220              if pos_title != word_entry.pos_title:
221                  continue
222              for level3_node in level2_node.find_child(NodeKind.LEVEL3):
223                  linkage_title = clean_node(wxr, None, level3_node.largs)
224                  if LINKAGE_SECTIONS.get(linkage_title) != linkage_type:
225                      continue
226                  extract_linkage_section(
227                      wxr,
228                      word_entry,
229                      level3_node,
230                      linkage_type,
231                      sense,
232                      sense_index,
233                      page_title,
234                      tags,
235                  )
236
237
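extract_wikisaurus_page only descends when the level-1 heading matches word_entry.lang, the level-2 heading matches word_entry.pos_title, and the level-3 heading maps to the requested linkage type in LINKAGE_SECTIONS; the matching subtree is then fed back to extract_linkage_section with the Wikisaurus page title recorded as the source. A sketch of the nesting those loops expect, with hypothetical heading titles (placeholders, not taken from this report or from the real section_titles data):

# Hypothetical Wikisaurus page body; for anything to match, the titles must
# actually equal the entry's language, its POS title, and a LINKAGE_SECTIONS key.
wikisaurus_body = """
=Português=
==Substantivo==
===Sinônimos===
* [[exemplo]]
"""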

238  def extract_phraseology_section(
239      wxr: WiktextractContext,
240      word_entry: WordEntry,
241      level_node: LevelNode,
242  ) -> None:
243      sense = ""
244      sense_index = 0
245      for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
246          if isinstance(node, TemplateNode) and node.template_name == "fraseini":
247              sense, sense_index = extract_fraseini_template(wxr, node)
248          elif node.kind == NodeKind.LIST:    [248 ↛ 245: line 248 didn't jump to line 245 because the condition on line 248 was always true]
249              for list_item in node.find_child(NodeKind.LIST_ITEM):
250                  extract_phraseology_list_item(
251                      wxr, word_entry, list_item, sense, sense_index
252                  )
253
254

255  def extract_phraseology_list_item(
256      wxr: WiktextractContext,
257      word_entry: WordEntry,
258      list_item: WikiNode,
259      sense: str,
260      sense_index: int,
261  ) -> None:
262      l_data = Linkage(word="", sense=sense, sense_index=sense_index)
263      for index, node in enumerate(list_item.children):
264          if (
265              isinstance(node, WikiNode)
266              and node.kind in NodeKind.BOLD | NodeKind.LINK
267              and l_data.word == ""
268          ):
269              l_data.word = clean_node(wxr, None, node)
270          elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
271              l_data.roman = clean_node(wxr, None, node)
272          elif isinstance(node, str) and ("=" in node or ":" in node):
273              sense_start = node.index("=" if "=" in node else ":") + 1
274              l_data.sense = clean_node(
275                  wxr,
276                  None,
277                  [node[sense_start:]]
278                  + [
279                      n
280                      for n in list_item.children[index + 1 :]
281                      if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST)
282                  ],
283              )
284              break
285
286      if l_data.word != "":    [286 ↛ 289: line 286 didn't jump to line 289 because the condition on line 286 was always true]
287          word_entry.phraseology.append(l_data)
288
289      for child_list in list_item.find_child(NodeKind.LIST):
290          for next_list_item in child_list.find_child(NodeKind.LIST_ITEM):
291              extract_phraseology_list_item(
292                  wxr, word_entry, next_list_item, sense, sense_index
293              )