Coverage for src/wiktextract/extractor/pt/linkage.py: 78%

173 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from .models import Form, Linkage, WordEntry 

14from .section_titles import FORM_SECTION_TAGS, LINKAGE_SECTIONS 

15from .tags import translate_raw_tags 

16 

17 

def extract_expression_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Process every list item of the lists found under *level_node*.

    Each item is handed to ``extract_expression_list_item``, which is
    responsible for storing the result on *word_entry*.
    """
    # Lazily flatten "lists -> list items" so items are visited in the same
    # order as two nested loops would visit them.
    list_items = (
        item
        for list_node in level_node.find_child(NodeKind.LIST)
        for item in list_node.find_child(NodeKind.LIST_ITEM)
    )
    for item in list_items:
        extract_expression_list_item(wxr, word_entry, item)

26 

27 

28def extract_expression_list_item( 

29 wxr: WiktextractContext, 

30 word_entry: WordEntry, 

31 list_item: WikiNode, 

32) -> None: 

33 from .pos import extract_gloss_list_item 

34 

35 expression_data = Linkage(word="") 

36 sense_nodes = [] 

37 for node in list_item.children: 

38 if isinstance(node, WikiNode) and node.kind == NodeKind.BOLD: 

39 expression_data.word = clean_node(wxr, None, node) 

40 elif isinstance(node, str) and ":" in node: 

41 node = node.lstrip(": ") 

42 if node != "": 

43 sense_nodes.append(node) 

44 elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK: 44 ↛ 45line 44 didn't jump to line 45 because the condition on line 44 was never true

45 link_str = clean_node(wxr, None, node) 

46 if link_str.startswith("Wikisaurus:"): 

47 extract_wikisaurus_page( 

48 wxr, word_entry, link_str, "expressions", "", 0, [] 

49 ) 

50 elif expression_data.word == "": 

51 expression_data.word = link_str 

52 else: 

53 sense_nodes.append(node) 

54 elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST): 

55 sense_nodes.append(node) 

56 

57 sense_str = clean_node( 

58 wxr, 

59 None, 

60 [ 

61 n 

62 for n in sense_nodes 

63 if not ( 

64 isinstance(n, TemplateNode) and n.template_name == "escopo2" 

65 ) 

66 ], 

67 ) 

68 if sense_str != "": 

69 gloss_list_item = WikiNode(NodeKind.LIST_ITEM, 0) 

70 gloss_list_item.children = sense_nodes 

71 for child_list in list_item.find_child(NodeKind.LIST): 

72 gloss_list_item.children.append(child_list) 

73 extract_gloss_list_item(wxr, expression_data, gloss_list_item) 

74 else: 

75 for child_list in list_item.find_child(NodeKind.LIST): 

76 for child_list_item in child_list.find_child(NodeKind.LIST_ITEM): 

77 extract_gloss_list_item(wxr, expression_data, child_list_item) 

78 

79 if expression_data.word != "": 79 ↛ exitline 79 didn't return from function 'extract_expression_list_item' because the condition on line 79 was always true

80 word_entry.expressions.append(expression_data) 

81 

82 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    sense: str,
    sense_index: int,
    source: str,
    tags: list[str],
) -> None:
    """Walk the children of *level_node* and extract linkage data from them.

    * A ``fraseini`` template replaces the current ``sense`` and
      ``sense_index`` for the children that follow it.
    * Every list item of a list child goes to ``extract_linkage_list_item``.
    * A child whose kind is in ``LEVEL_KIND_FLAGS`` is processed recursively
      with the current sense values.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode) and child.template_name == "fraseini":
            sense, sense_index = extract_fraseini_template(wxr, child)
            continue
        if not isinstance(child, WikiNode):
            continue

        # Arguments shared by both calls below; built per child because the
        # sense values may have been updated by a preceding template.
        trailing_args = (linkage_type, sense, sense_index, source, tags)
        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(wxr, word_entry, item, *trailing_args)
        elif child.kind in LEVEL_KIND_FLAGS:
            extract_linkage_section(wxr, word_entry, child, *trailing_args)

119 

120 

def extract_fraseini_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[str, int]:
    """Split the first argument of *t_node* into ``(sense, sense_index)``.

    Two numbered layouts are recognised in the cleaned argument text:

    * text ending in ``(<digits>)`` - the digits are the index and the text
      before the parenthesis is the sense;
    * text starting with ``De <digits>`` - the digits are the index and the
      remainder, stripped of parentheses, spaces and newlines, is the sense.

    Any other text is returned unchanged with index ``0``.
    """
    text = clean_node(wxr, None, t_node.template_parameters.get(1, ""))

    trailing = re.search(r"\((\d+)\)$", text)
    if trailing is not None:
        return text[: trailing.start()].strip(), int(trailing.group(1))

    leading = re.match(r"De (\d+)", text)
    if leading is not None:
        return text[leading.end() :].strip("() \n"), int(leading.group(1))

    return text, 0

137 

138 

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
    sense_index: int,
    source: str,
    tags: list[str],
) -> None:
    """Turn one list item into ``Linkage`` entries on *word_entry*.

    Words come from link nodes and from the first argument of templates
    whose name starts with ``"link "``.  Raw tags come from ``escopo2``
    templates and from italic text.  A bold, all-digit text replaces
    ``sense_index`` and parenthesised text in a plain string replaces
    ``sense`` for the rest of the item.  Link or italic text starting with
    ``Wikisaurus:`` is expanded via ``extract_wikisaurus_page``; nested
    lists are processed recursively.  One ``Linkage`` per collected word is
    appended to the ``linkage_type`` attribute of *word_entry*.
    """
    found_words = []
    item_raw_tags = []
    for child in list_item.children:
        if isinstance(child, str):
            paren = re.search(r"\((.+)\)", child)
            if paren is not None:
                sense = paren.group(1)
            continue

        if isinstance(child, TemplateNode):
            template_name = child.template_name
            if template_name.startswith("link "):
                linked = clean_node(
                    wxr, None, child.template_parameters.get(1, "")
                )
                if linked != "":
                    found_words.append(linked)
            elif template_name == "escopo2":
                from .pos import extract_escopo2_template

                item_raw_tags.extend(extract_escopo2_template(wxr, child))
            # Other templates are ignored.
            continue

        if not isinstance(child, WikiNode):
            continue

        kind = child.kind
        if kind in (NodeKind.LINK, NodeKind.ITALIC):
            text = clean_node(wxr, None, child)
            if text.startswith("Wikisaurus:"):
                extract_wikisaurus_page(
                    wxr,
                    word_entry,
                    text,
                    linkage_type,
                    sense,
                    sense_index,
                    tags,
                )
            elif text != "":
                # Links are linkage words, italic text is a raw tag.
                target = found_words if kind == NodeKind.LINK else item_raw_tags
                target.append(text)
        elif kind == NodeKind.BOLD:
            bold_text = clean_node(wxr, None, child)
            if re.fullmatch(r"\d+", bold_text):
                sense_index = int(bold_text)
        elif kind == NodeKind.LIST:
            # Nested items inherit the sense values seen so far.
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr,
                    word_entry,
                    nested_item,
                    linkage_type,
                    sense,
                    sense_index,
                    source,
                    tags,
                )

    destination = getattr(word_entry, linkage_type)
    for found in found_words:
        entry = Linkage(
            word=found,
            sense=sense,
            sense_index=sense_index,
            raw_tags=item_raw_tags,
            source=source,
            tags=tags,
        )
        translate_raw_tags(entry)
        destination.append(entry)

225 

226 

def extract_wikisaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    linkage_type: str,
    sense: str,
    sense_index: int,
    tags: list[str],
) -> None:
    """Pull linkage data for *word_entry* out of the page *page_title*.

    The page is parsed and searched for a level-1 section titled like
    ``word_entry.lang``, inside it a level-2 section titled like
    ``word_entry.pos_title``, and inside that every level-3 section whose
    lower-cased title maps to *linkage_type* in ``LINKAGE_SECTIONS``.  Each
    such section is processed by ``extract_linkage_section`` with
    *page_title* as the source.  Nothing happens when the page or its body
    is missing.
    """
    # NOTE(review): no cycle guard is visible here.  extract_linkage_section
    # can reach this function again through a "Wikisaurus:" link, so pages
    # that reference each other would presumably recurse - confirm whether
    # the data can contain such links.
    page = wxr.wtp.get_page(page_title, 0)
    if page is None or page.body is None:
        return

    def titled_children(parent, kind, wanted_title):
        """Yield children of *kind* whose cleaned title equals the wanted one."""
        for candidate in parent.find_child(kind):
            if clean_node(wxr, None, candidate.largs) == wanted_title:
                yield candidate

    root = wxr.wtp.parse(page.body)
    for lang_node in titled_children(root, NodeKind.LEVEL1, word_entry.lang):
        for pos_node in titled_children(
            lang_node, NodeKind.LEVEL2, word_entry.pos_title
        ):
            for section_node in pos_node.find_child(NodeKind.LEVEL3):
                section_title = clean_node(
                    wxr, None, section_node.largs
                ).lower()
                if LINKAGE_SECTIONS.get(section_title) == linkage_type:
                    extract_linkage_section(
                        wxr,
                        word_entry,
                        section_node,
                        linkage_type,
                        sense,
                        sense_index,
                        page_title,
                        tags,
                    )

262 

263 

def extract_phraseology_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract phraseology entries from the lists under *level_node*.

    A ``fraseini`` template sets the sense and sense index applied to the
    list items that follow it; until one is seen they are ``""`` and ``0``.
    """
    current_sense = ""
    current_index = 0
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode) and child.template_name == "fraseini":
            current_sense, current_index = extract_fraseini_template(wxr, child)
            continue
        if child.kind != NodeKind.LIST:
            # Any other template: nothing to extract.
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            extract_phraseology_list_item(
                wxr, word_entry, item, current_sense, current_index
            )

279 

280 

def extract_phraseology_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
) -> None:
    """Create a phraseology ``Linkage`` from *list_item* and recurse into sublists.

    The first bold or link child provides the word and italic children
    provide ``roman``.  The first plain string containing ``=`` (or,
    failing that, ``:``) starts the sense text: everything after that
    separator, plus all following non-list children, becomes the sense and
    scanning stops.  The entry is appended to ``word_entry.phraseology``
    when a word was found.  Items of nested lists are then processed with
    the *sense* and *sense_index* this call received.
    """
    entry = Linkage(word="", sense=sense, sense_index=sense_index)
    children = list_item.children
    for position, child in enumerate(children):
        if isinstance(child, str):
            if "=" in child:
                separator = "="
            elif ":" in child:
                separator = ":"
            else:
                continue
            following = [
                n
                for n in children[position + 1 :]
                if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST)
            ]
            sense_head = child[child.index(separator) + 1 :]
            entry.sense = clean_node(wxr, None, [sense_head] + following)
            # The rest of the item has been consumed as sense text.
            break
        if not isinstance(child, WikiNode):
            continue
        if child.kind in NodeKind.BOLD | NodeKind.LINK and entry.word == "":
            entry.word = clean_node(wxr, None, child)
        elif child.kind == NodeKind.ITALIC:
            entry.roman = clean_node(wxr, None, child)

    if entry.word != "":
        word_entry.phraseology.append(entry)

    for nested_list in list_item.find_child(NodeKind.LIST):
        for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
            extract_phraseology_list_item(
                wxr, word_entry, nested_item, sense, sense_index
            )

320 

321 

def extract_forms_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    section_text: str,
) -> None:
    """Collect ``Form`` entries from the list items under *level_node*.

    Links and templates whose name starts with ``"link "`` each produce a
    form, tagged with ``FORM_SECTION_TAGS[section_text]`` when that tag is
    not empty.  An ``escopo2`` template adds raw tags to the most recently
    added form of *word_entry*, if there is one.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for child in list_item.children:
                is_template = isinstance(child, TemplateNode)
                if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
                    form_text = clean_node(wxr, None, child)
                elif (
                    is_template
                    and child.template_name == "escopo2"
                    and len(word_entry.forms) > 0
                ):
                    from .pos import extract_escopo2_template

                    last_form = word_entry.forms[-1]
                    last_form.raw_tags.extend(
                        extract_escopo2_template(wxr, child)
                    )
                    translate_raw_tags(last_form)
                    continue
                elif is_template and child.template_name.startswith("link "):
                    form_text = clean_node(
                        wxr, None, child.template_parameters.get(1, "")
                    )
                else:
                    continue

                if form_text == "":
                    continue
                # The tag lookup stays here so it only happens when a form
                # is actually created.
                new_form = Form(form=form_text)
                section_tag = FORM_SECTION_TAGS[section_text]
                if section_tag != "":
                    new_form.tags.append(section_tag)
                word_entry.forms.append(new_form)