Coverage for src/wiktextract/extractor/pt/linkage.py: 78%

1import re

3from wikitextprocessor.parser import (

4 LEVEL_KIND_FLAGS,

5 LevelNode,

6 NodeKind,

7 TemplateNode,

8 WikiNode,

11from ...page import clean_node

12from ...wxr_context import WiktextractContext

13from .models import Form, Linkage, WordEntry

14from .section_titles import FORM_SECTION_TAGS, LINKAGE_SECTIONS

15from .tags import translate_raw_tags

def extract_expression_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Process every list item found in the lists directly under a section.

    Each LIST_ITEM child of each LIST child of ``level_node`` is passed to
    ``extract_expression_list_item``, which records results on
    ``word_entry``.
    """
    # Lazily flatten "lists -> items" so items are still handled in
    # document order, one at a time.
    items = (
        item
        for expr_list in level_node.find_child(NodeKind.LIST)
        for item in expr_list.find_child(NodeKind.LIST_ITEM)
    )
    for item in items:
        extract_expression_list_item(wxr, word_entry, item)

def extract_expression_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
) -> None:
    """Build one expression ``Linkage`` from a list item and store it.

    The expression word comes from a bold node, or from the first link seen
    while no word is set yet. Links whose text starts with ``Wikisaurus:``
    are forwarded to ``extract_wikisaurus_page`` instead. The remaining
    non-list children form the gloss, which is handed to
    ``extract_gloss_list_item``. The entry is appended to
    ``word_entry.expressions`` only when a word was found.
    """
    # Function-scope import, kept as in the original
    # (presumably avoids a circular import with .pos - confirm).
    from .pos import extract_gloss_list_item

    entry = Linkage(word="")
    gloss_nodes = []
    for child in list_item.children:
        is_wiki = isinstance(child, WikiNode)

        if is_wiki and child.kind == NodeKind.BOLD:
            entry.word = clean_node(wxr, None, child)
            continue

        if isinstance(child, str) and ":" in child:
            # Drop the leading separator between the word and its gloss.
            text = child.lstrip(": ")
            if text != "":
                gloss_nodes.append(text)
            continue

        if is_wiki and child.kind == NodeKind.LINK:
            link_text = clean_node(wxr, None, child)
            if link_text.startswith("Wikisaurus:"):
                extract_wikisaurus_page(
                    wxr, word_entry, link_text, "expressions", "", 0, []
                )
            elif entry.word == "":
                entry.word = link_text
            else:
                gloss_nodes.append(child)
            continue

        if is_wiki and child.kind == NodeKind.LIST:
            # Nested lists are handled after the loop.
            continue

        gloss_nodes.append(child)

    # "escopo2" templates are ignored when deciding whether the item
    # carries gloss text of its own.
    text_nodes = [
        n
        for n in gloss_nodes
        if not (isinstance(n, TemplateNode) and n.template_name == "escopo2")
    ]
    has_inline_gloss = clean_node(wxr, None, text_nodes) != ""

    if has_inline_gloss:
        # Wrap the gloss nodes plus any nested lists in a synthetic list
        # item so the gloss extractor can process them together.
        synthetic_item = WikiNode(NodeKind.LIST_ITEM, 0)
        synthetic_item.children = gloss_nodes
        synthetic_item.children.extend(list_item.find_child(NodeKind.LIST))
        extract_gloss_list_item(wxr, entry, synthetic_item)
    else:
        # No inline gloss: each nested list item is a gloss on its own.
        for nested_list in list_item.find_child(NodeKind.LIST):
            for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, entry, nested_item)

    if entry.word != "":
        word_entry.expressions.append(entry)

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    sense: str,
    sense_index: int,
    source: str,
    tags: list[str],
) -> None:
    """Walk a linkage section and extract linkages from its lists.

    A ``fraseini`` template replaces the current ``sense`` and
    ``sense_index`` for the lists that follow it. List children are
    processed item by item with ``extract_linkage_list_item``. Child nodes
    whose kind is in ``LEVEL_KIND_FLAGS`` are processed recursively with
    the sense state current at that point.
    """
    for child in level_node.children:
        if isinstance(child, TemplateNode) and child.template_name == "fraseini":
            sense, sense_index = extract_fraseini_template(wxr, child)
            continue

        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr,
                    word_entry,
                    item,
                    linkage_type,
                    sense,
                    sense_index,
                    source,
                    tags,
                )
        elif child.kind in LEVEL_KIND_FLAGS:
            extract_linkage_section(
                wxr,
                word_entry,
                child,
                linkage_type,
                sense,
                sense_index,
                source,
                tags,
            )

119

120

def extract_fraseini_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[str, int]:
    """Parse the first argument of a ``fraseini`` template.

    Returns:
        A ``(sense, sense_index)`` tuple.

        * Argument ending in ``(N)``: ``sense_index`` is ``N`` and ``sense``
          is the stripped text before the parenthesis.
        * Argument starting with ``De N``: ``sense_index`` is ``N`` and
          ``sense`` is the remainder with surrounding parentheses, spaces
          and newlines stripped.
        * Anything else: ``sense`` is the whole cleaned argument and
          ``sense_index`` is 0.
    """
    sense = ""
    sense_index = 0
    first_arg = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    # The parentheses must be escaped to match literally. The previous
    # pattern began with "$" (end-of-string anchor) followed by required
    # digits, so it could never match and this branch was dead.
    m = re.search(r"\((\d+)\)$", first_arg)
    if m is not None:
        sense_index = int(m.group(1))
        sense = first_arg[: m.start()].strip()
    elif (m := re.match(r"De (\d+)", first_arg)) is not None:
        sense_index = int(m.group(1))
        sense = first_arg[m.end() :].strip("() \n")
    else:
        sense = first_arg
    return sense, sense_index

137

138

139def extract_linkage_list_item(

140 wxr: WiktextractContext,

141 word_entry: WordEntry,

142 list_item: WikiNode,

143 linkage_type: str,

144 sense: str,

145 sense_index: int,

146 source: str,

147 tags: list[str],

148) -> None:

149 linkage_words = []

150 raw_tags = []

151 for node in list_item.children:

152 if isinstance(node, TemplateNode):

153 if node.template_name.startswith("link "):

154 word = clean_node(

155 wxr, None, node.template_parameters.get(1, "")

156 )

157 if word != "": 157 ↛ 151line 157 didn't jump to line 151 because the condition on line 157 was always true

158 linkage_words.append(word)

159 elif node.template_name == "escopo2": 159 ↛ 151line 159 didn't jump to line 151 because the condition on line 159 was always true

160 from .pos import extract_escopo2_template

161

162 raw_tags.extend(extract_escopo2_template(wxr, node))

163 elif isinstance(node, WikiNode):

164 match node.kind:

165 case NodeKind.LINK:

166 word = clean_node(wxr, None, node)

167 if word.startswith("Wikisaurus:"):

168 extract_wikisaurus_page(

169 wxr,

170 word_entry,

171 word,

172 linkage_type,

173 sense,

174 sense_index,

175 tags,

176 )

177 elif word != "": 177 ↛ 151line 177 didn't jump to line 151 because the condition on line 177 was always true

178 linkage_words.append(word)

179 case NodeKind.BOLD:

180 bold_str = clean_node(wxr, None, node)

181 if re.fullmatch(r"\d+", bold_str): 181 ↛ 151line 181 didn't jump to line 151 because the condition on line 181 was always true

182 sense_index = int(bold_str)

183 case NodeKind.ITALIC:

184 raw_tag = clean_node(wxr, None, node)

185 if raw_tag.startswith("Wikisaurus:"): 185 ↛ 186line 185 didn't jump to line 186 because the condition on line 185 was never true

186 extract_wikisaurus_page(

187 wxr,

188 word_entry,

189 raw_tag,

190 linkage_type,

191 sense,

192 sense_index,

193 tags,

194 )

195 elif raw_tag != "": 195 ↛ 151line 195 didn't jump to line 151 because the condition on line 195 was always true

196 raw_tags.append(raw_tag)

197 case NodeKind.LIST: 197 ↛ 151line 197 didn't jump to line 151 because the pattern on line 197 always matched

198 for child_list_item in node.find_child(NodeKind.LIST_ITEM):

199 extract_linkage_list_item(

200 wxr,

201 word_entry,

202 child_list_item,

203 linkage_type,

204 sense,

205 sense_index,

206 source,

207 tags,

208 )

209 elif isinstance(node, str): 209 ↛ 151line 209 didn't jump to line 151 because the condition on line 209 was always true

210 m = re.search(r"$(.+)$", node)

211 if m is not None:

212 sense = m.group(1)

213

214 for word in linkage_words:

215 linkage = Linkage(

216 word=word,

217 sense=sense,

218 sense_index=sense_index,

219 raw_tags=raw_tags,

220 source=source,

221 tags=tags,

222 )

223 translate_raw_tags(linkage)

224 getattr(word_entry, linkage_type).append(linkage)

225

226

def extract_wikisaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    linkage_type: str,
    sense: str,
    sense_index: int,
    tags: list[str],
) -> None:
    """Extract linkages of ``linkage_type`` from a Wikisaurus page.

    The page is fetched with ``wxr.wtp.get_page(page_title, 0)``; nothing
    happens if the page or its body is missing. Only level-1 sections whose
    title equals ``word_entry.lang``, level-2 sections whose title equals
    ``word_entry.pos_title``, and level-3 sections whose lower-cased title
    maps to ``linkage_type`` in ``LINKAGE_SECTIONS`` are processed.
    ``page_title`` is passed on as the linkage source.
    """
    page = wxr.wtp.get_page(page_title, 0)
    if page is None or page.body is None:
        return

    def titled_children(parent, kind, accepts):
        # Yield children of `kind` whose cleaned heading passes `accepts`.
        for section in parent.find_child(kind):
            if accepts(clean_node(wxr, None, section.largs)):
                yield section

    def is_lang(title):
        return title == word_entry.lang

    def is_pos(title):
        return title == word_entry.pos_title

    def is_linkage(title):
        return LINKAGE_SECTIONS.get(title.lower()) == linkage_type

    root = wxr.wtp.parse(page.body)
    for lang_node in titled_children(root, NodeKind.LEVEL1, is_lang):
        for pos_node in titled_children(lang_node, NodeKind.LEVEL2, is_pos):
            for linkage_node in titled_children(
                pos_node, NodeKind.LEVEL3, is_linkage
            ):
                extract_linkage_section(
                    wxr,
                    word_entry,
                    linkage_node,
                    linkage_type,
                    sense,
                    sense_index,
                    page_title,
                    tags,
                )

262

263

def extract_phraseology_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract phraseology entries from the lists of a section.

    Starts with an empty sense and sense index 0. Each ``fraseini``
    template replaces that pair for the lists that follow it. Every item of
    every list is passed to ``extract_phraseology_list_item``.
    """
    # (sense, sense_index) currently in effect.
    current = ("", 0)
    for child in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode) and child.template_name == "fraseini":
            current = extract_fraseini_template(wxr, child)
            continue
        if child.kind != NodeKind.LIST:
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            extract_phraseology_list_item(wxr, word_entry, item, *current)

279

280

def extract_phraseology_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
) -> None:
    """Extract one phraseology ``Linkage`` from a list item, then recurse.

    The first bold or link node gives the word and an italic node gives
    ``roman``. The first plain string containing ``=`` (or otherwise
    ``:``) starts the sense: everything after that separator, plus all
    following non-list siblings, replaces the inherited ``sense``. The
    entry is appended to ``word_entry.phraseology`` when a word was found.
    Nested list items are then processed with the caller's original
    ``sense`` and ``sense_index``.
    """
    entry = Linkage(word="", sense=sense, sense_index=sense_index)
    children = list_item.children
    for position, child in enumerate(children):
        if isinstance(child, WikiNode):
            if child.kind in NodeKind.BOLD | NodeKind.LINK and entry.word == "":
                entry.word = clean_node(wxr, None, child)
            elif child.kind == NodeKind.ITALIC:
                entry.roman = clean_node(wxr, None, child)
            continue

        if not isinstance(child, str):
            continue

        # "=" takes precedence over ":" as the word/sense separator.
        separator = "=" if "=" in child else ":"
        if separator not in child:
            continue

        _, _, after_separator = child.partition(separator)
        trailing = [
            n
            for n in children[position + 1 :]
            if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST)
        ]
        entry.sense = clean_node(wxr, None, [after_separator] + trailing)
        # The rest of the item has been consumed as sense text.
        break

    if entry.word != "":
        word_entry.phraseology.append(entry)

    for nested_list in list_item.find_child(NodeKind.LIST):
        for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
            extract_phraseology_list_item(
                wxr, word_entry, nested_item, sense, sense_index
            )

320

321

def extract_forms_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    section_text: str,
) -> None:
    """Collect word forms from the lists of a forms section.

    Link nodes and ``link ...`` templates each add a ``Form`` to
    ``word_entry.forms``, tagged with ``FORM_SECTION_TAGS[section_text]``
    when that tag is non-empty. An ``escopo2`` template adds raw tags to
    the most recently added form and is ignored while no form exists.
    """

    def add_form(word: str) -> None:
        # Shared by the link-node and "link ..." template branches.
        if word == "":
            return
        new_form = Form(form=word)
        section_tag = FORM_SECTION_TAGS[section_text]
        if section_tag != "":
            new_form.tags.append(section_tag)
        word_entry.forms.append(new_form)

    for forms_list in level_node.find_child(NodeKind.LIST):
        for item in forms_list.find_child(NodeKind.LIST_ITEM):
            for child in item.children:
                if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
                    add_form(clean_node(wxr, None, child))
                elif not isinstance(child, TemplateNode):
                    continue
                elif (
                    child.template_name == "escopo2"
                    and len(word_entry.forms) > 0
                ):
                    from .pos import extract_escopo2_template

                    latest = word_entry.forms[-1]
                    latest.raw_tags.extend(
                        extract_escopo2_template(wxr, child)
                    )
                    translate_raw_tags(latest)
                elif child.template_name.startswith("link "):
                    add_form(
                        clean_node(
                            wxr, None, child.template_parameters.get(1, "")
                        )
                    )