Coverage for src/wiktextract/extractor/pt/linkage.py: 78% (173 statements)
coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
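
# Extraction of linkage-related sections (expression lists, generic linkage
# sections, phraseology, Wikisaurus pages, and alternative-form sections) for
# the Portuguese Wiktionary extractor.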
import re

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Form, Linkage, WordEntry
from .section_titles import FORM_SECTION_TAGS, LINKAGE_SECTIONS
from .tags import translate_raw_tags
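

# Walk the lists in an expression section and process each list item.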
def extract_expression_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_expression_list_item(wxr, word_entry, list_item)
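

# Parse a single expression list item: bold text (or a plain link) supplies the
# expression, the text after ":" plus any nested list becomes its gloss, and
# "Wikisaurus:" links are expanded via extract_wikisaurus_page.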
def extract_expression_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
) -> None:
    from .pos import extract_gloss_list_item

    expression_data = Linkage(word="")
    sense_nodes = []
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
            expression_data.word = clean_node(wxr, None, node)
        elif isinstance(node, str) and ":" in node:
            node = node.lstrip(": ")
            if node != "":
                sense_nodes.append(node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:  # coverage: condition never true
            link_str = clean_node(wxr, None, node)
            if link_str.startswith("Wikisaurus:"):
                extract_wikisaurus_page(
                    wxr, word_entry, link_str, "expressions", "", 0, []
                )
            elif expression_data.word == "":
                expression_data.word = link_str
            else:
                sense_nodes.append(node)
        elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
            sense_nodes.append(node)

    sense_str = clean_node(
        wxr,
        None,
        [
            n
            for n in sense_nodes
            if not (
                isinstance(n, TemplateNode) and n.template_name == "escopo2"
            )
        ],
    )
    if sense_str != "":
        gloss_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
        gloss_list_item.children = sense_nodes
        for child_list in list_item.find_child(NodeKind.LIST):
            gloss_list_item.children.append(child_list)
        extract_gloss_list_item(wxr, expression_data, gloss_list_item)
    else:
        for child_list in list_item.find_child(NodeKind.LIST):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, expression_data, child_list_item)

    if expression_data.word != "":  # coverage: condition always true
        word_entry.expressions.append(expression_data)
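

# Generic handler for linkage sections: a "fraseini" template updates the
# current sense, list items are parsed for linkage words, and nested level
# headings are handled recursively.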
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    sense: str,
    sense_index: int,
    source: str,
    tags: list[str],
) -> None:
    for node in level_node.children:
        if isinstance(node, TemplateNode) and node.template_name == "fraseini":
            sense, sense_index = extract_fraseini_template(wxr, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr,
                    word_entry,
                    list_item,
                    linkage_type,
                    sense,
                    sense_index,
                    source,
                    tags,
                )
        elif isinstance(node, WikiNode) and node.kind in LEVEL_KIND_FLAGS:  # coverage: condition never true
            extract_linkage_section(
                wxr,
                word_entry,
                node,
                linkage_type,
                sense,
                sense_index,
                source,
                tags,
            )
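

# Read the first argument of a "fraseini" template and split it into a sense
# string and an optional numeric sense index, e.g. "... (1)" or "De 1 ...".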
def extract_fraseini_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[str, int]:
    sense = ""
    sense_index = 0
    first_arg = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    m = re.search(r"\((\d+)\)$", first_arg)
    if m is not None:  # coverage: condition never true
        sense_index = int(m.group(1))
        sense = first_arg[: m.start()].strip()
    elif (m := re.match(r"De (\d+)", first_arg)) is not None:
        sense_index = int(m.group(1))
        sense = first_arg[m.end() :].strip("() \n")
    else:
        sense = first_arg
    return sense, sense_index
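

# Parse one linkage list item: "link ..." templates and plain links contribute
# words, bold digits set the sense index, italic text becomes a raw tag,
# "escopo2" templates add raw tags, parenthesized text is the sense, and nested
# lists are processed recursively.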
def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
    sense_index: int,
    source: str,
    tags: list[str],
) -> None:
    linkage_words = []
    raw_tags = []
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name.startswith("link "):
                word = clean_node(
                    wxr, None, node.template_parameters.get(1, "")
                )
                if word != "":  # coverage: condition always true
                    linkage_words.append(word)
            elif node.template_name == "escopo2":  # coverage: condition always true
                from .pos import extract_escopo2_template

                raw_tags.extend(extract_escopo2_template(wxr, node))
        elif isinstance(node, WikiNode):
            match node.kind:
                case NodeKind.LINK:
                    word = clean_node(wxr, None, node)
                    if word.startswith("Wikisaurus:"):
                        extract_wikisaurus_page(
                            wxr,
                            word_entry,
                            word,
                            linkage_type,
                            sense,
                            sense_index,
                            tags,
                        )
                    elif word != "":  # coverage: condition always true
                        linkage_words.append(word)
                case NodeKind.BOLD:
                    bold_str = clean_node(wxr, None, node)
                    if re.fullmatch(r"\d+", bold_str):  # coverage: condition always true
                        sense_index = int(bold_str)
                case NodeKind.ITALIC:
                    raw_tag = clean_node(wxr, None, node)
                    if raw_tag.startswith("Wikisaurus:"):  # coverage: condition never true
                        extract_wikisaurus_page(
                            wxr,
                            word_entry,
                            raw_tag,
                            linkage_type,
                            sense,
                            sense_index,
                            tags,
                        )
                    elif raw_tag != "":  # coverage: condition always true
                        raw_tags.append(raw_tag)
                case NodeKind.LIST:  # coverage: this pattern always matched
                    for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                        extract_linkage_list_item(
                            wxr,
                            word_entry,
                            child_list_item,
                            linkage_type,
                            sense,
                            sense_index,
                            source,
                            tags,
                        )
        elif isinstance(node, str):  # coverage: condition always true
            m = re.search(r"\((.+)\)", node)
            if m is not None:
                sense = m.group(1)

    for word in linkage_words:
        linkage = Linkage(
            word=word,
            sense=sense,
            sense_index=sense_index,
            raw_tags=raw_tags,
            source=source,
            tags=tags,
        )
        translate_raw_tags(linkage)
        getattr(word_entry, linkage_type).append(linkage)
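

# Load a Wikisaurus page and reuse extract_linkage_section on the subsection
# that matches the current language, part of speech and linkage type, recording
# the Wikisaurus page title as the source.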
def extract_wikisaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    linkage_type: str,
    sense: str,
    sense_index: int,
    tags: list[str],
) -> None:
    page = wxr.wtp.get_page(page_title, 0)
    if page is None or page.body is None:  # coverage: condition always true
        return
    root = wxr.wtp.parse(page.body)
    for level1_node in root.find_child(NodeKind.LEVEL1):
        lang_name = clean_node(wxr, None, level1_node.largs)
        if lang_name != word_entry.lang:
            continue
        for level2_node in level1_node.find_child(NodeKind.LEVEL2):
            pos_title = clean_node(wxr, None, level2_node.largs)
            if pos_title != word_entry.pos_title:
                continue
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                linkage_title = clean_node(wxr, None, level3_node.largs).lower()
                if LINKAGE_SECTIONS.get(linkage_title) != linkage_type:
                    continue
                extract_linkage_section(
                    wxr,
                    word_entry,
                    level3_node,
                    linkage_type,
                    sense,
                    sense_index,
                    page_title,
                    tags,
                )
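

# Phraseology section: "fraseini" templates set the current sense, then each
# list item is parsed as a phraseology entry.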
def extract_phraseology_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    sense = ""
    sense_index = 0
    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(node, TemplateNode) and node.template_name == "fraseini":
            sense, sense_index = extract_fraseini_template(wxr, node)
        elif node.kind == NodeKind.LIST:  # coverage: condition always true
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_phraseology_list_item(
                    wxr, word_entry, list_item, sense, sense_index
                )
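

# One phraseology entry: bold or linked text is the phrase, italic text is the
# romanization, and the text after "=" or ":" is the sense; nested list items
# are processed recursively.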
def extract_phraseology_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
) -> None:
    l_data = Linkage(word="", sense=sense, sense_index=sense_index)
    for index, node in enumerate(list_item.children):
        if (
            isinstance(node, WikiNode)
            and node.kind in NodeKind.BOLD | NodeKind.LINK
            and l_data.word == ""
        ):
            l_data.word = clean_node(wxr, None, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            l_data.roman = clean_node(wxr, None, node)
        elif isinstance(node, str) and ("=" in node or ":" in node):
            sense_start = node.index("=" if "=" in node else ":") + 1
            l_data.sense = clean_node(
                wxr,
                None,
                [node[sense_start:]]
                + [
                    n
                    for n in list_item.children[index + 1 :]
                    if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST)
                ],
            )
            break

    if l_data.word != "":  # coverage: condition always true
        word_entry.phraseology.append(l_data)

    for child_list in list_item.find_child(NodeKind.LIST):
        for next_list_item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_phraseology_list_item(
                wxr, word_entry, next_list_item, sense, sense_index
            )
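

# Form sections (see FORM_SECTION_TAGS): each linked word or "link ..." template
# becomes a Form tagged for the section, and a following "escopo2" template adds
# raw tags to the most recently added form.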
def extract_forms_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    section_text: str,
) -> None:
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for node in list_item.children:
                if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    word = clean_node(wxr, None, node)
                    if word != "":  # coverage: condition always true
                        form = Form(form=word)
                        tag = FORM_SECTION_TAGS[section_text]
                        if tag != "":  # coverage: condition always true
                            form.tags.append(tag)
                        word_entry.forms.append(form)
                elif (
                    isinstance(node, TemplateNode)
                    and node.template_name == "escopo2"
                    and len(word_entry.forms) > 0
                ):
                    from .pos import extract_escopo2_template

                    word_entry.forms[-1].raw_tags.extend(
                        extract_escopo2_template(wxr, node)
                    )
                    translate_raw_tags(word_entry.forms[-1])
                elif isinstance(  # coverage: condition never true
                    node, TemplateNode
                ) and node.template_name.startswith("link "):
                    word = clean_node(
                        wxr, None, node.template_parameters.get(1, "")
                    )
                    if word != "":
                        form = Form(form=word)
                        tag = FORM_SECTION_TAGS[section_text]
                        if tag != "":
                            form.tags.append(tag)
                        word_entry.forms.append(form)