Coverage for src/wiktextract/extractor/pt/linkage.py: 83%
141 statements
coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
import re

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Linkage, WordEntry
from .section_titles import LINKAGE_SECTIONS
from .tags import translate_raw_tags


def extract_expression_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_expression_list_item(wxr, word_entry, list_item)


def extract_expression_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
) -> None:
    from .pos import extract_gloss_list_item

    expression_data = Linkage(word="")
    sense_nodes = []
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
            # The bold node holds the expression itself.
            expression_data.word = clean_node(wxr, None, node)
        elif isinstance(node, str) and ":" in node:
            node = node.lstrip(": ")
            if node != "":
                sense_nodes.append(node)
        elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
            sense_nodes.append(node)

    # "escopo2" templates carry usage tags rather than gloss text, so they
    # are ignored when checking whether the item has an inline gloss.
    sense_str = clean_node(
        wxr,
        None,
        [
            n
            for n in sense_nodes
            if not (
                isinstance(n, TemplateNode) and n.template_name == "escopo2"
            )
        ],
    )
    if sense_str != "":
        gloss_list_item = WikiNode(NodeKind.LIST_ITEM, 0)
        gloss_list_item.children = sense_nodes
        for child_list in list_item.find_child(NodeKind.LIST):
            gloss_list_item.children.append(child_list)
        extract_gloss_list_item(wxr, expression_data, gloss_list_item)
    else:
        for child_list in list_item.find_child(NodeKind.LIST):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, expression_data, child_list_item)

    if expression_data.word != "":  # partial branch: always true in tests
        word_entry.expressions.append(expression_data)


def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    sense: str,
    sense_index: int,
    source: str,
    tags: list[str],
) -> None:
    for node in level_node.children:
        if isinstance(node, TemplateNode) and node.template_name == "fraseini":
            # A "fraseini" header template sets the sense (and optionally the
            # sense index) used for the linkage lists that follow it.
            sense, sense_index = extract_fraseini_template(wxr, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr,
                    word_entry,
                    list_item,
                    linkage_type,
                    sense,
                    sense_index,
                    source,
                    tags,
                )
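# Hypothetical call-site sketch (the real dispatcher lives elsewhere in the
# extractor and is not shown in this file): a linkage section header is
# assumed to be resolved through LINKAGE_SECTIONS to the name of a WordEntry
# list field, e.g. "synonyms", which is passed here as linkage_type:
#
#     extract_linkage_section(
#         wxr, word_entry, level_node, "synonyms",
#         sense="", sense_index=0, source="", tags=[],
#     )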


def extract_fraseini_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[str, int]:
    sense = ""
    sense_index = 0
    first_arg = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    m = re.search(r"\((\d+)\)$", first_arg)
    if m is not None:  # partial branch: never true in tests
        sense_index = int(m.group(1))
        sense = first_arg[: m.start()].strip()
    elif (m := re.match(r"De (\d+)", first_arg)) is not None:
        sense_index = int(m.group(1))
        sense = first_arg[m.end() :].strip("() \n")
    else:
        sense = first_arg
    return sense, sense_index
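# Illustrative note (the sample strings are hypothetical, not taken from real
# pages): under the rules above, a first argument like "expressão fixa (2)"
# would yield sense="expressão fixa" and sense_index=2, "De 2 (uso figurado)"
# would yield sense_index=2 and sense="uso figurado", and anything else is
# kept whole as the sense with sense_index left at 0.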


def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
    sense_index: int,
    source: str,
    tags: list[str],
) -> None:
    linkage_words = []
    raw_tags = []
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            match node.template_name:
                case "link preto":
                    word = clean_node(
                        wxr, None, node.template_parameters.get(1, "")
                    )
                    if word != "":  # partial branch: always true in tests
                        linkage_words.append(word)
                case "escopo2":  # partial branch: always matched in tests
                    from .pos import extract_escopo2_template

                    raw_tags.extend(extract_escopo2_template(wxr, node))
        elif isinstance(node, WikiNode):
            match node.kind:
                case NodeKind.LINK:
                    word = clean_node(wxr, None, node)
                    if word.startswith("Wikisaurus:"):
                        # Thesaurus links point to a separate page that is
                        # parsed for further linkage data.
                        extract_wikisaurus_page(
                            wxr,
                            word_entry,
                            word,
                            linkage_type,
                            sense,
                            sense_index,
                            tags,
                        )
                    elif word != "":  # partial branch: always true in tests
                        linkage_words.append(word)
                case NodeKind.BOLD:
                    bold_str = clean_node(wxr, None, node)
                    if re.fullmatch(r"\d+", bold_str):  # partial branch: always true in tests
                        sense_index = int(bold_str)
                case NodeKind.ITALIC:
                    raw_tag = clean_node(wxr, None, node)
                    if raw_tag.startswith("Wikisaurus:"):  # partial branch: never true in tests
                        extract_wikisaurus_page(
                            wxr,
                            word_entry,
                            raw_tag,
                            linkage_type,
                            sense,
                            sense_index,
                            tags,
                        )
                    elif raw_tag != "":  # partial branch: always true in tests
                        raw_tags.append(raw_tag)
                case NodeKind.LIST:  # partial branch: always matched in tests
                    for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                        extract_linkage_list_item(
                            wxr,
                            word_entry,
                            child_list_item,
                            linkage_type,
                            sense,
                            sense_index,
                            source,
                            tags,
                        )
        elif isinstance(node, str):  # partial branch: always true in tests
            # Plain text in parentheses overrides the sense for this item.
            m = re.search(r"\((.+)\)", node)
            if m is not None:
                sense = m.group(1)

    for word in linkage_words:
        linkage = Linkage(
            word=word,
            sense=sense,
            sense_index=sense_index,
            raw_tags=raw_tags,
            source=source,
            tags=tags,
        )
        translate_raw_tags(linkage)
        getattr(word_entry, linkage_type).append(linkage)


def extract_wikisaurus_page(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    page_title: str,
    linkage_type: str,
    sense: str,
    sense_index: int,
    tags: list[str],
) -> None:
    page = wxr.wtp.get_page(page_title, 0)
    if page is None or page.body is None:  # partial branch: always true in tests
        return
    root = wxr.wtp.parse(page.body)
    for level1_node in root.find_child(NodeKind.LEVEL1):
        lang_name = clean_node(wxr, None, level1_node.largs)
        if lang_name != word_entry.lang:
            continue
        for level2_node in level1_node.find_child(NodeKind.LEVEL2):
            pos_title = clean_node(wxr, None, level2_node.largs)
            if pos_title != word_entry.pos_title:
                continue
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                linkage_title = clean_node(wxr, None, level3_node.largs)
                if LINKAGE_SECTIONS.get(linkage_title) != linkage_type:
                    continue
                extract_linkage_section(
                    wxr,
                    word_entry,
                    level3_node,
                    linkage_type,
                    sense,
                    sense_index,
                    page_title,
                    tags,
                )
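# Design note: the Wikisaurus page is parsed with the same section logic as a
# regular entry, but only the sections matching the entry's language, part of
# speech, and linkage type are used, and the thesaurus page title is passed
# through as `source`, so linkages harvested there stay distinguishable from
# ones found on the entry page itself.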


def extract_phraseology_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    sense = ""
    sense_index = 0
    for node in level_node.find_child(NodeKind.LIST | NodeKind.TEMPLATE):
        if isinstance(node, TemplateNode) and node.template_name == "fraseini":
            sense, sense_index = extract_fraseini_template(wxr, node)
        elif node.kind == NodeKind.LIST:  # partial branch: always true in tests
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                extract_phraseology_list_item(
                    wxr, word_entry, list_item, sense, sense_index
                )


def extract_phraseology_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    sense: str,
    sense_index: int,
) -> None:
    l_data = Linkage(word="", sense=sense, sense_index=sense_index)
    for index, node in enumerate(list_item.children):
        if (
            isinstance(node, WikiNode)
            and node.kind in NodeKind.BOLD | NodeKind.LINK
            and l_data.word == ""
        ):
            l_data.word = clean_node(wxr, None, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            l_data.roman = clean_node(wxr, None, node)
        elif isinstance(node, str) and ("=" in node or ":" in node):
            # The text after "=" or ":" (plus the rest of the line, excluding
            # child lists) becomes this item's own sense.
            sense_start = node.index("=" if "=" in node else ":") + 1
            l_data.sense = clean_node(
                wxr,
                None,
                [node[sense_start:]]
                + [
                    n
                    for n in list_item.children[index + 1 :]
                    if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST)
                ],
            )
            break

    if l_data.word != "":  # partial branch: always true in tests
        word_entry.phraseology.append(l_data)

    # Nested list items are parsed as further phraseology entries with the
    # same sense context.
    for child_list in list_item.find_child(NodeKind.LIST):
        for next_list_item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_phraseology_list_item(
                wxr, word_entry, next_list_item, sense, sense_index
            )