# Coverage for src/wiktextract/extractor/el/linkages.py: 90% (114 statements)

import re

from wikitextprocessor import TemplateNode, WikiNode
from wikitextprocessor.parser import NodeKind

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Form, Linkage, WordEntry
from .parse_utils import Heading

Node = str | WikiNode

LINK_RE = re.compile(r"(__/?[IL]__)")

EXAMPLES_RE = re.compile(r"(?sm)__E__(.*?)__/E__")
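
# The node handler functions below rewrite interesting nodes into plain
# text wrapped in pseudo-markers (__I__ italics, __L__ links, __E__
# example lines) so a flat string can be tokenized afterwards. Because
# LINK_RE has a capturing group, re.split() keeps the markers in its
# output at odd indices; an illustrative (hypothetical) input:
#   LINK_RE.split("x __L__foo__/L__ y")
#   -> ["x ", "__L__", "foo", "__/L__", " y"]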


def process_linkage_section(
    wxr: WiktextractContext,
    data: WordEntry,
    rnode: WikiNode,
    linkage_type: Heading,
) -> None:
    transliteration_template_data: list[Form] = []
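    # Filled by prehandle_templates_fn below when it meets eo-h/eo-x
    # templates; appended to data.forms for Transliterations sections.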

    def prehandle_templates_fn(
        node: WikiNode,
    ) -> list[Node] | None:
        """Handle nodes in the parse tree specially."""
        if not isinstance(node, TemplateNode):
            return None
        if node.template_name == "βλ":
            ret: list[Node] = []
            comma = False
            for arg in node.largs[1:]:
                if comma:
                    ret.append(", ")
                ret.append("__L__")
                ret.append(wxr.wtp.node_to_text(arg))
                ret.append("__/L__")
                comma = True
            return ret
        if node.template_name in ("eo-h", "eo-x"):
            transliteration_template_data.append(
                Form(
                    form="".join(
                        wxr.wtp.node_to_text(arg) for arg in node.largs[1]
                    ),
                    raw_tags=[
                        "H-sistemo"
                        if node.template_name == "eo-h"
                        else "X-sistemo"
                    ],
                    tags=["transliteration"],
                )
            )
            return []
        return None
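
    # Illustrative sketch (hypothetical wikitext): prehandle_templates_fn
    # rewrites a call like {{βλ|λέξη|άλλο}} into
    #   "__L__λέξη__/L__, __L__άλλο__/L__"
    # so the marker pass below sees each argument as a separate link,
    # with the comma keeping them from being chained into one entry.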

    def links_node_fn(
        node: WikiNode,
    ) -> list[Node] | None:
        """Handle nodes in the parse tree specially."""
        if node.kind == NodeKind.ITALIC:
            return ["__I__", *node.children, "__/I__"]
        if node.kind == NodeKind.LINK:
            if not isinstance(node.largs[0][0], str):
                return None
            return [
                "__L__",
                # Unpacking the node_to_text() string into a list
                # literal seems to be more performant than adding
                # lists together.
                *(
                    wxr.wtp.node_to_text(
                        node.largs[1:2] or node.largs[0],
                    )
                    # Output the "visible" half of the link.
                ),
                # XXX collect link data if it turns out to be important.
                "__/L__",
            ]
        if isinstance(node, TemplateNode) and node.template_name == "βλ":
            return node.children
        if node.kind == NodeKind.LIST_ITEM and node.sarg.endswith(":"):
            return [node.sarg, "__E__", *node.children, "__/E__\n"]
        return None

    # Parse the rendered wikitext again so that lists and list items,
    # including ones produced by template expansion, become proper nodes.
    reparsed = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(rnode, node_handler_fn=prehandle_templates_fn),
        expand_all=True,
    )

    combined_line_data: list[tuple[list[str], list[str], list[str]]] = []

    for list_item in reparsed.find_child_recursively(NodeKind.LIST_ITEM):
        text = wxr.wtp.node_to_text(list_item, node_handler_fn=links_node_fn)

        chained_links: list[str] = []
        line_tags: list[str] = []
        inside_link = False
        inside_italics = False
        interrupted_link = False

        examples = []
        for m in EXAMPLES_RE.finditer(text):
            example = re.sub(r"__/?[IL]__", "", m.group(1))
            parsed = wxr.wtp.parse(example)
            example = clean_node(wxr, None, parsed)
            example = example.strip(" \n*:⮡")
            examples.append(example)

        text = EXAMPLES_RE.sub("", text)
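
        # Tokenize the rest of the line on the __I__/__L__ markers;
        # even indices are text between markers, odd indices are the
        # markers themselves (LINK_RE keeps its capturing group).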
        for i, token in enumerate(LINK_RE.split(text)):
            token = token.strip()

            if not token:
                continue

            if i % 2 == 0:
                # Actual text, not __L__ or __/L__
                if inside_italics:
                    line_tags.append(token)
                    continue
                if inside_link is False and token:
                    # There's something between two link nodes.
                    interrupted_link = True
                    continue
                if inside_link is True:
                    if interrupted_link is True and len(chained_links) > 0:
                        combined_line_data.append(
                            (chained_links, line_tags, examples)
                        )
                        chained_links = [token]
                    else:
                        chained_links.append(token)
                    continue
            if token == "__I__":
                inside_italics = True
                continue
            if token == "__/I__":
                inside_italics = False
                continue
            if token == "__L__":
                inside_link = True
                continue
            if token == "__/L__":
                inside_link = False
                interrupted_link = False
                continue
        if chained_links:
            combined_line_data.append((chained_links, line_tags, examples))

    new_combined = []
    for link_parts, tags, examples in combined_line_data:
        if link_parts:
            new_combined.append((link_parts, tags, examples))
    combined_line_data = new_combined

    match linkage_type:
        case Heading.Related:
            target_field = data.related
        case Heading.Synonyms:
            target_field = data.synonyms
        case Heading.Antonyms:
            target_field = data.antonyms
        case Heading.Derived:
            target_field = data.derived
        case Heading.Transliterations:
            # For transliteration sections we add these to forms instead.
            data.forms.extend(
                Form(
                    form=" ".join(link_parts),
                    raw_tags=ltags,
                    tags=["transliteration"],
                )
                for link_parts, ltags, _ in combined_line_data
            )
            if transliteration_template_data:
                data.forms.extend(transliteration_template_data)
            return
        case _:
            wxr.wtp.error(
                "process_linkage_section() given unhandled Heading: "
                f"{linkage_type=}",
                sortid="linkages/83",
            )
            return

    target_field.extend(
        Linkage(word=" ".join(link_parts), raw_tags=ltags, examples=lexamples)
        for link_parts, ltags, lexamples in combined_line_data
    )

    # Iterate over list item lines and get links.

    # If links are next to each other with only whitespace between,
    # that's part of one entry.

    # If there's something that isn't a link in between, then they're
    # separate words.
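
    # Illustrative sketch of those chaining rules (hypothetical input):
    #   "__L__καλή__/L__ __L__μέρα__/L__"  -> one entry: word "καλή μέρα"
    #   "__L__καλή__/L__, __L__μέρα__/L__" -> two entries: "καλή" and "μέρα"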