Coverage for src/wiktextract/extractor/el/linkages.py: 91%
121 statements
coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
import re

from wikitextprocessor import TemplateNode, WikiNode
from wikitextprocessor.parser import NodeKind

from wiktextract.extractor.el.tags import translate_raw_tags
from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Form, Linkage, WordEntry
from .section_titles import Heading

Node = str | WikiNode

LINK_RE = re.compile(r"(__/?[IL]__)")

EXAMPLES_RE = re.compile(r"(?sm)__E__(.*?)__/E__")
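# Worked example (hypothetical input): because LINK_RE keeps its capturing
# group, re.split() alternates text and marker tokens, so markers always
# land at odd indices:
#   LINK_RE.split("__L__foo__/L__, __L__bar__/L__")
#   # -> ["", "__L__", "foo", "__/L__", ", ", "__L__", "bar", "__/L__", ""]
# The token loop in process_linkage_section() relies on this parity
# (i % 2 == 0 means plain text).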

def process_linkage_section(
    wxr: WiktextractContext,
    data: WordEntry,
    rnode: WikiNode,
    linkage_type: Heading,
) -> None:
    transliteration_template_data: list[Form] = []

    def prehandle_templates_fn(
        node: WikiNode,
    ) -> list[Node] | None:
        """Handle nodes in the parse tree specially."""
        # print(f"{node=}")
        if not isinstance(node, TemplateNode):
            return None
        if node.template_name == "βλ":
            # print("REACHED")
            # print(f"{node.largs=}")
            ret: list[Node] = []
            # print(f"{ret=}")
            comma = False
            for arg in node.largs[1:]:
                if comma:
                    ret.append(", ")
                ret.append("__L__")
                ret.append(wxr.wtp.node_to_text(arg))
                ret.append("__/L__")
                comma = True
            return ret
        if node.template_name in ("eo-h", "eo-x"):
            transliteration_template_data.append(
                Form(
                    form="".join(
                        wxr.wtp.node_to_text(arg) for arg in node.largs[1]
                    ),
                    raw_tags=[
                        "H-sistemo"
                        if node.template_name == "eo-h"
                        else "X-sistemo"
                    ],
                    tags=["transliteration"],
                    source="linkage",
                )
            )
            return []
        return None
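    # Worked example (hypothetical input): {{βλ|λέξη|άλλη}} serializes to
    # "__L__λέξη__/L__, __L__άλλη__/L__"; the ", " between the marked links
    # later registers as an interruption in the token loop, so each template
    # argument becomes its own entry.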
    def links_node_fn(
        node: WikiNode,
    ) -> list[Node] | None:
        """Handle nodes in the parse tree specially."""
        # print(f"{node=}")
        if node.kind == NodeKind.ITALIC:
            return ["__I__", *node.children, "__/I__"]
        if node.kind == NodeKind.LINK:
            if not isinstance(node.largs[0][0], str):
                return None
            return [
                "__L__",
                # Unpacking into a list literal seems to be more
                # performant than adding lists together.
                *(
                    wxr.wtp.node_to_text(
                        node.largs[1:2] or node.largs[0],
                    )
                    # Output the "visible" half of the link: the display
                    # text if present, otherwise the link target.
                ),
                # XXX collect link data if it turns out to be important.
                "__/L__",
            ]
        # print(f"{node.largs=}")
        if isinstance(node, TemplateNode) and node.template_name == "βλ":
            # print("REACHED")
            # print(f"{node=}")
            return node.children
        if node.kind == NodeKind.LIST_ITEM and node.sarg.endswith(":"):
            return [node.sarg, "__E__", *node.children, "__/E__\n"]
        return None
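    # Worked example (hypothetical input): a "#:" sub-item holding an example
    # sentence is re-emitted as "#:__E__...__/E__\n"; EXAMPLES_RE then lifts
    # the example text out of the line before the link tokens are split.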
    # parse nodes to get lists and list_items
    reparsed = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(rnode, node_handler_fn=prehandle_templates_fn),
        expand_all=True,
    )
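    # The section is processed in two passes: prehandle_templates_fn runs
    # while the tree is serialized back to wikitext (expanding βλ links and
    # capturing eo-h/eo-x transliterations), and the reparsed tree is then
    # walked list item by list item with links_node_fn.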
    combined_line_data: list[tuple[list[str], list[str], list[str]]] = []

    for list_item in reparsed.find_child_recursively(NodeKind.LIST_ITEM):
        # print(f"{list_item=}")
        text = wxr.wtp.node_to_text(list_item, node_handler_fn=links_node_fn)

        chained_links: list[str] = []
        line_tags: list[str] = []
        inside_link = False
        inside_italics = False
        interrupted_link = False

        examples = []
        for m in EXAMPLES_RE.finditer(text):
            example = re.sub(r"__/?[IL]__", "", m.group(1))
            parsed = wxr.wtp.parse(example)
            example = clean_node(wxr, None, parsed)
            example = example.strip(" \n*:⮡")
            examples.append(example)

        text = EXAMPLES_RE.sub("", text)

        for i, token in enumerate(LINK_RE.split(text)):
            # print(f"{token=}")
            token = token.strip()

            if not token:
                continue

            if i % 2 == 0:
                # Actual text, not __L__ or __/L__
                # print(f"{i=}, {token=}, {line_tags=}")
                if inside_italics:
                    line_tags.append(token)
                    continue
                if inside_link is False and token:
                    # There's something between two link nodes
                    interrupted_link = True
                    continue
                if inside_link is True:
                    if interrupted_link is True and len(chained_links) > 0:
                        combined_line_data.append(
                            (chained_links, line_tags, examples)
                        )
                        chained_links = [token]
                    else:
                        chained_links.append(token)
                    continue
            if token == "__I__":
                inside_italics = True
                continue
            if token == "__/I__":
                inside_italics = False
                continue
            if token == "__L__":
                inside_link = True
                continue
            if token == "__/L__":
                inside_link = False
                interrupted_link = False
                continue
        if chained_links:
            combined_line_data.append((chained_links, line_tags, examples))
    new_combined = []
    for link_parts, tags, examples in combined_line_data:
        if link_parts:
            new_combined.append((link_parts, tags, examples))
    combined_line_data = new_combined

    match linkage_type:
        case Heading.Related:
            target_field = data.related
        case Heading.Synonyms:
            target_field = data.synonyms
        case Heading.Antonyms:
            target_field = data.antonyms
        case Heading.Derived:
            target_field = data.derived
        case Heading.Transliterations:
            # For transliteration sections we add these to forms instead.
            transliteration_forms = [
                Form(
                    form=" ".join(link_parts),
                    raw_tags=ltags,
                    tags=["transliteration"],
                    source="linkage",
                )
                for link_parts, ltags, _ in combined_line_data
            ]
            for form in transliteration_forms:
                translate_raw_tags(form)
            data.forms.extend(transliteration_forms)
            if transliteration_template_data:
                data.forms.extend(transliteration_template_data)
            return
        case _:
            wxr.wtp.error(
                "process_linkage_section() given unhandled Heading: "
                f"{linkage_type=}",
                sortid="linkages/83",
            )
            return

    linkages = [
        Linkage(word=" ".join(link_parts), raw_tags=ltags, examples=lexamples)
        for link_parts, ltags, lexamples in combined_line_data
    ]
    for linkage in linkages:
        translate_raw_tags(linkage)
    target_field.extend(linkages)
    # Iterate over list-item lines and collect their links: links that sit
    # next to each other with only whitespace between them form one
    # multi-word entry; anything that is not a link between them splits
    # them into separate words.
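    # A minimal sketch of that chaining rule (hypothetical tokens):
    #   "__L__όρος__/L__ __L__δεύτερος__/L__"  -> one entry, "όρος δεύτερος"
    #   "__L__όρος__/L__, __L__δεύτερος__/L__" -> two entries
    # Whitespace between links strips to an empty token and is skipped, so
    # the chain continues; the comma survives token.strip() and sets
    # interrupted_link, which flushes the current chain.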