Coverage for src / wiktextract / extractor / el / linkages.py: 92%
134 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import re
2from typing import Literal
4from wikitextprocessor import TemplateNode, WikiNode
5from wikitextprocessor.parser import NodeKind
7from wiktextract.extractor.el.tags import translate_raw_tags
8from wiktextract.page import clean_node
9from wiktextract.wxr_context import WiktextractContext
11from .models import AltForm, Form, Linkage, WordEntry
12from .section_titles import Heading
# A node in the parse tree is either raw wikitext or a WikiNode.
Node = str | WikiNode

# Matches the pseudo-markers injected by the node-handler functions below:
# __I__ / __/I__ wrap italics and __L__ / __/L__ wrap links.  The capturing
# group keeps the markers themselves in re.split() output, so text and
# markers alternate (markers at odd indices).
LINK_RE = re.compile(r"(__/?[IL]__)")

# Matches example lines wrapped in __E__ ... __/E__ markers; (?s) lets "."
# cross newlines so multi-line examples are captured whole.
EXAMPLES_RE = re.compile(r"(?sm)__E__(.*?)__/E__")
# Heading variants accepted by process_linkage_section(); anything else is
# reported as an error by that function's fallback match arm.
LinkageType = Literal[
    Heading.Related,
    Heading.Synonyms,
    Heading.Antonyms,
    Heading.Transliterations,
    Heading.AltOf,
    Heading.FormOf,
]
"""Headings variants supported by process_linkage_section."""
31def process_linkage_section(
32 wxr: WiktextractContext,
33 data: WordEntry,
34 rnode: WikiNode,
35 linkage_type: LinkageType,
36) -> None:
37 esperanto_template_data: list[Form] = []
39 def prehandle_templates_fn(
40 node: WikiNode,
41 ) -> list[Node] | None:
42 """Handle nodes in the parse tree specially."""
43 # print(f"{node=}")
44 if not isinstance(node, TemplateNode):
45 return None
46 if node.template_name == "βλ":
47 # print("REACHED")
48 # print(f"{node.largs=}")
49 ret: list[Node] = []
50 comma = False
51 for k, v in node.template_parameters.items():
52 if not isinstance(k, int):
53 continue
54 if comma:
55 ret.append(", ")
56 if isinstance(v, list): 56 ↛ 57line 56 didn't jump to line 57 because the condition on line 56 was never true
57 ret.extend(["__L__", *v, "__/L__"])
58 else:
59 ret.extend(["__L__", v, "__/L__"])
60 comma = True
61 return ret
62 if node.template_name in ("eo-h", "eo-x"):
63 esperanto_template_data.append(
64 Form(
65 form="".join(
66 wxr.wtp.node_to_text(arg) for arg in node.largs[1]
67 ),
68 raw_tags=[
69 "H-sistemo"
70 if node.template_name == "eo-h"
71 else "X-sistemo"
72 ],
73 tags=["transliteration"],
74 source="linkage",
75 )
76 )
77 return []
78 return None
80 def links_node_fn(
81 node: WikiNode,
82 ) -> list[Node] | None:
83 """Handle nodes in the parse tree specially."""
84 # print(f"{node=}")
85 if node.kind == NodeKind.ITALIC:
86 return ["__I__", *node.children, "__/I__"]
87 if node.kind == NodeKind.LINK:
88 if not isinstance(node.largs[0][0], str): 88 ↛ 89line 88 didn't jump to line 89 because the condition on line 88 was never true
89 return None
90 return [
91 "__L__",
92 # unpacking a list-comprehension, unpacking into a list
93 # seems to be more performant than adding lists together.
94 *(
95 wxr.wtp.node_to_text(
96 node.largs[1:2] or node.largs[0],
97 )
98 # output the "visible" half of the link.
99 ),
100 # XXX collect link data if it turns out to be important.
101 "__/L__",
102 ]
103 # print(f"{node.largs=}")
104 if isinstance(node, TemplateNode) and node.template_name == "βλ": 104 ↛ 107line 104 didn't jump to line 107 because the condition on line 104 was never true
105 # print("REACHED")
106 # print(f"{node=}")
107 return node.children
108 if node.kind == NodeKind.LIST_ITEM and node.sarg.endswith(":"):
109 return [node.sarg, "__E__", *node.children, "__/E__\n"]
110 return None
112 # parse nodes to get lists and list_items
113 reparsed = wxr.wtp.parse(
114 wxr.wtp.node_to_wikitext(rnode, node_handler_fn=prehandle_templates_fn),
115 expand_all=True,
116 )
118 combined_line_data: list[tuple[list[str], list[str], list[str]]] = []
120 for list_item in reparsed.find_child_recursively(NodeKind.LIST_ITEM):
121 # print(f"{list_item=}")
122 text = wxr.wtp.node_to_text(list_item, node_handler_fn=links_node_fn)
124 chained_links: list[str] = []
125 line_tags: list[str] = []
126 inside_link = False
127 inside_italics = False
128 interrupted_link = False
130 examples = []
131 for m in EXAMPLES_RE.finditer(text):
132 example = re.sub(r"__/?[IL]__", "", m.group(1))
133 parsed = wxr.wtp.parse(example)
134 example = clean_node(wxr, None, parsed)
135 example = example.strip(" \n*:⮡")
136 examples.append(example)
138 text = EXAMPLES_RE.sub("", text)
140 for i, token in enumerate(LINK_RE.split(text)):
141 # print(f"{token=}")
142 token = token.strip()
144 if not token:
145 continue
147 if i % 2 == 0:
148 # Actual text, not __L__or __/L__
149 # print(f"{i=}, {token=}, {line_tags=}")
150 if inside_italics:
151 line_tags.append(token)
152 continue
153 if inside_link is False and token:
154 # There's something between two link nodes
155 interrupted_link = True
156 continue
157 if inside_link is True: 157 ↛ 166line 157 didn't jump to line 166 because the condition on line 157 was always true
158 if interrupted_link is True and len(chained_links) > 0:
159 combined_line_data.append(
160 (chained_links, line_tags, examples)
161 )
162 chained_links = [token]
163 else:
164 chained_links.append(token)
165 continue
166 if token == "__I__":
167 inside_italics = True
168 continue
169 if token == "__/I__":
170 inside_italics = False
171 continue
172 if token == "__L__":
173 inside_link = True
174 continue
175 if token == "__/L__": 175 ↛ 140line 175 didn't jump to line 140 because the condition on line 175 was always true
176 inside_link = False
177 interrupted_link = False
178 continue
179 if chained_links:
180 combined_line_data.append((chained_links, line_tags, examples))
182 new_combined = []
183 for link_parts, tags, examples in combined_line_data:
184 if link_parts: 184 ↛ 183line 184 didn't jump to line 183 because the condition on line 184 was always true
185 new_combined.append((link_parts, tags, examples))
186 combined_line_data = new_combined
188 match linkage_type:
189 case Heading.Related:
190 target_field = data.related
191 case Heading.Synonyms:
192 target_field = data.synonyms
193 case Heading.Antonyms: 193 ↛ 194line 193 didn't jump to line 194 because the pattern on line 193 never matched
194 target_field = data.antonyms
195 case Heading.Transliterations:
196 # For transliteration sections we add these to forms instead.
197 combined_line_forms = [
198 Form(
199 form=" ".join(link_parts),
200 raw_tags=ltags,
201 tags=["transliteration"],
202 source="linkage",
203 )
204 for link_parts, ltags, _ in combined_line_data
205 ]
206 for form in combined_line_forms:
207 translate_raw_tags(form)
208 data.forms.extend(combined_line_forms)
209 if esperanto_template_data: 209 ↛ 210line 209 didn't jump to line 210 because the condition on line 209 was never true
210 data.forms.extend(esperanto_template_data)
211 return
212 case Heading.AltOf | Heading.FormOf: 212 ↛ 225line 212 didn't jump to line 225 because the pattern on line 212 always matched
213 combined_line_forms = [
214 AltForm(word=" ".join(link_parts))
215 for link_parts, _, _ in combined_line_data
216 ]
217 match linkage_type:
218 case Heading.AltOf:
219 data.alt_of.extend(combined_line_forms)
220 case Heading.FormOf: 220 ↛ 222line 220 didn't jump to line 222 because the pattern on line 220 always matched
221 data.form_of.extend(combined_line_forms)
222 if esperanto_template_data:
223 data.forms.extend(esperanto_template_data)
224 return
225 case _:
226 # unreachable
227 wxr.wtp.error(
228 "process_linkage_section() given unhandled Heading: "
229 f"{linkage_type=}",
230 sortid="linkages/83",
231 )
232 return
234 linkages = [
235 Linkage(word=" ".join(link_parts), raw_tags=ltags, examples=lexamples)
236 for link_parts, ltags, lexamples in combined_line_data
237 ]
238 for linkage in linkages:
239 translate_raw_tags(linkage)
240 target_field.extend(linkages)
242 # iterate over list item lines and get links
244 # if links are next to each other with only whitespace between,
245 # that's part of one entry
247 # if there's something that isn't a link in-between, then they're
248 # separate words