Coverage for src/wiktextract/extractor/el/linkages.py: 92%

134 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2from typing import Literal 

3 

4from wikitextprocessor import TemplateNode, WikiNode 

5from wikitextprocessor.parser import NodeKind 

6 

7from wiktextract.extractor.el.tags import translate_raw_tags 

8from wiktextract.page import clean_node 

9from wiktextract.wxr_context import WiktextractContext 

10 

11from .models import AltForm, Form, Linkage, WordEntry 

12from .section_titles import Heading 

13 

14Node = str | WikiNode 

15 

16LINK_RE = re.compile(r"(__/?[IL]__)") 

17 

18EXAMPLES_RE = re.compile(r"(?sm)__E__(.*?)__/E__") 

19 

20LinkageType = Literal[ 

21 Heading.Related, 

22 Heading.Synonyms, 

23 Heading.Antonyms, 

24 Heading.Transliterations, 

25 Heading.AltOf, 

26 Heading.FormOf, 

27] 

28"""Headings variants supported by process_linkage_section.""" 

29 

30 

31def process_linkage_section( 

32 wxr: WiktextractContext, 

33 data: WordEntry, 

34 rnode: WikiNode, 

35 linkage_type: LinkageType, 

36) -> None: 

37 esperanto_template_data: list[Form] = [] 

38 

39 def prehandle_templates_fn( 

40 node: WikiNode, 

41 ) -> list[Node] | None: 

42 """Handle nodes in the parse tree specially.""" 

43 # print(f"{node=}") 

44 if not isinstance(node, TemplateNode): 

45 return None 

46 if node.template_name == "βλ": 

47 # print("REACHED") 

48 # print(f"{node.largs=}") 

49 ret: list[Node] = [] 

50 comma = False 

51 for k, v in node.template_parameters.items(): 

52 if not isinstance(k, int): 

53 continue 

54 if comma: 

55 ret.append(", ") 

56 if isinstance(v, list): 56 ↛ 57line 56 didn't jump to line 57 because the condition on line 56 was never true

57 ret.extend(["__L__", *v, "__/L__"]) 

58 else: 

59 ret.extend(["__L__", v, "__/L__"]) 

60 comma = True 

61 return ret 

62 if node.template_name in ("eo-h", "eo-x"): 

63 esperanto_template_data.append( 

64 Form( 

65 form="".join( 

66 wxr.wtp.node_to_text(arg) for arg in node.largs[1] 

67 ), 

68 raw_tags=[ 

69 "H-sistemo" 

70 if node.template_name == "eo-h" 

71 else "X-sistemo" 

72 ], 

73 tags=["transliteration"], 

74 source="linkage", 

75 ) 

76 ) 

77 return [] 

78 return None 

79 

80 def links_node_fn( 

81 node: WikiNode, 

82 ) -> list[Node] | None: 

83 """Handle nodes in the parse tree specially.""" 

84 # print(f"{node=}") 

85 if node.kind == NodeKind.ITALIC: 

86 return ["__I__", *node.children, "__/I__"] 

87 if node.kind == NodeKind.LINK: 

88 if not isinstance(node.largs[0][0], str): 88 ↛ 89line 88 didn't jump to line 89 because the condition on line 88 was never true

89 return None 

90 return [ 

91 "__L__", 

92 # unpacking a list-comprehension, unpacking into a list 

93 # seems to be more performant than adding lists together. 

94 *( 

95 wxr.wtp.node_to_text( 

96 node.largs[1:2] or node.largs[0], 

97 ) 

98 # output the "visible" half of the link. 

99 ), 

100 # XXX collect link data if it turns out to be important. 

101 "__/L__", 

102 ] 

103 # print(f"{node.largs=}") 

104 if isinstance(node, TemplateNode) and node.template_name == "βλ": 104 ↛ 107line 104 didn't jump to line 107 because the condition on line 104 was never true

105 # print("REACHED") 

106 # print(f"{node=}") 

107 return node.children 

108 if node.kind == NodeKind.LIST_ITEM and node.sarg.endswith(":"): 

109 return [node.sarg, "__E__", *node.children, "__/E__\n"] 

110 return None 

111 

112 # parse nodes to get lists and list_items 

113 reparsed = wxr.wtp.parse( 

114 wxr.wtp.node_to_wikitext(rnode, node_handler_fn=prehandle_templates_fn), 

115 expand_all=True, 

116 ) 

117 

118 combined_line_data: list[tuple[list[str], list[str], list[str]]] = [] 

119 

120 for list_item in reparsed.find_child_recursively(NodeKind.LIST_ITEM): 

121 # print(f"{list_item=}") 

122 text = wxr.wtp.node_to_text(list_item, node_handler_fn=links_node_fn) 

123 

124 chained_links: list[str] = [] 

125 line_tags: list[str] = [] 

126 inside_link = False 

127 inside_italics = False 

128 interrupted_link = False 

129 

130 examples = [] 

131 for m in EXAMPLES_RE.finditer(text): 

132 example = re.sub(r"__/?[IL]__", "", m.group(1)) 

133 parsed = wxr.wtp.parse(example) 

134 example = clean_node(wxr, None, parsed) 

135 example = example.strip(" \n*:⮡") 

136 examples.append(example) 

137 

138 text = EXAMPLES_RE.sub("", text) 

139 

140 for i, token in enumerate(LINK_RE.split(text)): 

141 # print(f"{token=}") 

142 token = token.strip() 

143 

144 if not token: 

145 continue 

146 

147 if i % 2 == 0: 

148 # Actual text, not __L__or __/L__ 

149 # print(f"{i=}, {token=}, {line_tags=}") 

150 if inside_italics: 

151 line_tags.append(token) 

152 continue 

153 if inside_link is False and token: 

154 # There's something between two link nodes 

155 interrupted_link = True 

156 continue 

157 if inside_link is True: 157 ↛ 166line 157 didn't jump to line 166 because the condition on line 157 was always true

158 if interrupted_link is True and len(chained_links) > 0: 

159 combined_line_data.append( 

160 (chained_links, line_tags, examples) 

161 ) 

162 chained_links = [token] 

163 else: 

164 chained_links.append(token) 

165 continue 

166 if token == "__I__": 

167 inside_italics = True 

168 continue 

169 if token == "__/I__": 

170 inside_italics = False 

171 continue 

172 if token == "__L__": 

173 inside_link = True 

174 continue 

175 if token == "__/L__": 175 ↛ 140line 175 didn't jump to line 140 because the condition on line 175 was always true

176 inside_link = False 

177 interrupted_link = False 

178 continue 

179 if chained_links: 

180 combined_line_data.append((chained_links, line_tags, examples)) 

181 

182 new_combined = [] 

183 for link_parts, tags, examples in combined_line_data: 

184 if link_parts: 184 ↛ 183line 184 didn't jump to line 183 because the condition on line 184 was always true

185 new_combined.append((link_parts, tags, examples)) 

186 combined_line_data = new_combined 

187 

188 match linkage_type: 

189 case Heading.Related: 

190 target_field = data.related 

191 case Heading.Synonyms: 

192 target_field = data.synonyms 

193 case Heading.Antonyms: 193 ↛ 194line 193 didn't jump to line 194 because the pattern on line 193 never matched

194 target_field = data.antonyms 

195 case Heading.Transliterations: 

196 # For transliteration sections we add these to forms instead. 

197 combined_line_forms = [ 

198 Form( 

199 form=" ".join(link_parts), 

200 raw_tags=ltags, 

201 tags=["transliteration"], 

202 source="linkage", 

203 ) 

204 for link_parts, ltags, _ in combined_line_data 

205 ] 

206 for form in combined_line_forms: 

207 translate_raw_tags(form) 

208 data.forms.extend(combined_line_forms) 

209 if esperanto_template_data: 209 ↛ 210line 209 didn't jump to line 210 because the condition on line 209 was never true

210 data.forms.extend(esperanto_template_data) 

211 return 

212 case Heading.AltOf | Heading.FormOf: 212 ↛ 225line 212 didn't jump to line 225 because the pattern on line 212 always matched

213 combined_line_forms = [ 

214 AltForm(word=" ".join(link_parts)) 

215 for link_parts, _, _ in combined_line_data 

216 ] 

217 match linkage_type: 

218 case Heading.AltOf: 

219 data.alt_of.extend(combined_line_forms) 

220 case Heading.FormOf: 220 ↛ 222line 220 didn't jump to line 222 because the pattern on line 220 always matched

221 data.form_of.extend(combined_line_forms) 

222 if esperanto_template_data: 

223 data.forms.extend(esperanto_template_data) 

224 return 

225 case _: 

226 # unreachable 

227 wxr.wtp.error( 

228 "process_linkage_section() given unhandled Heading: " 

229 f"{linkage_type=}", 

230 sortid="linkages/83", 

231 ) 

232 return 

233 

234 linkages = [ 

235 Linkage(word=" ".join(link_parts), raw_tags=ltags, examples=lexamples) 

236 for link_parts, ltags, lexamples in combined_line_data 

237 ] 

238 for linkage in linkages: 

239 translate_raw_tags(linkage) 

240 target_field.extend(linkages) 

241 

242 # iterate over list item lines and get links 

243 

244 # if links are next to each other with only whitespace between, 

245 # that's part of one entry 

246 

247 # if there's something that isn't a link in-between, then they're 

248 # separate words