Coverage for src/wiktextract/extractor/el/linkages.py: 92%

134 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

1import re 

2from typing import Literal 

3 

4from wikitextprocessor import TemplateNode, WikiNode 

5from wikitextprocessor.parser import NodeKind 

6 

7from wiktextract.extractor.el.tags import translate_raw_tags 

8from wiktextract.page import clean_node 

9from wiktextract.wxr_context import WiktextractContext 

10 

11from .models import AltForm, Form, Linkage, WordEntry 

12from .section_titles import Heading 

13 

14Node = str | WikiNode 

15 

16LINK_RE = re.compile(r"(__/?[IL]__)") 

17 

18EXAMPLES_RE = re.compile(r"(?sm)__E__(.*?)__/E__") 

19 

20LinkageType = Literal[ 

21 Heading.Related, 

22 Heading.Synonyms, 

23 Heading.Antonyms, 

24 Heading.Transliterations, 

25 Heading.AltOf, 

26 Heading.FormOf, 

27] 

28"""Headings variants supported by process_linkage_section.""" 

29 

30 

31def process_linkage_section( 

32 wxr: WiktextractContext, 

33 data: WordEntry, 

34 rnode: WikiNode, 

35 linkage_type: LinkageType, 

36) -> None: 

37 esperanto_template_data: list[Form] = [] 

38 

39 def prehandle_templates_fn( 

40 node: WikiNode, 

41 ) -> list[Node] | None: 

42 """Handle nodes in the parse tree specially.""" 

43 # print(f"{node=}") 

44 if not isinstance(node, TemplateNode): 

45 return None 

46 if node.template_name == "βλ": 

47 # print("REACHED") 

48 # print(f"{node.largs=}") 

49 ret: list[Node] = [] 

50 comma = False 

51 for k, v in node.template_parameters.items(): 

52 if not isinstance(k, int): 

53 continue 

54 if comma: 

55 ret.append(", ") 

56 if isinstance(v, list): 56 ↛ 57line 56 didn't jump to line 57 because the condition on line 56 was never true

57 ret.extend(["__L__", *v, "__/L__"]) 

58 else: 

59 ret.extend(["__L__", v, "__/L__"]) 

60 comma = True 

61 return ret 

62 if node.template_name in ("eo-h", "eo-x"): 

63 esperanto_template_data.append( 

64 Form( 

65 form="".join( 

66 wxr.wtp.node_to_text(arg) for arg in node.largs[1] 

67 ), 

68 raw_tags=[ 

69 "H-sistemo" 

70 if node.template_name == "eo-h" 

71 else "X-sistemo" 

72 ], 

73 tags=["transliteration"], 

74 source="linkage", 

75 ) 

76 ) 

77 return [] 

78 return None 

79 

80 def links_node_fn( 

81 node: WikiNode, 

82 ) -> list[Node] | None: 

83 """Handle nodes in the parse tree specially.""" 

84 # print(f"{node=}") 

85 if node.kind == NodeKind.ITALIC: 

86 return ["__I__", *node.children, "__/I__"] 

87 if node.kind == NodeKind.LINK: 

88 if not isinstance(node.largs[0][0], str): 88 ↛ 89line 88 didn't jump to line 89 because the condition on line 88 was never true

89 return None 

90 return [ 

91 "__L__", 

92 # unpacking a list-comprehension, unpacking into a list 

93 # seems to be more performant than adding lists together. 

94 *( 

95 wxr.wtp.node_to_text( 

96 node.largs[1:2] or node.largs[0], 

97 ) 

98 # output the "visible" half of the link. 

99 ), 

100 # XXX collect link data if it turns out to be important. 

101 "__/L__", 

102 ] 

103 # print(f"{node.largs=}") 

104 if isinstance(node, TemplateNode) and node.template_name == "βλ": 104 ↛ 107line 104 didn't jump to line 107 because the condition on line 104 was never true

105 # print("REACHED") 

106 # print(f"{node=}") 

107 return node.children 

108 if node.kind == NodeKind.LIST_ITEM and node.sarg.endswith(":"): 

109 return [node.sarg, "__E__", *node.children, "__/E__\n"] 

110 return None 

111 

112 # parse nodes to get lists and list_items 

113 reparsed = wxr.wtp.parse( 

114 wxr.wtp.node_to_wikitext(rnode, node_handler_fn=prehandle_templates_fn), 

115 expand_all=True, 

116 ) 

117 

118 combined_line_data: list[tuple[list[str], list[str], list[str]]] = [] 

119 

120 for list_item in reparsed.find_child_recursively(NodeKind.LIST_ITEM): 

121 # print(f"{list_item=}") 

122 text = wxr.wtp.node_to_text(list_item, node_handler_fn=links_node_fn) 

123 

124 chained_links: list[str] = [] 

125 line_tags: list[str] = [] 

126 inside_link = False 

127 inside_italics = False 

128 interrupted_link = False 

129 

130 examples = [] 

131 for m in EXAMPLES_RE.finditer(text): 

132 example = re.sub(r"__/?[IL]__", "", m.group(1)) 

133 parsed = wxr.wtp.parse(example) 

134 example = clean_node(wxr, None, parsed) 

135 example = example.strip(" \n*:⮡") 

136 examples.append(example) 

137 

138 text = EXAMPLES_RE.sub("", text) 

139 

140 for i, token in enumerate(LINK_RE.split(text)): 

141 # print(f"{token=}") 

142 token = token.strip() 

143 

144 if not token: 

145 continue 

146 

147 if i % 2 == 0: 

148 # Actual text, not __L__or __/L__ 

149 # print(f"{i=}, {token=}, {line_tags=}") 

150 if inside_italics: 

151 line_tags.append(token) 

152 continue 

153 if inside_link is False and token: 

154 # There's something between two link nodes 

155 interrupted_link = True 

156 continue 

157 if inside_link is True: 157 ↛ 166line 157 didn't jump to line 166 because the condition on line 157 was always true

158 if interrupted_link is True and len(chained_links) > 0: 

159 combined_line_data.append( 

160 (chained_links, line_tags, examples) 

161 ) 

162 chained_links = [token] 

163 else: 

164 chained_links.append(token) 

165 continue 

166 if token == "__I__": 

167 inside_italics = True 

168 continue 

169 if token == "__/I__": 

170 inside_italics = False 

171 continue 

172 if token == "__L__": 

173 inside_link = True 

174 continue 

175 if token == "__/L__": 175 ↛ 140line 175 didn't jump to line 140 because the condition on line 175 was always true

176 inside_link = False 

177 interrupted_link = False 

178 continue 

179 if chained_links: 

180 combined_line_data.append((chained_links, line_tags, examples)) 

181 

182 new_combined = [] 

183 for link_parts, tags, examples in combined_line_data: 

184 if link_parts: 184 ↛ 183line 184 didn't jump to line 183 because the condition on line 184 was always true

185 new_combined.append((link_parts, tags, examples)) 

186 combined_line_data = new_combined 

187 

188 match linkage_type: 

189 case Heading.Related: 

190 target_field = data.related 

191 case Heading.Synonyms: 

192 target_field = data.synonyms 

193 case Heading.Antonyms: 193 ↛ 194line 193 didn't jump to line 194 because the pattern on line 193 never matched

194 target_field = data.antonyms 

195 case Heading.Transliterations: 

196 # For transliteration sections we add these to forms instead. 

197 combined_line_forms = [ 

198 Form( 

199 form=" ".join(link_parts), 

200 raw_tags=ltags, 

201 tags=["transliteration"], 

202 source="linkage", 

203 ) 

204 for link_parts, ltags, _ in combined_line_data 

205 ] 

206 for form in combined_line_forms: 

207 translate_raw_tags(form) 

208 data.forms.extend(combined_line_forms) 

209 if esperanto_template_data: 209 ↛ 210line 209 didn't jump to line 210 because the condition on line 209 was never true

210 data.forms.extend(esperanto_template_data) 

211 return 

212 case Heading.AltOf | Heading.FormOf: 212 ↛ 225line 212 didn't jump to line 225 because the pattern on line 212 always matched

213 combined_line_forms = [ 

214 AltForm(word=" ".join(link_parts)) 

215 for link_parts, _, _ in combined_line_data 

216 ] 

217 match linkage_type: 

218 case Heading.AltOf: 

219 data.alt_of.extend(combined_line_forms) 

220 case Heading.FormOf: 220 ↛ 222line 220 didn't jump to line 222 because the pattern on line 220 always matched

221 data.form_of.extend(combined_line_forms) 

222 if esperanto_template_data: 

223 data.forms.extend(esperanto_template_data) 

224 return 

225 case _: 

226 # unreachable 

227 wxr.wtp.error( 

228 "process_linkage_section() given unhandled Heading: " 

229 f"{linkage_type=}", 

230 sortid="linkages/83", 

231 ) 

232 return 

233 

234 linkages = [ 

235 Linkage(word=" ".join(link_parts), raw_tags=ltags, examples=lexamples) 

236 for link_parts, ltags, lexamples in combined_line_data 

237 ] 

238 for linkage in linkages: 

239 translate_raw_tags(linkage) 

240 target_field.extend(linkages) 

241 

242 # iterate over list item lines and get links 

243 

244 # if links are next to each other with only whitespace between, 

245 # that's part of one entry 

246 

247 # if there's something that isn't a link in-between, then they're 

248 # separate words