Coverage for src/wiktextract/extractor/el/linkages.py: 90%

114 statements  

coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

import re

from wikitextprocessor import TemplateNode, WikiNode
from wikitextprocessor.parser import NodeKind

from wiktextract.page import clean_node
from wiktextract.wxr_context import WiktextractContext

from .models import Form, Linkage, WordEntry
from .parse_utils import Heading

Node = str | WikiNode

LINK_RE = re.compile(r"(__/?[IL]__)")

EXAMPLES_RE = re.compile(r"(?sm)__E__(.*?)__/E__")
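
# The node handlers below flatten wikitext into plain text with sentinel
# tokens: __L__...__/L__ wraps link text, __I__...__/I__ wraps italicized
# qualifiers, and __E__...__/E__ wraps inline example lines. LINK_RE's
# capturing group makes re.split() keep the delimiters, so tokens
# alternate text/sentinel (hypothetical input):
#     >>> LINK_RE.split("__L__foo__/L__, __L__bar__/L__")
#     ['', '__L__', 'foo', '__/L__', ', ', '__L__', 'bar', '__/L__', '']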


def process_linkage_section(
    wxr: WiktextractContext,
    data: WordEntry,
    rnode: WikiNode,
    linkage_type: Heading,
) -> None:
    transliteration_template_data: list[Form] = []

    def prehandle_templates_fn(
        node: WikiNode,
    ) -> list[Node] | None:
        """Handle nodes in the parse tree specially."""
        # print(f"{node=}")
        if not isinstance(node, TemplateNode):
            return None
        if node.template_name == "βλ":
            # print("REACHED")
            # print(f"{node.largs=}")
            ret: list[Node] = []
            # print(f"{ret=}")
            comma = False
            for arg in node.largs[1:]:
                if comma:
                    ret.append(", ")
                ret.append("__L__")
                ret.append(wxr.wtp.node_to_text(arg))
                ret.append("__/L__")
                comma = True
            return ret
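        # The βλ ("see") template is flattened into "__L__...__/L__"
        # pseudo-links, one per template argument and comma-separated, so
        # the token walk below treats each argument as its own link.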

        if node.template_name in ("eo-h", "eo-x"):
            transliteration_template_data.append(
                Form(
                    form="".join(
                        wxr.wtp.node_to_text(arg) for arg in node.largs[1]
                    ),
                    raw_tags=[
                        "H-sistemo"
                        if node.template_name == "eo-h"
                        else "X-sistemo"
                    ],
                    tags=["transliteration"],
                )
            )
            return []
        return None
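
    # "eo-h"/"eo-x" above are Esperanto H-system and X-system ASCII
    # transliteration templates (e.g. ĉ written as "ch" or "cx"); the
    # collected forms are merged into data.forms for transliteration
    # sections near the end of this function.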

    def links_node_fn(
        node: WikiNode,
    ) -> list[Node] | None:
        """Handle nodes in the parse tree specially."""
        # print(f"{node=}")
        if node.kind == NodeKind.ITALIC:
            return ["__I__", *node.children, "__/I__"]
        if node.kind == NodeKind.LINK:
            if not isinstance(node.largs[0][0], str):
                # coverage: branch never taken; the first link arg was
                # always a str in the recorded run.
                return None
            return [
                "__L__",
                # unpacking a list comprehension: unpacking into a list
                # seems to be more performant than adding lists together.
                *(
                    wxr.wtp.node_to_text(
                        node.largs[1:2] or node.largs[0],
                    )
                    # output the "visible" half of the link.
                ),
                # XXX collect link data if it turns out to be important.
                "__/L__",
            ]
        # print(f"{node.largs=}")
        if isinstance(node, TemplateNode) and node.template_name == "βλ":
            # coverage: branch never taken; βλ templates are already
            # flattened by prehandle_templates_fn.
            # print("REACHED")
            # print(f"{node=}")
            return node.children
        if node.kind == NodeKind.LIST_ITEM and node.sarg.endswith(":"):
            return [node.sarg, "__E__", *node.children, "__/E__\n"]
        return None
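
    # Given "[[target|shown]]", links_node_fn emits the visible half as
    # "__L__shown__/L__"; a list item whose sarg ends in ":" is an inline
    # example line and gets wrapped in "__E__...__/E__".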

    # parse nodes to get lists and list_items
    reparsed = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(rnode, node_handler_fn=prehandle_templates_fn),
        expand_all=True,
    )

    combined_line_data: list[tuple[list[str], list[str], list[str]]] = []
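    # Each tuple is one linkage line: (link parts to be joined into the
    # word, raw tags taken from italics, example sentences).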


    for list_item in reparsed.find_child_recursively(NodeKind.LIST_ITEM):
        # print(f"{list_item=}")
        text = wxr.wtp.node_to_text(list_item, node_handler_fn=links_node_fn)

        chained_links: list[str] = []
        line_tags: list[str] = []
        inside_link = False
        inside_italics = False
        interrupted_link = False

        examples = []
        for m in EXAMPLES_RE.finditer(text):
            example = re.sub(r"__/?[IL]__", "", m.group(1))
            parsed = wxr.wtp.parse(example)
            example = clean_node(wxr, None, parsed)
            example = example.strip(" \n*:⮡")
            examples.append(example)

        text = EXAMPLES_RE.sub("", text)
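        # Examples are captured first and then stripped from the line, so
        # the sentinel walk below only sees link and italic tokens.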


        for i, token in enumerate(LINK_RE.split(text)):
            # print(f"{token=}")
            token = token.strip()

            if not token:
                continue

            if i % 2 == 0:
                # Actual text, not __L__ or __/L__
                # print(f"{i=}, {token=}, {line_tags=}")
                if inside_italics:
                    line_tags.append(token)
                    continue
                if inside_link is False and token:
                    # There's something between two link nodes
                    interrupted_link = True
                    continue
                if inside_link is True:
                    # coverage: always true whenever this point was reached.
                    if interrupted_link is True and len(chained_links) > 0:
                        combined_line_data.append(
                            (chained_links, line_tags, examples)
                        )
                        chained_links = [token]
                    else:
                        chained_links.append(token)
                    continue
            if token == "__I__":
                inside_italics = True
                continue
            if token == "__/I__":
                inside_italics = False
                continue
            if token == "__L__":
                inside_link = True
                continue
            if token == "__/L__":
                # coverage: always true; every sentinel remaining at this
                # point was a closing __/L__.
                inside_link = False
                interrupted_link = False
                continue
        if chained_links:
            combined_line_data.append((chained_links, line_tags, examples))
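
        # Worked example (hypothetical tokens): on a line rendered as
        # "__L__bad__/L__ __L__wolf__/L__ and __L__moon__/L__", the
        # adjacent links chain into one entry "bad wolf", while the bare
        # text "and" sets interrupted_link so "moon" starts a new entry.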


    new_combined = []
    for link_parts, tags, examples in combined_line_data:
        if link_parts:
            # coverage: always true; empty link_parts never occurred.
            new_combined.append((link_parts, tags, examples))
    combined_line_data = new_combined


    # coverage: only the Related and Transliterations cases were exercised
    # in the recorded run; Synonyms, Antonyms, and Derived never matched.
    match linkage_type:
        case Heading.Related:
            target_field = data.related
        case Heading.Synonyms:
            target_field = data.synonyms
        case Heading.Antonyms:
            target_field = data.antonyms
        case Heading.Derived:
            target_field = data.derived
        case Heading.Transliterations:
            # For transliteration sections we add these to forms instead.
            data.forms.extend(
                Form(
                    form=" ".join(link_parts),
                    raw_tags=ltags,
                    tags=["transliteration"],
                )
                for link_parts, ltags, _ in combined_line_data
            )
            if transliteration_template_data:
                data.forms.extend(transliteration_template_data)
            return
        case _:
            wxr.wtp.error(
                "process_linkage_section() given unhandled Heading: "
                f"{linkage_type=}",
                sortid="linkages/83",
            )
            return


    target_field.extend(
        Linkage(word=" ".join(link_parts), raw_tags=ltags, examples=lexamples)
        for link_parts, ltags, lexamples in combined_line_data
    )


    # iterate over list item lines and get links

    # if links are next to each other with only whitespace between,
    # that's part of one entry

    # if there's something that isn't a link in-between, then they're
    # separate words
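
A minimal, self-contained sketch of the chaining rule described in the two
comments above (plain re only, no wxr context; the input tokens are
hypothetical):

import re

LINK_RE = re.compile(r"(__/?[IL]__)")

def chain_links(text: str) -> list[str]:
    """Simplified stand-in for the token walk in process_linkage_section():
    links separated only by whitespace merge into one entry; bare text
    between links starts a new entry."""
    entries: list[list[str]] = []
    chained: list[str] = []
    inside_link = False
    interrupted = False
    for i, token in enumerate(LINK_RE.split(text)):
        token = token.strip()
        if not token:
            continue
        if i % 2 == 0:  # even index: text between sentinels
            if not inside_link:
                interrupted = True
            elif interrupted and chained:
                entries.append(chained)
                chained = [token]
                interrupted = False
            else:
                chained.append(token)
        elif token == "__L__":
            inside_link = True
        elif token == "__/L__":
            inside_link = False
            interrupted = False
    if chained:
        entries.append(chained)
    return [" ".join(parts) for parts in entries]

print(chain_links("__L__bad__/L__ __L__wolf__/L__ and __L__moon__/L__"))
# -> ['bad wolf', 'moon']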