Coverage for src/wiktextract/extractor/el/linkages.py: 91%

121 statements  

coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

  1  import re
  2
  3  from wikitextprocessor import TemplateNode, WikiNode
  4  from wikitextprocessor.parser import NodeKind
  5
  6  from wiktextract.extractor.el.tags import translate_raw_tags
  7  from wiktextract.page import clean_node
  8  from wiktextract.wxr_context import WiktextractContext
  9
 10  from .models import Form, Linkage, WordEntry
 11  from .section_titles import Heading
 12
 13  Node = str | WikiNode
 14
 15  LINK_RE = re.compile(r"(__/?[IL]__)")
 16
 17  EXAMPLES_RE = re.compile(r"(?sm)__E__(.*?)__/E__")
 18
 19
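The __L__/__I__/__E__ pairs are sentinel markers that the handler
functions below inject into flattened wikitext so that links, italics,
and inline examples survive the round-trip through node_to_text. A
minimal sketch of how the two regexes recover them (the sample string
is invented for illustration):

    import re

    LINK_RE = re.compile(r"(__/?[IL]__)")
    EXAMPLES_RE = re.compile(r"(?sm)__E__(.*?)__/E__")

    line = "__L__foo__/L__, __L__bar__/L__ __E__an example__/E__"

    print(EXAMPLES_RE.findall(line))  # ['an example']

    # The capturing group makes re.split() keep the markers: tokens at
    # even indices are plain text, tokens at odd indices are markers.
    print(LINK_RE.split(EXAMPLES_RE.sub("", line)))
    # ['', '__L__', 'foo', '__/L__', ', ', '__L__', 'bar', '__/L__', ' ']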

 20  def process_linkage_section(
 21      wxr: WiktextractContext,
 22      data: WordEntry,
 23      rnode: WikiNode,
 24      linkage_type: Heading,
 25  ) -> None:
 26      transliteration_template_data: list[Form] = []
 27
 28      def prehandle_templates_fn(
 29          node: WikiNode,
 30      ) -> list[Node] | None:
 31          """Handle nodes in the parse tree specially."""
 32          # print(f"{node=}")
 33          if not isinstance(node, TemplateNode):
 34              return None
 35          if node.template_name == "βλ":
 36              # print("REACHED")
 37              # print(f"{node.largs=}")
 38              ret: list[Node] = []
 39              # print(f"{ret=}")
 40              comma = False
 41              for arg in node.largs[1:]:
 42                  if comma:
 43                      ret.append(", ")
 44                  ret.append("__L__")
 45                  ret.append(wxr.wtp.node_to_text(arg))
 46                  ret.append("__/L__")
 47                  comma = True
 48              return ret
 49          if node.template_name in ("eo-h", "eo-x"):
 50              transliteration_template_data.append(
 51                  Form(
 52                      form="".join(
 53                          wxr.wtp.node_to_text(arg) for arg in node.largs[1]
 54                      ),
 55                      raw_tags=[
 56                          "H-sistemo"
 57                          if node.template_name == "eo-h"
 58                          else "X-sistemo"
 59                      ],
 60                      tags=["transliteration"],
 61                      source="linkage",
 62                  )
 63              )
 64              return []
 65          return None
 66
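         # (Illustration, not part of the module: a call like
         # {{βλ|λέξη|όρος}}, with invented arguments, makes the handler
         # above return ["__L__", "λέξη", "__/L__", ", ", "__L__",
         # "όρος", "__/L__"], so the "see also" template is flattened
         # into the same marked-link text that real wikilinks produce.)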

 67      def links_node_fn(
 68          node: WikiNode,
 69      ) -> list[Node] | None:
 70          """Handle nodes in the parse tree specially."""
 71          # print(f"{node=}")
 72          if node.kind == NodeKind.ITALIC:
 73              return ["__I__", *node.children, "__/I__"]
 74          if node.kind == NodeKind.LINK:
 75              if not isinstance(node.largs[0][0], str):  # 75 ↛ 76: condition was never true
 76                  return None
 77              return [
 78                  "__L__",
 79                  # unpacking a list-comprehension, unpacking into a list
 80                  # seems to be more performant than adding lists together.
 81                  *(
 82                      wxr.wtp.node_to_text(
 83                          node.largs[1:2] or node.largs[0],
 84                      )
 85                      # output the "visible" half of the link.
 86                  ),
 87                  # XXX collect link data if it turns out to be important.
 88                  "__/L__",
 89              ]
 90          # print(f"{node.largs=}")
 91          if isinstance(node, TemplateNode) and node.template_name == "βλ":  # 91 ↛ 94: condition was never true
 92              # print("REACHED")
 93              # print(f"{node=}")
 94              return node.children
 95          if node.kind == NodeKind.LIST_ITEM and node.sarg.endswith(":"):
 96              return [node.sarg, "__E__", *node.children, "__/E__\n"]
 97          return None
 98
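         # (Illustration, not part of the module: links_node_fn turns a
         # list item like "* [[λέξη]] ''σπάνιο''", content invented
         # here, into "* __L__λέξη__/L__ __I__σπάνιο__/I__", while a
         # sub-item whose sarg ends in ":" is wrapped in __E__ ... __/E__
         # so the example text can be split off before link tokenizing.)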

 99      # parse nodes to get lists and list_items
100      reparsed = wxr.wtp.parse(
101          wxr.wtp.node_to_wikitext(rnode, node_handler_fn=prehandle_templates_fn),
102          expand_all=True,
103      )
104
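         # (Note: prehandle_templates_fn runs during the serialization
         # back to wikitext above, so βλ/eo-h/eo-x templates have
         # already been replaced with markers, or consumed, before the
         # expanded text is reparsed for its list structure.)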

105      combined_line_data: list[tuple[list[str], list[str], list[str]]] = []
106
107      for list_item in reparsed.find_child_recursively(NodeKind.LIST_ITEM):
108          # print(f"{list_item=}")
109          text = wxr.wtp.node_to_text(list_item, node_handler_fn=links_node_fn)
110
111          chained_links: list[str] = []
112          line_tags: list[str] = []
113          inside_link = False
114          inside_italics = False
115          interrupted_link = False
116
117          examples = []
118          for m in EXAMPLES_RE.finditer(text):
119              example = re.sub(r"__/?[IL]__", "", m.group(1))
120              parsed = wxr.wtp.parse(example)
121              example = clean_node(wxr, None, parsed)
122              example = example.strip(" \n*:⮡")
123              examples.append(example)
124
125          text = EXAMPLES_RE.sub("", text)
126
127          for i, token in enumerate(LINK_RE.split(text)):
128              # print(f"{token=}")
129              token = token.strip()
130
131              if not token:
132                  continue
133
134              if i % 2 == 0:
135                  # Actual text, not __L__ or __/L__
136                  # print(f"{i=}, {token=}, {line_tags=}")
137                  if inside_italics:
138                      line_tags.append(token)
139                      continue
140                  if inside_link is False and token:
141                      # There's something between two link nodes
142                      interrupted_link = True
143                      continue
144                  if inside_link is True:  # 144 ↛ 153: condition was always true
145                      if interrupted_link is True and len(chained_links) > 0:
146                          combined_line_data.append(
147                              (chained_links, line_tags, examples)
148                          )
149                          chained_links = [token]
150                      else:
151                          chained_links.append(token)
152                      continue
153              if token == "__I__":
154                  inside_italics = True
155                  continue
156              if token == "__/I__":
157                  inside_italics = False
158                  continue
159              if token == "__L__":
160                  inside_link = True
161                  continue
162              if token == "__/L__":  # 162 ↛ 127: condition was always true
163                  inside_link = False
164                  interrupted_link = False
165                  continue
166          if chained_links:
167              combined_line_data.append((chained_links, line_tags, examples))
168
169      new_combined = []
170      for link_parts, tags, examples in combined_line_data:
171          if link_parts:  # 171 ↛ 170: condition was always true
172              new_combined.append((link_parts, tags, examples))
173      combined_line_data = new_combined
174
175      match linkage_type:
176          case Heading.Related:
177              target_field = data.related
178          case Heading.Synonyms:  # 178 ↛ 179: pattern never matched
179              target_field = data.synonyms
180          case Heading.Antonyms:  # 180 ↛ 181: pattern never matched
181              target_field = data.antonyms
182          case Heading.Derived:  # 182 ↛ 183: pattern never matched
183              target_field = data.derived
184          case Heading.Transliterations:  # 184 ↛ 201: pattern always matched
185              # For transliteration sections we add these to forms instead.
186              transliteration_forms = [
187                  Form(
188                      form=" ".join(link_parts),
189                      raw_tags=ltags,
190                      tags=["transliteration"],
191                      source="linkage",
192                  )
193                  for link_parts, ltags, _ in combined_line_data
194              ]
195              for form in transliteration_forms:
196                  translate_raw_tags(form)
197              data.forms.extend(transliteration_forms)
198              if transliteration_template_data:
199                  data.forms.extend(transliteration_template_data)
200              return
201          case _:
202              wxr.wtp.error(
203                  "process_linkage_section() given unhandled Heading: "
204                  f"{linkage_type=}",
205                  sortid="linkages/83",
206              )
207              return
208
209      linkages = [
210          Linkage(word=" ".join(link_parts), raw_tags=ltags, examples=lexamples)
211          for link_parts, ltags, lexamples in combined_line_data
212      ]
213      for linkage in linkages:
214          translate_raw_tags(linkage)
215      target_field.extend(linkages)
216
217      # iterate over list item lines and get links
218
219      # if links are next to each other with only whitespace between,
220      # that's part of one entry
221
222      # if there's something that isn't a link in-between, then they're
223      # separate words
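
The closing comments state the chaining rule that the token loop above
implements. A minimal runnable illustration (the Greek words are
invented for this example):

    import re

    LINK_RE = re.compile(r"(__/?[IL]__)")

    text = "__L__κατά__/L__ __L__λέξη__/L__, __L__άλλο__/L__"
    print(LINK_RE.split(text))
    # ['', '__L__', 'κατά', '__/L__', ' ', '__L__', 'λέξη', '__/L__',
    #  ', ', '__L__', 'άλλο', '__/L__', '']

    # "κατά" and "λέξη" are separated only by whitespace, which strips
    # to an empty token, so they chain into one entry with
    # word="κατά λέξη". The comma between "λέξη" and "άλλο" survives as
    # non-link text, sets interrupted_link, and "άλλο" then starts a
    # fresh entry, so combined_line_data ends up as:
    #   [(['κατά', 'λέξη'], [], []), (['άλλο'], [], [])]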