Coverage for src/wiktextract/extractor/fr/linkage.py: 95%

132 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import re 

2 

3from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import capture_text_in_parentheses 

8from .models import Linkage, WordEntry 

9from .section_types import LINKAGE_SECTIONS 

10from .tags import translate_raw_tags 

11 

12 

def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    section_type: str,
) -> None:
    """Dispatch a linkage section to the extractor matching its type.

    * ``"dérivés autres langues"`` is handled by
      ``process_derives_autres_list``.
    * ``"anagrammes"`` expands every ``voir anagrammes`` template directly
      under ``level_node`` and adds the resulting anagrams to each entry of
      ``page_data`` whose ``lang_code`` equals that of the last entry.
    * Any other section type is looked up in ``LINKAGE_SECTIONS`` and
      handled by ``process_linkage_list``.
    """
    if section_type == "dérivés autres langues":
        process_derives_autres_list(wxr, page_data, level_node)
        return

    if section_type != "anagrammes":
        linkage_type = LINKAGE_SECTIONS[section_type]
        process_linkage_list(wxr, page_data, level_node, linkage_type)
        return

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name != "voir anagrammes":
            continue
        anagrams = process_voir_anagrammes_template(wxr, template)
        for entry in page_data:
            # anagrams are shared by all entries of the current language
            if entry.lang_code == page_data[-1].lang_code:
                entry.anagrams.extend(anagrams)

35 

36 

def process_derives_autres_list(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
) -> None:
    """Collect the words of a "dérivés autres langues" section.

    Every list item found recursively under ``level_node`` is scanned for
    its template and link children.  An ``L`` template sets the language
    code (its first parameter) and language name (its cleaned text) used
    for the links and link templates that follow it in the same list item.
    Each wiki link and each ``l``/``lien``/``zh-lien``/``zh-lien-t``
    template appends one ``Linkage`` to ``page_data[-1].derived``.
    """
    link_templates = ("l", "lien", "zh-lien", "zh-lien-t")
    # list of words derived into other languages
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        lang_code = ""
        lang_name = ""
        for node in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
            if isinstance(node, TemplateNode) and node.template_name == "L":
                # Default to "" and run the value through clean_node so that
                # lang_code stays a plain string (like its initial value)
                # even when the parameter is missing or is not a str.
                lang_code = clean_node(
                    wxr, None, node.template_parameters.get(1, "")
                )
                lang_name = clean_node(wxr, None, node)
            elif node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                page_data[-1].derived.append(
                    Linkage(lang_code=lang_code, lang=lang_name, word=word)
                )
            elif (
                isinstance(node, TemplateNode)
                and node.template_name in link_templates
            ):
                linkage_data = Linkage(
                    lang_code=lang_code, lang=lang_name, word=""
                )
                process_linkage_template(wxr, node, linkage_data)
                page_data[-1].derived.append(linkage_data)

66 

67 

def process_linkage_list(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    linkage_type: str,
) -> None:
    """Extract linkage words from the lists of a linkage section.

    Walks every list item and template found recursively under
    ``level_node``.  A ``(`` template or a ``;``/``:`` list item only
    updates the current sense text and sense index, which are applied to
    the ``Linkage`` objects created for the nodes that follow.  For any
    other node, its children (nested lists excluded) are scanned for link
    templates, wiki links, italic text, tags in parentheses and a
    translation introduced by "—".  Each ``Linkage`` that ends up with a
    non-empty word is appended to the ``linkage_type`` attribute of
    ``page_data[-1]``.

    ``linkage_type`` is the name of the list attribute on the word entry
    that receives the extracted data.
    """
    link_templates = ("l", "lien", "zh-lien", "zh-lien-t")
    word_separators = {",", "/", "(ou"}
    # trailing "(1)" or "(sens 1)" sense number of a description list item
    index_pattern = re.compile(r"\s*\((?:sens\s*)?(\d+)\)$")
    sense_text = ""
    sense_index = 0
    for template_or_list_node in level_node.find_child_recursively(
        NodeKind.LIST_ITEM | NodeKind.TEMPLATE
    ):
        # list table start template: https://fr.wiktionary.org/wiki/Modèle:(
        if (
            isinstance(template_or_list_node, TemplateNode)
            and template_or_list_node.template_name == "("
        ):
            sense_text = clean_node(
                wxr, None, template_or_list_node.template_parameters.get(1, "")
            )
            sense_index_text = template_or_list_node.template_parameters.get(
                2, "0"
            )
            if (
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
            continue
        # sense could also be in ";" description list
        if (
            template_or_list_node.kind == NodeKind.LIST_ITEM
            and template_or_list_node.sarg in {";", ":"}
        ):
            sense_text = clean_node(wxr, None, template_or_list_node.children)
            m = index_pattern.search(sense_text)
            if m is not None:
                sense_text = index_pattern.sub("", sense_text)
                sense_index = int(m.group(1))
            continue

        linkage_data = Linkage(word="")
        if len(sense_text) > 0:
            linkage_data.sense = sense_text
        if sense_index != 0:
            linkage_data.sense_index = sense_index
        pending_tag = ""
        inside_bracket = False
        # Children without nested lists; built once so the translation
        # branch below can slice it instead of walking the node again.
        child_nodes = list(
            template_or_list_node.invert_find_child(NodeKind.LIST, True)
        )
        for index, child_node in enumerate(child_nodes):
            if (
                isinstance(child_node, TemplateNode)
                and child_node.template_name in link_templates
            ):
                process_linkage_template(wxr, child_node, linkage_data)
            elif (
                isinstance(child_node, WikiNode)
                and child_node.kind == NodeKind.LINK
                and not inside_bracket
            ):
                linkage_data.word = clean_node(wxr, None, child_node)
            elif (
                isinstance(child_node, WikiNode)
                and child_node.kind == NodeKind.ITALIC
            ):
                current_sense = clean_node(wxr, None, child_node).strip("()")
                if (
                    len(list(template_or_list_node.filter_empty_str_child()))
                    == 1
                ):
                    # the italic node is the only content: it is the word
                    linkage_data.word = current_sense
                elif current_sense.isdecimal():
                    linkage_data.sense_index = int(current_sense)
                else:
                    linkage_data.sense = current_sense
            elif (
                isinstance(child_node, TemplateNode)
                and child_node.template_name == "réf"
            ):
                continue
            else:
                tag_text = (
                    child_node
                    if isinstance(child_node, str)
                    else clean_node(wxr, page_data[-1], child_node)
                )
                stripped = tag_text.strip()
                if stripped in word_separators and linkage_data.word != "":
                    # list item has more than one word
                    pre_data = getattr(page_data[-1], linkage_type)
                    # translate raw tags here too, exactly as is done for
                    # the last word of the list item after this loop
                    translate_raw_tags(linkage_data)
                    pre_data.append(linkage_data)
                    linkage_data = Linkage(word="")
                    continue
                opens_bracket = stripped.startswith("(")
                closes_bracket = stripped.endswith(")")
                if opens_bracket and not closes_bracket:
                    # parenthesised text continues in the following nodes
                    pending_tag = tag_text
                    inside_bracket = True
                    continue
                elif not opens_bracket and closes_bracket:
                    tag_text = pending_tag + tag_text
                    pending_tag = ""
                    inside_bracket = False
                elif len(pending_tag) > 0:
                    pending_tag += tag_text
                    continue

                # tag_text may have been extended with the pending text
                stripped = tag_text.strip()
                if stripped.startswith("—"):
                    # everything from this node onwards is the translation
                    linkage_data.translation = clean_node(
                        wxr, None, child_nodes[index:]
                    ).strip("— ")
                    break
                elif stripped.startswith(":"):
                    sense_text = stripped.removeprefix(":").strip()
                    linkage_data.sense = sense_text
                else:
                    tags, _ = capture_text_in_parentheses(tag_text)
                    for tag in tags:
                        if tag.isdecimal():
                            linkage_data.sense_index = int(tag)
                        else:
                            linkage_data.raw_tags.append(tag)

        if len(linkage_data.word) > 0:
            pre_data = getattr(page_data[-1], linkage_type)
            translate_raw_tags(linkage_data)
            pre_data.append(linkage_data)

210 

211 

def process_linkage_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill ``linkage_data`` from a link template, dispatching on its name.

    ``lien`` and ``l`` go to ``process_lien_template``; names starting with
    ``zh-lien`` go to ``process_zh_lien_template``.  Any other template
    leaves ``linkage_data`` untouched.
    """
    name = node.template_name
    if name == "lien" or name == "l":
        process_lien_template(wxr, node, linkage_data)
        return
    if name.startswith("zh-lien"):
        process_zh_lien_template(wxr, node, linkage_data)

221 

222 

def process_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Copy the data of a ``lien``/``l`` template into ``linkage_data``.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:lien

    The word is taken from the ``dif`` parameter when present, otherwise
    from the first positional parameter.  The ``tr`` parameter, if given,
    becomes ``roman`` and the ``sens`` parameter, if given, becomes
    ``translation``.
    """
    params = node.template_parameters
    raw_word = params.get("dif", params.get(1))
    linkage_data.word = clean_node(wxr, None, raw_word)
    if "tr" in params:
        linkage_data.roman = clean_node(wxr, None, params["tr"])
    if "sens" in params:
        linkage_data.translation = clean_node(wxr, None, params["sens"])

243 

244 

def process_zh_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Copy the data of a ``zh-lien`` template into ``linkage_data``.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:zh-lien

    Positional parameter 1 is stored as the word, parameter 2 (pinyin) as
    ``roman`` and parameter 3, when it cleans to a non-empty string, as
    ``alt``.
    """
    params = node.template_parameters
    linkage_data.word = clean_node(wxr, None, params.get(1))
    pinyin = clean_node(wxr, None, params.get(2, ""))
    linkage_data.roman = pinyin
    traditional_form = clean_node(wxr, None, params.get(3, ""))
    if traditional_form:
        linkage_data.alt = traditional_form

260 

261 

def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    """Expand a ``voir anagrammes`` template and return its anagrams.

    Template documentation:
    https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes

    The template is converted back to wikitext, parsed with all templates
    expanded, and every link found directly inside a list item of the
    result yields one ``Linkage``.  Links whose cleaned text is empty are
    skipped.
    """
    wikitext = wxr.wtp.node_to_wikitext(node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    words = (
        clean_node(wxr, None, link_node)
        for list_item in expanded_node.find_child_recursively(
            NodeKind.LIST_ITEM
        )
        for link_node in list_item.find_child(NodeKind.LINK)
    )
    return [Linkage(word=word) for word in words if len(word) > 0]