Coverage for src/wiktextract/extractor/fr/linkage.py: 94%

136 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import re 

2 

3from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import capture_text_in_parentheses 

8from .models import Form, Linkage, WordEntry 

9from .section_types import LINKAGE_SECTIONS, LINKAGE_TAGS 

10from .tags import translate_raw_tags 

11 

12 

def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    section_type: str,
) -> None:
    """Dispatch a linkage section to the extractor matching its type.

    * ``"dérivés autres langues"`` sections are handed to
      ``process_derives_autres_list``.
    * ``"anagrammes"`` sections are read from their ``voir anagrammes``
      templates; the resulting anagrams are added to every entry of
      ``page_data`` whose ``lang_code`` equals that of the last entry.
    * Every other section type goes to ``process_linkage_list`` with the
      target field taken from ``LINKAGE_SECTIONS`` and the optional section
      tags taken from ``LINKAGE_TAGS``.
    """
    if section_type == "dérivés autres langues":
        process_derives_autres_list(wxr, page_data, level_node)
        return

    if section_type != "anagrammes":
        process_linkage_list(
            wxr,
            page_data,
            level_node,
            LINKAGE_SECTIONS[section_type],
            LINKAGE_TAGS.get(section_type, []),
        )
        return

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name != "voir anagrammes":
            continue
        anagrams = process_voir_anagrammes_template(wxr, template)
        for entry in page_data:
            # Only entries in the same language as the most recent entry
            # receive the anagrams.
            if entry.lang_code == page_data[-1].lang_code:
                entry.anagrams.extend(anagrams)

36 

37 

def process_derives_autres_list(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
):
    """Collect words derived into other languages from the section's lists.

    Each list item is scanned for templates and links, in order:

    * an ``L`` template sets the language code (its first parameter) and
      the language name (the cleaned template text) used for whatever
      follows in the same list item;
    * a plain link contributes its cleaned text as a derived word;
    * an ``l`` / ``lien`` / ``zh-lien`` / ``zh-lien-t`` template is parsed
      by ``process_linkage_template``.

    Results are appended to ``page_data[-1].derived``. Entries whose word
    turns out empty are skipped, matching how the other extractors in this
    module treat empty words.
    """
    link_templates = ("l", "lien", "zh-lien", "zh-lien-t")
    # "derived into other languages" list
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        lang_code = ""
        lang_name = ""
        for node in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
            if isinstance(node, TemplateNode) and node.template_name == "L":
                # Clean the parameter instead of storing it raw: the value
                # may be missing or may not be a plain string, and
                # ``Linkage.lang_code`` is expected to hold text.
                lang_code = clean_node(
                    wxr, None, node.template_parameters.get(1, "")
                )
                lang_name = clean_node(wxr, None, node)
            elif node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                if word != "":
                    page_data[-1].derived.append(
                        Linkage(lang_code=lang_code, lang=lang_name, word=word)
                    )
            elif (
                isinstance(node, TemplateNode)
                and node.template_name in link_templates
            ):
                linkage_data = Linkage(
                    lang_code=lang_code, lang=lang_name, word=""
                )
                process_linkage_template(wxr, node, linkage_data)
                if linkage_data.word != "":
                    page_data[-1].derived.append(linkage_data)

67 

68 

def process_linkage_list(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    linkage_type: str,
    section_tags: list[str] = [],
) -> None:
    """Extract linkage entries from the lists of a linkage section.

    Walks every list item and template below ``level_node``. ``(`` templates
    and ``;`` / ``:`` list items only update the current sense text and
    sense index; every other node is scanned child by child to build one or
    more ``Linkage`` objects, which are stored through ``add_linkage_data``
    on ``page_data[-1]`` under ``linkage_type``.

    ``section_tags`` is never mutated here: each ``Linkage`` receives its
    own copy, so neither the caller's list nor the shared default is
    aliased by the created objects.
    """
    # Trailing "(1)" / "(sens 1)" marker at the end of a sense heading.
    index_pattern = re.compile(r"\s*\((?:sens\s*)?(\d+)\)$")
    link_templates = ("l", "lien", "zh-lien", "zh-lien-t")
    sense_text = ""
    sense_index = 0
    for template_or_list_node in level_node.find_child_recursively(
        NodeKind.LIST_ITEM | NodeKind.TEMPLATE
    ):
        # list table start template: https://fr.wiktionary.org/wiki/Modèle:(
        if (
            isinstance(template_or_list_node, TemplateNode)
            and template_or_list_node.template_name == "("
        ):
            sense_text = clean_node(
                wxr, None, template_or_list_node.template_parameters.get(1, "")
            )
            sense_index_text = template_or_list_node.template_parameters.get(
                2, "0"
            )
            if (
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
            continue
        # sense could also be in ";" description list
        if (
            template_or_list_node.kind == NodeKind.LIST_ITEM
            and template_or_list_node.sarg in {";", ":"}
        ):
            sense_text = clean_node(wxr, None, template_or_list_node.children)
            m = index_pattern.search(sense_text)
            if m is not None:
                sense_text = index_pattern.sub("", sense_text)
                sense_index = int(m.group(1))
            continue

        linkage_data = Linkage(word="", tags=list(section_tags))
        if len(sense_text) > 0:
            linkage_data.sense = sense_text
        if sense_index != 0:
            linkage_data.sense_index = sense_index
        pending_tag = ""
        inside_bracket = False
        # Children with nested lists removed; materialised once so the
        # "—" branch below can slice it instead of rebuilding the list.
        child_nodes = list(
            template_or_list_node.invert_find_child(NodeKind.LIST, True)
        )
        for index, child_node in enumerate(child_nodes):
            if (
                isinstance(child_node, TemplateNode)
                and child_node.template_name in link_templates
            ):
                process_linkage_template(wxr, child_node, linkage_data)
            elif (
                isinstance(child_node, WikiNode)
                and child_node.kind == NodeKind.LINK
                and not inside_bracket
            ):
                linkage_data.word = clean_node(wxr, None, child_node)
            elif (
                isinstance(child_node, WikiNode)
                and child_node.kind == NodeKind.ITALIC
            ):
                current_sense = clean_node(wxr, None, child_node).strip("()")
                if (
                    len(list(template_or_list_node.filter_empty_str_child()))
                    == 1
                ):
                    # The italic node is the only non-empty child, so it
                    # is taken as the word itself.
                    linkage_data.word = current_sense
                elif current_sense.isdecimal():
                    linkage_data.sense_index = int(current_sense)
                else:
                    linkage_data.sense = current_sense
            elif (
                isinstance(child_node, TemplateNode)
                and child_node.template_name == "réf"
            ):
                continue
            else:
                tag_text = (
                    child_node
                    if isinstance(child_node, str)
                    else clean_node(wxr, page_data[-1], child_node)
                )
                stripped = tag_text.strip()
                if stripped in {",", "/", "(ou"} and linkage_data.word != "":
                    # list item has more than one word
                    add_linkage_data(page_data[-1], linkage_type, linkage_data)
                    linkage_data = Linkage(word="", tags=list(section_tags))
                    continue
                if stripped.startswith("(") and not stripped.endswith(")"):
                    # Opening of a parenthesised group that continues in
                    # later children: buffer it until the closing part.
                    pending_tag = tag_text
                    inside_bracket = True
                    continue
                elif not stripped.startswith("(") and stripped.endswith(")"):
                    tag_text = pending_tag + tag_text
                    stripped = tag_text.strip()
                    pending_tag = ""
                    inside_bracket = False
                elif len(pending_tag) > 0:
                    pending_tag += tag_text
                    continue

                if stripped.startswith("—"):
                    # Everything from this child onwards is the translation.
                    linkage_data.translation = clean_node(
                        wxr, None, child_nodes[index:]
                    ).strip("— ")
                    break
                elif stripped.startswith(":"):
                    # Note: this also replaces the running sense text used
                    # for the following list items.
                    sense_text = stripped.removeprefix(":").strip()
                    linkage_data.sense = sense_text
                else:
                    tags, _ = capture_text_in_parentheses(tag_text)
                    for tag in tags:
                        if tag.isdecimal():
                            linkage_data.sense_index = int(tag)
                        else:
                            linkage_data.raw_tags.append(tag)

        if len(linkage_data.word) > 0:
            add_linkage_data(page_data[-1], linkage_type, linkage_data)

209 

210 

def add_linkage_data(
    word_entry: WordEntry, l_type: str, l_data: Linkage
) -> None:
    """Store one linkage on ``word_entry`` under the field named ``l_type``.

    Linkages with an empty word are ignored. Otherwise the raw tags are
    first run through ``translate_raw_tags``. When ``l_type`` is
    ``"forms"`` the linkage is converted to a ``Form`` and appended to
    ``word_entry.forms``; for any other value the linkage itself is
    appended to the attribute of that name.
    """
    if l_data.word == "":
        return

    translate_raw_tags(l_data)

    if l_type != "forms":
        getattr(word_entry, l_type).append(l_data)
        return

    new_form = Form(
        form=l_data.word,
        tags=l_data.tags,
        raw_tags=l_data.raw_tags,
        roman=l_data.roman,
        sense=l_data.sense,
        sense_index=l_data.sense_index,
    )
    word_entry.forms.append(new_form)

230 

231 

def process_linkage_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill ``linkage_data`` from a link template.

    ``lien`` and ``l`` templates are handled by ``process_lien_template``;
    templates whose name starts with ``zh-lien`` are handled by
    ``process_zh_lien_template``. Any other template is ignored.
    """
    name = node.template_name
    if name == "lien" or name == "l":
        handler = process_lien_template
    elif name.startswith("zh-lien"):
        handler = process_zh_lien_template
    else:
        return
    handler(wxr, node, linkage_data)

241 

242 

def process_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill ``linkage_data`` from a ``lien`` / ``l`` template.

    The word is taken from the ``dif`` parameter when present, otherwise
    from the first positional parameter. The optional ``tr`` parameter is
    stored as ``roman`` and the optional ``sens`` parameter as
    ``translation``.
    """
    # link word template: https://fr.wiktionary.org/wiki/Modèle:lien
    params = node.template_parameters
    # Fall back to "" so clean_node is never handed None when the template
    # has neither "dif" nor a first positional parameter.
    linkage_data.word = clean_node(
        wxr, None, params.get("dif", params.get(1, ""))
    )
    if "tr" in params:
        linkage_data.roman = clean_node(wxr, None, params.get("tr"))
    if "sens" in params:
        linkage_data.translation = clean_node(wxr, None, params.get("sens"))

263 

264 

def process_zh_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill ``linkage_data`` from a ``zh-lien`` style template.

    Positional parameter 1 is the word, parameter 2 the romanization
    (pinyin) and parameter 3 the traditional form, which is stored in
    ``alt`` only when it is non-empty.
    """
    # https://fr.wiktionary.org/wiki/Modèle:zh-lien
    params = node.template_parameters
    # Default to "" like parameters 2 and 3 so that a template without a
    # first parameter does not pass None to clean_node.
    linkage_data.word = clean_node(wxr, None, params.get(1, ""))
    linkage_data.roman = clean_node(wxr, None, params.get(2, ""))  # pinyin
    traditional_form = clean_node(wxr, None, params.get(3, ""))
    if len(traditional_form) > 0:
        linkage_data.alt = traditional_form

280 

281 

def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    """Expand a ``voir anagrammes`` template and return its anagrams.

    The template is converted back to wikitext, parsed again with all
    templates expanded, and every link found directly inside a list item
    of the result yields one ``Linkage``. Links whose cleaned text is
    empty are skipped.
    """
    # https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes
    template_wikitext = wxr.wtp.node_to_wikitext(node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)

    anagrams = []
    for item in expanded_root.find_child_recursively(NodeKind.LIST_ITEM):
        for link in item.find_child(NodeKind.LINK):
            text = clean_node(wxr, None, link)
            if len(text) == 0:
                continue
            anagrams.append(Linkage(word=text))
    return anagrams