Coverage for src/wiktextract/extractor/fr/linkage.py: 93%

148 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..share import capture_text_in_parentheses 

8from .models import Form, Linkage, WordEntry 

9from .section_types import LINKAGE_SECTIONS, LINKAGE_TAGS 

10from .tags import translate_raw_tags 

11 

12 

def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    section_type: str,
) -> None:
    """Route a linkage section to the extractor that matches its type.

    * ``"dérivés autres langues"`` is handed to
      ``process_derives_autres_list``.
    * ``"anagrammes"`` reads every ``voir anagrammes`` template directly
      under ``level_node`` and adds the resulting anagrams to each entry of
      ``page_data`` whose ``lang_code`` equals that of the last entry.
    * Any other type is looked up in ``LINKAGE_SECTIONS`` (and optionally
      ``LINKAGE_TAGS``) and extracted into the last entry of ``page_data``.
    """
    if section_type == "dérivés autres langues":
        process_derives_autres_list(wxr, page_data, level_node)
        return

    if section_type != "anagrammes":
        extract_linkage_section(
            wxr,
            page_data[-1],
            level_node,
            LINKAGE_SECTIONS[section_type],
            LINKAGE_TAGS.get(section_type, []),
        )
        return

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name != "voir anagrammes":
            continue
        anagrams = process_voir_anagrammes_template(wxr, template)
        for entry in page_data:
            # Compared per entry against the most recent entry's language.
            if entry.lang_code == page_data[-1].lang_code:
                entry.anagrams.extend(anagrams)

36 

37 

def process_derives_autres_list(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
    """Collect the "derived in other languages" list into the last entry.

    Every list item found recursively under ``level_node`` is scanned for
    template and link children, in order:

    * an ``L`` template sets the language code (its first parameter) and
      the language name (the cleaned template text) used for the words
      that follow it in the same list item;
    * a plain link appends a ``Linkage`` whose word is the cleaned link;
    * an ``l`` / ``lien`` / ``zh-lien`` / ``zh-lien-t`` template appends a
      ``Linkage`` filled in by ``process_linkage_template``.

    Results are appended to ``page_data[-1].derived``.
    """
    link_templates = ("l", "lien", "zh-lien", "zh-lien-t")
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        # Language information resets for every list item.
        lang_code = ""
        lang_name = ""
        for node in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
            if isinstance(node, TemplateNode):
                if node.template_name == "L":
                    # Default to "" and run the value through clean_node so
                    # a bare {{L}} (no first parameter) or a non-string
                    # parameter value never reaches Linkage as None/raw
                    # node data.
                    lang_code = clean_node(
                        wxr, None, node.template_parameters.get(1, "")
                    )
                    lang_name = clean_node(wxr, None, node)
                elif node.template_name in link_templates:
                    linkage_data = Linkage(
                        lang_code=lang_code, lang=lang_name, word=""
                    )
                    process_linkage_template(wxr, node, linkage_data)
                    page_data[-1].derived.append(linkage_data)
            elif node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                page_data[-1].derived.append(
                    Linkage(lang_code=lang_code, lang=lang_name, word=word)
                )

65 

66 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    section_tags: list[str] | None = None,
):
    """Extract all linkage list items of one section into ``word_entry``.

    The direct children of ``level_node`` are walked in order while the
    "current sense" (text and index) is tracked; that sense is attached to
    every linkage list item that follows it:

    * a ``(`` template supplies the sense text (parameter 1, when not
      empty) and the sense index (parameter 2, when it is a decimal
      string);
    * a bold or italic node supplies the sense text;
    * a ``;`` or ``:`` list supplies the sense text, with an optional
      trailing ``(N)`` / ``(sens N)`` marker giving the sense index;
    * any other list holds the linkage items themselves, each processed by
      ``extract_linkage_list_item``.

    ``section_tags`` are the tags applied to every linkage of the section;
    ``None`` (the default) means no tags.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if section_tags is None:
        section_tags = []
    # Trailing sense-index marker of a description-list sense line.
    index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"

    sense_text = ""
    sense_index = 0
    for node in level_node.children:
        if isinstance(node, TemplateNode) and node.template_name == "(":
            new_sense_text = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
            if new_sense_text != "":
                sense_text = new_sense_text
            sense_index_text = node.template_parameters.get(2, "0")
            if (
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
        elif (
            isinstance(node, WikiNode)
            and node.kind in NodeKind.BOLD | NodeKind.ITALIC
        ):
            sense_text = clean_node(wxr, None, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            if node.sarg in [";", ":"]:
                # The sense can also be given in a ";" description list.
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    sense_text = clean_node(wxr, None, list_item.children)
                    m = re.search(index_pattern, sense_text)
                    if m is not None:
                        sense_text = re.sub(index_pattern, "", sense_text)
                        sense_index = int(m.group(1))
            else:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_linkage_list_item(
                        wxr,
                        word_entry,
                        list_item,
                        linkage_type,
                        section_tags,
                        sense_text,
                        sense_index,
                    )

115 

116 

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    section_tags: list[str],
    sense: str,
    sense_index: int,
):
    """Extract the linkage word(s) of one list item, then of its sub-lists.

    The children of ``list_item`` are scanned in order and accumulated
    into a ``Linkage`` that starts with the given ``section_tags``,
    ``sense`` and ``sense_index``:

    * link templates and plain links set the word;
    * a ``cf`` template abandons the whole list item;
    * italic text is the word (when it is the only non-empty child), a
      sense index (when decimal), a raw tag (inside an open parenthesis)
      or otherwise the sense;
    * ``réf`` templates and nested lists are skipped in this pass;
    * remaining text is treated as separators, parenthesised tags, a
      translation introduced by "—", or a sense introduced by ":".

    Finished linkages are stored with ``add_linkage_data``; nested lists
    are processed recursively with the same tags and sense.
    """

    def new_linkage() -> Linkage:
        # Each word of the list item starts from the same section context.
        return Linkage(
            word="", tags=section_tags, sense=sense, sense_index=sense_index
        )

    current = new_linkage()
    buffered_tag = ""  # text of a parenthesis that has not been closed yet
    in_parens = False
    for position, child in enumerate(list_item.children):
        is_template = isinstance(child, TemplateNode)
        is_wiki = isinstance(child, WikiNode)

        if is_template and child.template_name in (
            "l",
            "lien",
            "zh-lien",
            "zh-lien-t",
        ):
            process_linkage_template(wxr, child, current)
            continue
        if is_template and child.template_name == "cf":
            return
        if is_wiki and child.kind == NodeKind.LINK and not in_parens:
            current.word = clean_node(wxr, None, child)
            continue
        if is_wiki and child.kind == NodeKind.ITALIC:
            italic = clean_node(wxr, None, child).strip("()")
            if italic == "":
                pass
            elif len(list(list_item.filter_empty_str_child())) == 1:
                current.word = italic
            elif italic.isdecimal():
                current.sense_index = int(italic)
            elif in_parens:
                current.raw_tags.append(italic)
            else:
                current.sense = italic
            continue
        if (is_template and child.template_name == "réf") or (
            is_wiki and child.kind == NodeKind.LIST
        ):
            continue

        # Everything else is handled as text.
        text = (
            child
            if isinstance(child, str)
            else clean_node(wxr, word_entry, child)
        )
        stripped = text.strip()
        if stripped in {",", "/", "(ou"} and current.word != "":
            # The list item holds more than one word: store the finished
            # one and start a new linkage.
            add_linkage_data(word_entry, linkage_type, current)
            current = new_linkage()
            continue

        opens = stripped.startswith("(")
        closes = stripped.endswith(")")
        if opens and not closes:
            buffered_tag = text
            in_parens = True
            continue
        if closes and not opens:
            text = buffered_tag + text
            stripped = text.strip()
            buffered_tag = ""
            in_parens = False
        elif len(buffered_tag) > 0:
            buffered_tag += text
            continue

        if stripped.startswith("—"):
            # The rest of the item (nested lists excluded) is the
            # translation.
            remaining = list(list_item.invert_find_child(NodeKind.LIST, True))[
                position:
            ]
            current.translation = clean_node(wxr, None, remaining).strip(
                "— \n"
            )
            break
        if stripped.startswith(":"):
            current.sense = stripped.removeprefix(":").strip()
            continue

        captured, _ = capture_text_in_parentheses(text)
        for tag in captured:
            if tag.isdecimal():
                current.sense_index = int(tag)
            else:
                current.raw_tags.append(tag)

    if len(current.word) > 0:
        add_linkage_data(word_entry, linkage_type, current)
    for sub_list in list_item.find_child(NodeKind.LIST):
        for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
            extract_linkage_list_item(
                wxr,
                word_entry,
                sub_item,
                linkage_type,
                section_tags,
                sense,
                sense_index,
            )

243 

244 

def add_linkage_data(
    word_entry: WordEntry, l_type: str, l_data: Linkage
) -> None:
    """Store one linkage on ``word_entry`` under the field ``l_type``.

    Linkages with an empty word are ignored. Raw tags are translated
    first. For ``l_type == "forms"`` the linkage is converted to a ``Form``
    and appended to ``word_entry.forms``; otherwise the linkage itself is
    appended to the attribute of ``word_entry`` named ``l_type``.
    """
    if not l_data.word:
        return

    translate_raw_tags(l_data)
    if l_type != "forms":
        getattr(word_entry, l_type).append(l_data)
        return

    form = Form(
        form=l_data.word,
        tags=l_data.tags,
        raw_tags=l_data.raw_tags,
        roman=l_data.roman,
        sense=l_data.sense,
        sense_index=l_data.sense_index,
    )
    word_entry.forms.append(form)

264 

265 

def process_linkage_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill ``linkage_data`` from a link template, chosen by template name.

    ``lien`` / ``l`` go to ``process_lien_template``; names starting with
    ``zh-lien`` go to ``process_zh_lien_template``. Other templates leave
    ``linkage_data`` untouched.
    """
    name = node.template_name
    if name in ("lien", "l"):
        process_lien_template(wxr, node, linkage_data)
    elif name.startswith("zh-lien"):
        process_zh_lien_template(wxr, node, linkage_data)

275 

276 

def process_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill ``linkage_data`` from a ``lien`` / ``l`` link template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:lien

    * word: the ``dif`` parameter when present, otherwise positional
      parameter 1;
    * roman: the ``tr`` parameter, when present;
    * translation: the ``sens`` parameter, when present.
    """
    params = node.template_parameters
    # Fall back to "" rather than None when neither "dif" nor parameter 1
    # is given, so clean_node always receives a value of the kind it is
    # given elsewhere in this module.
    linkage_data.word = clean_node(
        wxr, None, params.get("dif", params.get(1, ""))
    )
    if "tr" in params:
        linkage_data.roman = clean_node(wxr, None, params["tr"])
    if "sens" in params:
        linkage_data.translation = clean_node(wxr, None, params["sens"])

297 

298 

def process_zh_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill ``linkage_data`` from a ``zh-lien`` style template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:zh-lien

    Positional parameter 1 is the word, 2 the romanization (pinyin) and 3
    the traditional form, stored in ``alt`` only when it is not empty.
    """
    params = node.template_parameters
    # Parameter 1 defaults to "" like parameters 2 and 3, so a template
    # without arguments yields an empty word instead of passing None to
    # clean_node.
    linkage_data.word = clean_node(wxr, None, params.get(1, ""))
    linkage_data.roman = clean_node(wxr, None, params.get(2, ""))  # pinyin
    traditional_form = clean_node(wxr, None, params.get(3, ""))
    if len(traditional_form) > 0:
        linkage_data.alt = traditional_form

314 

315 

def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    """Return the anagrams listed by a ``voir anagrammes`` template.

    Template documentation:
    https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes

    The template is converted back to wikitext and re-parsed with full
    expansion; each non-empty link found directly inside a list item of
    the expanded tree becomes one ``Linkage``.
    """
    wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    anagrams = []
    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        words = (
            clean_node(wxr, None, link)
            for link in item.find_child(NodeKind.LINK)
        )
        anagrams.extend(Linkage(word=w) for w in words if len(w) > 0)
    return anagrams