Coverage for src/wiktextract/extractor/fr/linkage.py: 95%

153 statements  

coverage.py v7.13.0, created at 2025-12-12 08:09 +0000

import re

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from ..share import capture_text_in_parentheses
from .models import Form, Linkage, WordEntry
from .section_types import LINKAGE_SECTIONS, LINKAGE_TAGS
from .tags import translate_raw_tags


def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    section_type: str,
) -> None:
    if section_type == "anagrammes":
        for node in level_node.find_child(NodeKind.TEMPLATE):
            if node.template_name == "voir anagrammes":  # coverage: condition always true
                anagram_list = process_voir_anagrammes_template(wxr, node)
                for data in page_data:
                    if data.lang_code == page_data[-1].lang_code:  # coverage: condition always true
                        data.anagrams.extend(anagram_list)
    else:
        extract_linkage_section(
            wxr,
            page_data[-1],
            level_node,
            LINKAGE_SECTIONS[section_type],
            LINKAGE_TAGS.get(section_type, []),
        )


def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    section_tags: list[str] = [],
):
    sense_text = ""
    sense_index = 0
    for node in level_node.children:
        if isinstance(node, TemplateNode) and node.template_name == "(":
            new_sense_text = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
            if new_sense_text != "":  # coverage: condition always true
                sense_text = new_sense_text
            sense_index_text = node.template_parameters.get(2, "0")
            if (  # coverage: condition always true
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
        elif (
            isinstance(node, WikiNode)
            and node.kind in NodeKind.BOLD | NodeKind.ITALIC
        ):
            sense_text = clean_node(wxr, None, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # sense could also be in ";" description list
            if node.sarg in [";", ":"]:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    sense_text = clean_node(wxr, None, list_item.children)
                    index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"
                    m = re.search(index_pattern, sense_text)
                    if m is not None:  # coverage: condition always true
                        sense_text = re.sub(index_pattern, "", sense_text)
                        sense_index = int(m.group(1))
            else:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_linkage_list_item(
                        wxr,
                        word_entry,
                        list_item,
                        linkage_type,
                        section_tags,
                        sense_text,
                        sense_index,
                    )
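# The ";"/":" branch above peels a trailing sense index such as "(sens 2)" or
# "(3)" off the label text with index_pattern. A minimal standalone sketch of
# that behaviour (the sample strings are invented; only the standard library
# `re` module is used):

_index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"
for _text in ["Relatif au chien (sens 2)", "Couleur (3)", "Sans index"]:
    _m = re.search(_index_pattern, _text)
    _sense_index = int(_m.group(1)) if _m is not None else 0
    _sense_text = re.sub(_index_pattern, "", _text)
    print(repr(_sense_text), _sense_index)
# Expected output:
# 'Relatif au chien' 2
# 'Couleur' 3
# 'Sans index' 0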

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    section_tags: list[str],
    sense: str,
    sense_index: int,
):
    linkage_data = Linkage(
        word="", tags=section_tags, sense=sense, sense_index=sense_index
    )
    pending_tag = ""
    inside_bracket = False
    for index, child_node in enumerate(list_item.children):
        if isinstance(
            child_node, TemplateNode
        ) and child_node.template_name in [
            "l",
            "lien",
            "zh-lien",
            "zh-lien-t",
        ]:
            process_linkage_template(wxr, child_node, linkage_data)
        elif (
            isinstance(child_node, TemplateNode)
            and child_node.template_name == "zh-l"
        ):
            for l_data in extract_zh_l_template(
                wxr, child_node, section_tags, sense, sense_index
            ):
                add_linkage_data(word_entry, linkage_type, l_data)
        elif (
            isinstance(child_node, TemplateNode)
            and child_node.template_name == "cf"
        ):
            return
        elif (
            isinstance(child_node, WikiNode)
            and child_node.kind == NodeKind.LINK
            and not inside_bracket
        ):
            linkage_data.word = clean_node(wxr, None, child_node)
        elif (
            isinstance(child_node, WikiNode)
            and child_node.kind == NodeKind.ITALIC
        ):
            italic_text = clean_node(wxr, None, child_node).strip("()")
            if italic_text == "":  # coverage: condition never true
                continue
            elif len(list(list_item.filter_empty_str_child())) == 1:
                linkage_data.word = italic_text
            elif italic_text.isdecimal():  # coverage: condition never true
                linkage_data.sense_index = int(italic_text)
            elif inside_bracket:
                linkage_data.raw_tags.append(italic_text)
            else:
                linkage_data.sense = italic_text
        elif (
            isinstance(child_node, TemplateNode)
            and child_node.template_name == "réf"
        ) or (
            isinstance(child_node, WikiNode)
            and child_node.kind == NodeKind.LIST
        ):
            continue
        else:
            tag_text = (
                child_node
                if isinstance(child_node, str)
                else clean_node(wxr, word_entry, child_node)
            )
            if (
                tag_text.strip() in {",", "/", "(ou"}
                and linkage_data.word != ""
            ):
                # list item has more than one word
                add_linkage_data(word_entry, linkage_type, linkage_data)
                linkage_data = Linkage(
                    word="",
                    tags=section_tags,
                    sense=sense,
                    sense_index=sense_index,
                )
                continue
            if tag_text.strip().startswith(
                "("
            ) and not tag_text.strip().endswith(")"):
                pending_tag = tag_text
                inside_bracket = True
                continue
            elif not tag_text.strip().startswith(
                "("
            ) and tag_text.strip().endswith(")"):
                tag_text = pending_tag + tag_text
                pending_tag = ""
                inside_bracket = False
            elif len(pending_tag) > 0:
                pending_tag += tag_text
                continue

            if tag_text.strip().startswith("—"):
                linkage_data.translation = clean_node(
                    wxr,
                    None,
                    list(list_item.invert_find_child(NodeKind.LIST, True))[
                        index:
                    ],
                ).strip("— \n")
                break
            elif tag_text.lstrip().startswith(":"):
                linkage_data.sense = clean_node(
                    wxr,
                    None,
                    [tag_text.lstrip().removeprefix(":").lstrip()]
                    + [
                        n
                        for n in list_item.children[index + 1 :]
                        if not (
                            (
                                isinstance(n, TemplateNode)
                                and n.template_name == "réf"
                            )
                            or (
                                isinstance(n, WikiNode)
                                and n.kind == NodeKind.LIST
                            )
                        )
                    ],
                )
                break
            else:
                tags, _ = capture_text_in_parentheses(tag_text)
                for tag in tags:
                    if tag.isdecimal():
                        linkage_data.sense_index = int(tag)
                    else:
                        linkage_data.raw_tags.append(tag)

    if len(linkage_data.word) > 0:
        add_linkage_data(word_entry, linkage_type, linkage_data)
    for child_list in list_item.find_child(NodeKind.LIST):
        for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_linkage_list_item(
                wxr,
                word_entry,
                child_list_item,
                linkage_type,
                section_tags,
                sense,
                sense_index,
            )
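# The pending_tag / inside_bracket bookkeeping above rejoins a parenthesised
# qualifier that the parser split across several child nodes before it is
# recorded as a raw tag. A simplified sketch of just that accumulation step,
# using invented plain-string fragments in place of cleaned wiki nodes (the
# real function also handles word separators, "—" translations and ":" glosses):

_fragments = ["(Argot", " militaire)", ", ", "(Vieilli)"]
_raw_tags: list[str] = []
_pending = ""
for _text in _fragments:
    _stripped = _text.strip()
    if _stripped.startswith("(") and not _stripped.endswith(")"):
        _pending = _text          # opening half of a parenthesised tag
        continue
    if not _stripped.startswith("(") and _stripped.endswith(")"):
        _text = _pending + _text  # closing half: rejoin with the opening part
        _pending = ""
    elif _pending:
        _pending += _text         # middle fragment, keep accumulating
        continue
    _tag = _text.strip().strip("(),")
    if _tag:
        _raw_tags.append(_tag)
print(_raw_tags)  # ['Argot militaire', 'Vieilli']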

def add_linkage_data(
    word_entry: WordEntry, l_type: str, l_data: Linkage
) -> None:
    if l_data.word == "":  # coverage: condition never true
        return
    translate_raw_tags(l_data)
    if l_type == "forms":
        word_entry.forms.append(
            Form(
                form=l_data.word,
                tags=l_data.tags,
                raw_tags=l_data.raw_tags,
                roman=l_data.roman,
                sense=l_data.sense,
                sense_index=l_data.sense_index,
            )
        )
    else:
        getattr(word_entry, l_type).append(l_data)


def process_linkage_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    if node.template_name in ["lien", "l"]:
        process_lien_template(wxr, node, linkage_data)
    elif node.template_name.startswith("zh-lien"):  # coverage: condition always true
        process_zh_lien_template(wxr, node, linkage_data)


def process_lien_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    # link word template: https://fr.wiktionary.org/wiki/Modèle:lien
    ruby, without_ruby = extract_ruby(
        wxr,
        wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                node.template_parameters.get(
                    "dif", node.template_parameters.get(1)
                )
            ),
            expand_all=True,
        ),
    )
    linkage_data.word = clean_node(wxr, None, without_ruby)
    linkage_data.ruby = ruby
    linkage_data.roman = clean_node(
        wxr, None, node.template_parameters.get("tr", "")
    )
    linkage_data.translation = clean_node(
        wxr, None, node.template_parameters.get("sens", "")
    )


def process_zh_lien_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    # https://fr.wiktionary.org/wiki/Modèle:zh-lien
    linkage_data.word = clean_node(wxr, None, node.template_parameters.get(1))
    linkage_data.roman = clean_node(
        wxr, None, node.template_parameters.get(2, "")
    )  # pinyin
    traditional_form = clean_node(
        wxr, None, node.template_parameters.get(3, "")
    )
    if len(traditional_form) > 0:
        linkage_data.alt = traditional_form


def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    # https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes
    results = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        for link_node in list_item.find_child(NodeKind.LINK):
            word = clean_node(wxr, None, link_node)
            if len(word) > 0:  # coverage: condition always true
                results.append(Linkage(word=word))
    return results


def extract_zh_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str] = [],
    sense: str = "",
    sense_index: int = 0,
) -> list[Linkage]:
    # https://fr.wiktionary.org/wiki/Modèle:zh-l
    roman = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    new_sense = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
    if new_sense != "":
        sense = new_sense
    l_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        word = clean_node(wxr, None, span_tag)
        if word != "":  # coverage: condition always true
            l_data = Linkage(
                word=word,
                sense=sense,
                sense_index=sense_index,
                raw_tags=raw_tags,
                roman=roman,
            )
            translate_raw_tags(l_data)
            l_list.append(l_data)
    if len(l_list) == 2:
        for index, l_data in enumerate(l_list):
            if index == 0:
                l_data.tags.append("Traditional-Chinese")
            else:
                l_data.tags.append("Simplified-Chinese")
    return l_list
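# When the expanded {{zh-l}} markup yields exactly two "lang=zh" spans, the
# first is treated as the traditional form and the second as the simplified
# form. A plain-dict sketch of that tagging rule (the Hanzi pair is an
# invented example; the real code works on Linkage models):

_spans = ["學生", "学生"]  # order as produced by the expanded template
_entries = [{"word": _w, "tags": []} for _w in _spans]
if len(_entries) == 2:
    _entries[0]["tags"].append("Traditional-Chinese")
    _entries[1]["tags"].append("Simplified-Chinese")
print(_entries)
# [{'word': '學生', 'tags': ['Traditional-Chinese']}, {'word': '学生', 'tags': ['Simplified-Chinese']}]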