Coverage for src/wiktextract/extractor/fr/linkage.py: 94%

152 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-17 05:52 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..ruby import extract_ruby 

8from ..share import capture_text_in_parentheses 

9from .models import Form, Linkage, WordEntry 

10from .section_types import LINKAGE_SECTIONS, LINKAGE_TAGS 

11from .tags import translate_raw_tags 

12 

13 

def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    section_type: str,
) -> None:
    """Extract the linkage data of one section into ``page_data``.

    An "anagrammes" section is read from its "voir anagrammes" templates; the
    anagrams found are added to every entry whose language code equals that
    of the last entry.  Any other section type is delegated to
    ``extract_linkage_section`` for the last entry only, using the linkage
    field and tags registered for that section type.
    """
    if section_type != "anagrammes":
        extract_linkage_section(
            wxr,
            page_data[-1],
            level_node,
            LINKAGE_SECTIONS[section_type],
            LINKAGE_TAGS.get(section_type, []),
        )
        return

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name != "voir anagrammes":
            continue
        anagrams = process_voir_anagrammes_template(wxr, t_node)
        for entry in page_data:
            # Anagrams are shared by all entries of the current language.
            if entry.lang_code == page_data[-1].lang_code:
                entry.anagrams.extend(anagrams)

35 

36 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    section_tags: list[str] | None = None,
):
    """Walk a linkage section and extract every list item it contains.

    The direct children of ``level_node`` are visited in order.  Some of them
    only update the "current sense" that applies to the list items that
    follow:

    * a "(" template: parameter 1 is the sense text (kept only when
      non-empty) and parameter 2 the sense index (defaults to "0");
    * a bold or italic node: its cleaned text becomes the sense text;
    * a ";" or ":" list: each item's text becomes the sense text, and a
      trailing "(N)" or "(sens N)" is removed and used as the sense index.

    Items of any other list are passed to ``extract_linkage_list_item``
    together with the current sense text and index.

    ``section_tags`` are the tags applied to every extracted entry; ``None``
    (the default) means no tags.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if section_tags is None:
        section_tags = []
    # Trailing sense number such as "(2)" or "(sens 2)".
    index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"
    sense_text = ""
    sense_index = 0
    for node in level_node.children:
        if isinstance(node, TemplateNode) and node.template_name == "(":
            new_sense_text = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
            if new_sense_text != "":
                sense_text = new_sense_text
            sense_index_text = node.template_parameters.get(2, "0")
            # The parameter may not be a plain string; only convert plain
            # decimal text.
            if (
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
        elif (
            isinstance(node, WikiNode)
            and node.kind in NodeKind.BOLD | NodeKind.ITALIC
        ):
            sense_text = clean_node(wxr, None, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # sense could also be in ";" description list
            if node.sarg in [";", ":"]:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    sense_text = clean_node(wxr, None, list_item.children)
                    m = re.search(index_pattern, sense_text)
                    if m is not None:
                        sense_text = re.sub(index_pattern, "", sense_text)
                        sense_index = int(m.group(1))
            else:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_linkage_list_item(
                        wxr,
                        word_entry,
                        list_item,
                        linkage_type,
                        section_tags,
                        sense_text,
                        sense_index,
                    )

85 

86 

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    section_tags: list[str],
    sense: str,
    sense_index: int,
) -> None:
    """Extract linkage entries from one list item onto ``word_entry``.

    The children of ``list_item`` are scanned in order while a single
    ``Linkage`` is being built.  Link templates and plain links set its word;
    italic nodes and free text supply the sense, sense index, raw tags or
    translation.  A separator (",", "/" or "(ou") seen after a word stores
    the current entry through ``add_linkage_data`` and starts a new one, so
    one list item can yield several entries.  After the scan the last entry
    is stored if it has a word, and nested lists are processed recursively
    with the same ``section_tags``, ``sense`` and ``sense_index``.

    A "cf" template makes the function return immediately: the entry under
    construction is not stored and nested lists are not visited.
    """
    linkage_data = Linkage(
        word="", tags=section_tags, sense=sense, sense_index=sense_index
    )
    # Text collected since an opening "(" whose closing ")" is still pending.
    pending_tag = ""
    # True between a text chunk that opens "(" and the chunk that closes it.
    inside_bracket = False
    for index, child_node in enumerate(list_item.children):
        if isinstance(
            child_node, TemplateNode
        ) and child_node.template_name in [
            "l",
            "lien",
            "zh-lien",
            "zh-lien-t",
        ]:
            # Link templates fill in the entry under construction.
            process_linkage_template(wxr, child_node, linkage_data)
        elif (
            isinstance(child_node, TemplateNode)
            and child_node.template_name == "zh-l"
        ):
            # "zh-l" returns its own list of entries, which is appended
            # directly rather than through add_linkage_data.
            # NOTE(review): section_tags is passed in the position of the
            # `raw_tags` parameter of extract_zh_l_template - confirm that
            # this is intended.
            getattr(word_entry, linkage_type).extend(
                extract_zh_l_template(
                    wxr, child_node, section_tags, sense, sense_index
                )
            )
        elif (
            isinstance(child_node, TemplateNode)
            and child_node.template_name == "cf"
        ):
            # Abandon the whole list item, including the pending entry.
            return
        elif (
            isinstance(child_node, WikiNode)
            and child_node.kind == NodeKind.LINK
            and not inside_bracket
        ):
            # A link inside an open parenthesis is not a word; it falls
            # through to the generic text branch below.
            linkage_data.word = clean_node(wxr, None, child_node)
        elif (
            isinstance(child_node, WikiNode)
            and child_node.kind == NodeKind.ITALIC
        ):
            italic_text = clean_node(wxr, None, child_node).strip("()")
            if italic_text == "":
                continue
            elif len(list(list_item.filter_empty_str_child())) == 1:
                # The italic node is the only child counted by
                # filter_empty_str_child, so it is taken as the word itself.
                linkage_data.word = italic_text
            elif italic_text.isdecimal():
                linkage_data.sense_index = int(italic_text)
            elif inside_bracket:
                linkage_data.raw_tags.append(italic_text)
            else:
                linkage_data.sense = italic_text
        elif (
            isinstance(child_node, TemplateNode)
            and child_node.template_name == "réf"
        ) or (
            isinstance(child_node, WikiNode)
            and child_node.kind == NodeKind.LIST
        ):
            # "réf" templates are ignored; nested lists are handled after
            # the loop.
            continue
        else:
            # Anything else is treated as text: plain strings are used as
            # is, other nodes are cleaned first.
            tag_text = (
                child_node
                if isinstance(child_node, str)
                else clean_node(wxr, word_entry, child_node)
            )
            if (
                tag_text.strip() in {",", "/", "(ou"}
                and linkage_data.word != ""
            ):
                # list item has more than one word
                add_linkage_data(word_entry, linkage_type, linkage_data)
                linkage_data = Linkage(
                    word="",
                    tags=section_tags,
                    sense=sense,
                    sense_index=sense_index,
                )
                continue
            if tag_text.strip().startswith(
                "("
            ) and not tag_text.strip().endswith(")"):
                # Parenthesis opened but not closed in this chunk: start
                # accumulating.
                pending_tag = tag_text
                inside_bracket = True
                continue
            elif not tag_text.strip().startswith(
                "("
            ) and tag_text.strip().endswith(")"):
                # Closing chunk: join it with what was accumulated and let
                # the combined text be handled below.
                tag_text = pending_tag + tag_text
                pending_tag = ""
                inside_bracket = False
            elif len(pending_tag) > 0:
                # Middle chunk of a still-open parenthesis.
                pending_tag += tag_text
                continue

            if tag_text.strip().startswith("—"):
                # A dash introduces the translation: everything from this
                # child onwards is cleaned as one text, and the scan stops.
                # NOTE(review): `index` counts all children while the sliced
                # list presumably omits LIST children; the two line up only
                # if no LIST child precedes this node - confirm.
                linkage_data.translation = clean_node(
                    wxr,
                    None,
                    list(list_item.invert_find_child(NodeKind.LIST, True))[
                        index:
                    ],
                ).strip("— \n")
                break
            elif tag_text.strip().startswith(":"):
                sense_text = tag_text.strip().removeprefix(":").strip()
                linkage_data.sense = sense_text
            else:
                # Parenthesised parts: a number is the sense index, anything
                # else is a raw tag.
                tags, _ = capture_text_in_parentheses(tag_text)
                for tag in tags:
                    if tag.isdecimal():
                        linkage_data.sense_index = int(tag)
                    else:
                        linkage_data.raw_tags.append(tag)

    if len(linkage_data.word) > 0:
        add_linkage_data(word_entry, linkage_type, linkage_data)
    for child_list in list_item.find_child(NodeKind.LIST):
        for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_linkage_list_item(
                wxr,
                word_entry,
                child_list_item,
                linkage_type,
                section_tags,
                sense,
                sense_index,
            )

222 

223 

def add_linkage_data(
    word_entry: WordEntry, l_type: str, l_data: Linkage
) -> None:
    """Store one finished linkage entry on ``word_entry``.

    Entries without a word are dropped.  Raw tags are translated first; the
    entry is then appended to the ``l_type`` attribute of ``word_entry``,
    except for "forms", where it is converted to a ``Form`` and appended to
    ``word_entry.forms``.
    """
    if l_data.word == "":
        return
    translate_raw_tags(l_data)

    if l_type != "forms":
        getattr(word_entry, l_type).append(l_data)
        return

    form_entry = Form(
        form=l_data.word,
        tags=l_data.tags,
        raw_tags=l_data.raw_tags,
        roman=l_data.roman,
        sense=l_data.sense,
        sense_index=l_data.sense_index,
    )
    word_entry.forms.append(form_entry)

243 

244 

def process_linkage_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    """Dispatch a link template to the handler for its template family.

    "lien" and "l" go to ``process_lien_template``; names starting with
    "zh-lien" go to ``process_zh_lien_template``.  Other templates are
    ignored.
    """
    name = node.template_name
    if name in ("l", "lien"):
        handler = process_lien_template
    elif name.startswith("zh-lien"):
        handler = process_zh_lien_template
    else:
        return
    handler(wxr, node, linkage_data)

252 

253 

def process_lien_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    """Fill ``linkage_data`` from a "lien"/"l" link template.

    The word is taken from the "dif" parameter when present, otherwise from
    parameter 1; it is expanded and split into ruby and non-ruby parts.  The
    "tr" parameter becomes the romanization and "sens" the translation; both
    are empty strings when absent.
    """
    # link word template: https://fr.wiktionary.org/wiki/Modèle:lien
    params = node.template_parameters
    # Default to "" like every other parameter lookup here, so a template
    # with neither "dif" nor parameter 1 does not pass None on to
    # node_to_wikitext.
    displayed = params.get("dif", params.get(1, ""))
    ruby, without_ruby = extract_ruby(
        wxr,
        wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(displayed),
            expand_all=True,
        ),
    )
    linkage_data.word = clean_node(wxr, None, without_ruby)
    linkage_data.ruby = ruby
    linkage_data.roman = clean_node(wxr, None, params.get("tr", ""))
    linkage_data.translation = clean_node(wxr, None, params.get("sens", ""))

277 

278 

def process_zh_lien_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    """Fill ``linkage_data`` from a "zh-lien" family template.

    Parameter 1 is the word, parameter 2 the romanization (pinyin) and
    parameter 3 the traditional form, which is stored in ``alt`` only when
    it is non-empty.
    """
    # https://fr.wiktionary.org/wiki/Modèle:zh-lien
    params = node.template_parameters
    # Default to "" like parameters 2 and 3, so a template without
    # parameter 1 does not pass None on to clean_node.
    linkage_data.word = clean_node(wxr, None, params.get(1, ""))
    linkage_data.roman = clean_node(wxr, None, params.get(2, ""))  # pinyin
    traditional_form = clean_node(wxr, None, params.get(3, ""))
    if len(traditional_form) > 0:
        linkage_data.alt = traditional_form

292 

293 

def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    """Return the anagrams listed by a "voir anagrammes" template.

    The template is expanded and re-parsed; every link that is a direct
    child of a list item anywhere in the expansion yields one ``Linkage``,
    unless its cleaned text is empty.
    """
    # https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes
    template_wikitext = wxr.wtp.node_to_wikitext(node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    anagrams = []
    for item in expanded_root.find_child_recursively(NodeKind.LIST_ITEM):
        for link in item.find_child(NodeKind.LINK):
            anagram = clean_node(wxr, None, link)
            if len(anagram) == 0:
                continue
            anagrams.append(Linkage(word=anagram))
    return anagrams

308 

309 

def extract_zh_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str] | None = None,
    sense: str = "",
    sense_index: int = 0,
) -> list[Linkage]:
    """Return the linkage entries produced by a "zh-l" template.

    Parameter 2 of the template is the romanization and parameter 3, when
    non-empty, replaces ``sense``.  The template is expanded and one entry is
    created for each non-empty ``<span lang="zh">`` in the expansion.  When
    exactly two entries result, the first is tagged "Traditional-Chinese"
    and the second "Simplified-Chinese".

    ``raw_tags`` are given to every entry before tag translation; ``None``
    (the default) means no raw tags.
    """
    # https://fr.wiktionary.org/wiki/Modèle:zh-l
    # A fresh list per call instead of a shared mutable default argument.
    base_raw_tags = [] if raw_tags is None else list(raw_tags)
    roman = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    new_sense = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
    if new_sense != "":
        sense = new_sense
    l_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        word = clean_node(wxr, None, span_tag)
        if word != "":
            l_data = Linkage(
                word=word,
                sense=sense,
                sense_index=sense_index,
                # Defensive copy: each entry gets its own list.
                raw_tags=list(base_raw_tags),
                roman=roman,
            )
            translate_raw_tags(l_data)
            l_list.append(l_data)
    if len(l_list) == 2:
        script_tags = ("Traditional-Chinese", "Simplified-Chinese")
        for l_data, script_tag in zip(l_list, script_tags):
            l_data.tags.append(script_tag)
    return l_list