Coverage for src/wiktextract/extractor/fr/linkage.py: 95%

153 statements  

« prev     ^ index     » next       coverage.py v7.11.3, created at 2025-11-14 08:49 +0000

1import re 

2 

3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from ..ruby import extract_ruby 

8from ..share import capture_text_in_parentheses 

9from .models import Form, Linkage, WordEntry 

10from .section_types import LINKAGE_SECTIONS, LINKAGE_TAGS 

11from .tags import translate_raw_tags 

12 

13 

def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    section_type: str,
) -> None:
    """Extract linkage data from one section of a French Wiktionary page.

    For the "anagrammes" section, every "voir anagrammes" template found
    directly under ``level_node`` is expanded and the resulting anagrams are
    added to each entry in ``page_data`` that has the same ``lang_code`` as
    the last entry.  Any other section type is delegated to
    ``extract_linkage_section`` with the last entry as the target, using
    ``LINKAGE_SECTIONS`` for the destination field name and ``LINKAGE_TAGS``
    for optional section-wide tags.
    """
    if section_type != "anagrammes":
        extract_linkage_section(
            wxr,
            page_data[-1],
            level_node,
            LINKAGE_SECTIONS[section_type],
            LINKAGE_TAGS.get(section_type, []),
        )
        return

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name != "voir anagrammes":
            continue
        anagrams = process_voir_anagrammes_template(wxr, template)
        for entry in page_data:
            # Anagrams are shared by all entries of the current language.
            if entry.lang_code == page_data[-1].lang_code:
                entry.anagrams.extend(anagrams)

35 

36 

def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    section_tags: list[str] | None = None,
):
    """Walk the children of a linkage section and collect its entries.

    The function keeps a running "current sense" (text and index) that is
    updated by several kinds of nodes and then passed on to every following
    list item:

    * a ``(`` template: parameter 1 (if non-empty after cleaning) becomes the
      sense text, and parameter 2 (if a decimal string) the sense index;
    * a bold or italic node: its cleaned text becomes the sense text;
    * a ``;`` or ``:`` list: each item's cleaned text becomes the sense text,
      and a trailing ``(N)`` or ``(sens N)`` is stripped and used as the
      sense index.

    Items of any other list are handed to ``extract_linkage_list_item``.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the extracted linkages.
        level_node: Section node whose children are scanned.
        linkage_type: Name of the ``word_entry`` attribute to fill.
        section_tags: Tags applied to every linkage of the section.  Defaults
            to a fresh empty list for each call.
    """
    # A `[]` default would be a single list shared by every call.
    if section_tags is None:
        section_tags = []

    # Trailing "(2)" or "(sens 2)" marker in a description-list sense line.
    index_pattern = re.compile(r"\s*\((?:sens\s*)?(\d+)\)$")

    sense_text = ""
    sense_index = 0
    for node in level_node.children:
        if isinstance(node, TemplateNode) and node.template_name == "(":
            new_sense_text = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
            if new_sense_text != "":
                sense_text = new_sense_text
            sense_index_text = node.template_parameters.get(2, "0")
            # The parameter value is only used when it is a plain decimal
            # string; anything else leaves the index untouched.
            if (
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
        elif (
            isinstance(node, WikiNode)
            and node.kind in NodeKind.BOLD | NodeKind.ITALIC
        ):
            sense_text = clean_node(wxr, None, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # sense could also be in ";" description list
            if node.sarg in [";", ":"]:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    sense_text = clean_node(wxr, None, list_item.children)
                    m = index_pattern.search(sense_text)
                    if m is not None:
                        sense_text = index_pattern.sub("", sense_text)
                        sense_index = int(m.group(1))
            else:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_linkage_list_item(
                        wxr,
                        word_entry,
                        list_item,
                        linkage_type,
                        section_tags,
                        sense_text,
                        sense_index,
                    )

85 

86 

def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    section_tags: list[str],
    sense: str,
    sense_index: int,
):
    """Extract the linkage(s) described by one list item.

    The children of ``list_item`` are scanned in order while a single
    ``Linkage`` is filled in:

    * ``l`` / ``lien`` / ``zh-lien`` / ``zh-lien-t`` templates fill it through
      ``process_linkage_template``;
    * a ``zh-l`` template yields complete linkages that are stored directly;
    * a ``cf`` template aborts processing of the item;
    * a link outside an open parenthesis sets the word;
    * italic text is the word (when it is the only non-empty child), a sense
      index (when decimal), a raw tag (inside an open parenthesis) or else
      the sense;
    * ``réf`` templates and nested lists are skipped in the loop;
    * anything else is treated as text: ``,``, ``/`` or ``(ou`` after a word
      starts a new linkage, text split across nodes inside parentheses is
      joined, a leading ``—`` starts the translation, a leading ``:`` gives
      the sense, and parenthesised parts become a sense index or raw tags.

    After the loop the pending linkage is stored if it has a word, and the
    items of nested lists are processed recursively with the same section
    tags, sense and sense index.
    """
    current = Linkage(
        word="", tags=section_tags, sense=sense, sense_index=sense_index
    )
    unclosed_text = ""
    in_parentheses = False
    for position, child in enumerate(list_item.children):
        # Resolve the node's template name / kind once; `None` means
        # "not a template" / "not a wiki node" respectively.
        template_name = (
            child.template_name if isinstance(child, TemplateNode) else None
        )
        node_kind = child.kind if isinstance(child, WikiNode) else None

        if template_name in ("l", "lien", "zh-lien", "zh-lien-t"):
            process_linkage_template(wxr, child, current)
        elif template_name == "zh-l":
            zh_linkages = extract_zh_l_template(
                wxr, child, section_tags, sense, sense_index
            )
            for zh_linkage in zh_linkages:
                add_linkage_data(word_entry, linkage_type, zh_linkage)
        elif template_name == "cf":
            return
        elif node_kind == NodeKind.LINK and not in_parentheses:
            current.word = clean_node(wxr, None, child)
        elif node_kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, child).strip("()")
            if italic_text == "":
                continue
            if len(list(list_item.filter_empty_str_child())) == 1:
                # The italic node is the item's only content: it is the word.
                current.word = italic_text
            elif italic_text.isdecimal():
                current.sense_index = int(italic_text)
            elif in_parentheses:
                current.raw_tags.append(italic_text)
            else:
                current.sense = italic_text
        elif template_name == "réf" or node_kind == NodeKind.LIST:
            continue
        else:
            if isinstance(child, str):
                text = child
            else:
                text = clean_node(wxr, word_entry, child)
            stripped = text.strip()

            if stripped in {",", "/", "(ou"} and current.word != "":
                # list item has more than one word
                add_linkage_data(word_entry, linkage_type, current)
                current = Linkage(
                    word="",
                    tags=section_tags,
                    sense=sense,
                    sense_index=sense_index,
                )
                continue

            opens = stripped.startswith("(")
            closes = stripped.endswith(")")
            if opens and not closes:
                # Parenthesis opened here and closed in a later node.
                unclosed_text = text
                in_parentheses = True
                continue
            if closes and not opens:
                # Closing part: join it with whatever was collected so far.
                text = unclosed_text + text
                unclosed_text = ""
                in_parentheses = False
                stripped = text.strip()
            elif len(unclosed_text) > 0:
                unclosed_text += text
                continue

            if stripped.startswith("—"):
                # Everything from this child on is the translation.
                # NOTE(review): `position` indexes the full children list but
                # is applied to the `invert_find_child` result; presumably
                # nested lists only appear after the dash — confirm.
                remaining = list(
                    list_item.invert_find_child(NodeKind.LIST, True)
                )[position:]
                current.translation = clean_node(wxr, None, remaining).strip(
                    "— \n"
                )
                break
            elif stripped.startswith(":"):
                current.sense = stripped.removeprefix(":").strip()
            else:
                parenthesized, _ = capture_text_in_parentheses(text)
                for part in parenthesized:
                    if part.isdecimal():
                        current.sense_index = int(part)
                    else:
                        current.raw_tags.append(part)

    if len(current.word) > 0:
        add_linkage_data(word_entry, linkage_type, current)
    for nested_list in list_item.find_child(NodeKind.LIST):
        for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
            extract_linkage_list_item(
                wxr,
                word_entry,
                nested_item,
                linkage_type,
                section_tags,
                sense,
                sense_index,
            )

221 

222 

def add_linkage_data(
    word_entry: WordEntry, l_type: str, l_data: Linkage
) -> None:
    """Store ``l_data`` on ``word_entry`` under the attribute ``l_type``.

    Linkages without a word are ignored.  Raw tags are passed through
    ``translate_raw_tags`` before storing.  When ``l_type`` is "forms" the
    linkage is converted to a ``Form`` (word, tags, raw tags, romanization,
    sense and sense index) and appended to ``word_entry.forms``; otherwise
    the linkage itself is appended to the list attribute named ``l_type``.
    """
    if l_data.word == "":
        return

    translate_raw_tags(l_data)

    if l_type != "forms":
        getattr(word_entry, l_type).append(l_data)
        return

    form = Form(
        form=l_data.word,
        tags=l_data.tags,
        raw_tags=l_data.raw_tags,
        roman=l_data.roman,
        sense=l_data.sense,
        sense_index=l_data.sense_index,
    )
    word_entry.forms.append(form)

242 

243 

def process_linkage_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    """Dispatch a link template to the matching handler.

    ``lien`` and ``l`` go to ``process_lien_template``; any template whose
    name starts with ``zh-lien`` goes to ``process_zh_lien_template``.
    Other templates leave ``linkage_data`` untouched.
    """
    name = node.template_name
    if name in ("lien", "l"):
        handler = process_lien_template
    elif name.startswith("zh-lien"):
        handler = process_zh_lien_template
    else:
        return
    handler(wxr, node, linkage_data)

251 

252 

def process_lien_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    """Fill ``linkage_data`` from a ``lien`` / ``l`` template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:lien

    The displayed word comes from the ``dif`` parameter, falling back to the
    first positional parameter.  It is converted back to wikitext, re-parsed
    with full expansion and split by ``extract_ruby`` into ruby data and the
    remaining nodes, which are cleaned into the word.  ``tr`` supplies the
    romanization and ``sens`` the translation; both default to "".
    """
    params = node.template_parameters
    shown_value = params.get("dif", params.get(1))
    shown_wikitext = wxr.wtp.node_to_wikitext(shown_value)
    parsed = wxr.wtp.parse(shown_wikitext, expand_all=True)
    ruby, without_ruby = extract_ruby(wxr, parsed)

    linkage_data.word = clean_node(wxr, None, without_ruby)
    linkage_data.ruby = ruby
    linkage_data.roman = clean_node(wxr, None, params.get("tr", ""))
    linkage_data.translation = clean_node(wxr, None, params.get("sens", ""))

276 

277 

def process_zh_lien_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    """Fill ``linkage_data`` from a ``zh-lien`` style template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:zh-lien

    Positional parameter 1 is the word, 2 the romanization (pinyin) and 3 the
    traditional form, which is stored in ``alt`` only when non-empty.
    """
    params = node.template_parameters
    linkage_data.word = clean_node(wxr, None, params.get(1))
    # pinyin
    linkage_data.roman = clean_node(wxr, None, params.get(2, ""))
    traditional_form = clean_node(wxr, None, params.get(3, ""))
    if traditional_form != "":
        linkage_data.alt = traditional_form

291 

292 

def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    """Return the anagrams listed by a ``voir anagrammes`` template.

    Template documentation:
    https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes

    The template is expanded and re-parsed; every link found directly inside
    any list item of the expansion becomes a ``Linkage`` whose word is the
    cleaned link text.  Links that clean to an empty string are skipped.
    """
    wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    anagrams = []
    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        cleaned_links = (
            clean_node(wxr, None, link)
            for link in item.find_child(NodeKind.LINK)
        )
        anagrams.extend(
            Linkage(word=text) for text in cleaned_links if len(text) > 0
        )
    return anagrams

307 

308 

def extract_zh_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str] | None = None,
    sense: str = "",
    sense_index: int = 0,
) -> list[Linkage]:
    """Build linkages from a ``zh-l`` template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:zh-l

    Positional parameter 2 is cleaned into the romanization and parameter 3,
    when non-empty, overrides ``sense``.  The template is then expanded and
    every ``<span lang="zh">`` in the result whose cleaned text is non-empty
    yields one ``Linkage``.  When exactly two linkages are produced, the
    first is tagged "Traditional-Chinese" and the second
    "Simplified-Chinese".

    Args:
        wxr: Extraction context.
        t_node: The ``zh-l`` template node.
        raw_tags: Raw tags given to every produced linkage.  Defaults to a
            fresh empty list for each call.
        sense: Sense text used unless the template supplies its own.
        sense_index: Sense index given to every produced linkage.

    Returns:
        The linkages in the order their spans appear in the expansion.
    """
    # A `[]` default would be a single list shared by every call.
    if raw_tags is None:
        raw_tags = []

    roman = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    new_sense = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
    if new_sense != "":
        sense = new_sense

    l_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        word = clean_node(wxr, None, span_tag)
        if word == "":
            continue
        l_data = Linkage(
            word=word,
            sense=sense,
            sense_index=sense_index,
            raw_tags=raw_tags,
            roman=roman,
        )
        translate_raw_tags(l_data)
        l_list.append(l_data)

    # Two spans are treated as the traditional form followed by the
    # simplified form.
    if len(l_list) == 2:
        script_tags = ("Traditional-Chinese", "Simplified-Chinese")
        for l_data, script_tag in zip(l_list, script_tags):
            l_data.tags.append(script_tag)
    return l_list