Coverage for src/wiktextract/extractor/fr/etymology.py: 95%

126 statements  

coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

  1  from collections import defaultdict
  2  from dataclasses import dataclass, field
  3
  4  from wikitextprocessor.parser import (
  5      LEVEL_KIND_FLAGS,
  6      LevelNode,
  7      NodeKind,
  8      TemplateNode,
  9      WikiNode,
 10  )
 11
 12  from ...page import clean_node
 13  from ...wxr_context import WiktextractContext
 14  from .models import Example, WordEntry
 15
 16

 17  @dataclass
 18  class EtymologyData:
 19      texts: list[str] = field(default_factory=list)
 20      categories: list[str] = field(default_factory=list)
 21
 22
 23  EtymologyDict = dict[tuple[str, str], EtymologyData]
 24
 25

 26  def extract_etymology(
 27      wxr: WiktextractContext, level_node: LevelNode, base_data: WordEntry
 28  ) -> EtymologyDict:
 29      etymology_dict: EtymologyDict = defaultdict(EtymologyData)
 30      level_node_index = len(level_node.children)
 31      pos_id = ""
 32      pos_title = ""
 33      for node_index, node in level_node.find_child(
 34          NodeKind.LIST | LEVEL_KIND_FLAGS, True
 35      ):
 36          if node.kind in LEVEL_KIND_FLAGS and node_index < level_node_index:
 37              level_node_index = node_index
 38          elif node.kind == NodeKind.LIST:  [38 ↛ 33] line 38 didn't jump to line 33 because the condition on line 38 was always true
 39              for etymology_item in node.find_child(NodeKind.LIST_ITEM):
 40                  pos_id, pos_title = extract_etymology_list_item(
 41                      wxr, etymology_item, etymology_dict, pos_id, pos_title
 42                  )
 43
 44      if len(etymology_dict) == 0:
 45          categories = {}
 46          etymology_text = clean_node(
 47              wxr, categories, level_node.children[:level_node_index]
 48          )
 49          if len(etymology_text) > 0:  [49 ↛ 55] line 49 didn't jump to line 55 because the condition on line 49 was always true
 50              etymology_dict[("", "")].texts.append(etymology_text)
 51              etymology_dict[(pos_id, pos_title)].categories.extend(
 52                  categories.get("categories", [])
 53              )
 54
 55      if ("", "") in etymology_dict and etymology_dict.get(("", "")).texts == [
 56          " "
 57      ]:
 58          # remove "ébauche-étym" template placeholder
 59          del etymology_dict[("", "")]
 60
 61      return etymology_dict
 62
 63

 64  def extract_etymology_list_item(
 65      wxr: WiktextractContext,
 66      list_item: WikiNode,
 67      etymology_dict: EtymologyDict,
 68      pos_id: str,
 69      pos_title: str,
 70  ) -> tuple[str, str]:
 71      etymology_data = find_pos_in_etymology_list(wxr, list_item)
 72      if etymology_data is not None:
 73          pos_id, pos_title, etymology_text, categories = etymology_data
 74          if len(etymology_text) > 0:
 75              etymology_dict[(pos_id, pos_title)].texts.append(etymology_text)
 76              etymology_dict[(pos_id, pos_title)].categories.extend(categories)
 77      else:
 78          categories = {}
 79          etymology_text = clean_node(
 80              wxr,
 81              categories,
 82              list(
 83                  list_item.invert_find_child(
 84                      NodeKind.LIST, include_empty_str=True
 85                  )
 86              ),
 87          )
 88          if len(etymology_text) > 0:  [88 ↛ 94] line 88 didn't jump to line 94 because the condition on line 88 was always true
 89              etymology_dict[(pos_id, pos_title)].texts.append(etymology_text)
 90              etymology_dict[(pos_id, pos_title)].categories.extend(
 91                  categories.get("categories", [])
 92              )
 93
 94      for child_list in list_item.find_child(NodeKind.LIST):
 95          for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
 96              extract_etymology_list_item(
 97                  wxr, child_list_item, etymology_dict, pos_id, pos_title
 98              )
 99
100      return pos_id, pos_title
101
102

103  def find_pos_in_etymology_list(
104      wxr: WiktextractContext, list_item_node: WikiNode
105  ) -> tuple[str, str, str, list[str]] | None:
106      """
107      Return a tuple of POS id, title, etymology text and categories if the
108      passed list item node starts with an italic POS node or a POS template;
109      otherwise return `None`.
110      """
111      for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
112          if template_node.template_name == "ébauche-étym":
113              return ("", "", " ", [])  # missing etymology
114
115      categories = {}
116
117      for index, node in list_item_node.find_child(
118          NodeKind.TEMPLATE | NodeKind.LINK | NodeKind.ITALIC, True
119      ):
120          if isinstance(node, TemplateNode) and node.template_name in (
121              "lien-ancre-étym",
122              "laé",
123          ):
124              expanded_template = wxr.wtp.parse(
125                  wxr.wtp.node_to_wikitext(node), expand_all=True
126              )
127              for italic_node in expanded_template.find_child(NodeKind.ITALIC):  [127 ↛ 117] line 127 didn't jump to line 117 because the loop on line 127 didn't complete
128                  for link_node in italic_node.find_child(NodeKind.LINK):  [128 ↛ 127] line 128 didn't jump to line 127 because the loop on line 128 didn't complete
129                      if isinstance(  [129 ↛ 128] line 129 didn't jump to line 128 because the condition on line 129 was always true
130                          link_node.largs[0][0], str
131                      ) and link_node.largs[0][0].startswith("#"):
132                          pos_id = link_node.largs[0][0].removeprefix("#")
133                          return (
134                              pos_id,
135                              clean_node(wxr, None, link_node).strip(": "),
136                              clean_node(
137                                  wxr,
138                                  categories,
139                                  [
140                                      n
141                                      for n in list_item_node.children[
142                                          index + 1 :
143                                      ]
144                                      if not (
145                                          isinstance(n, WikiNode)
146                                          and n.kind == NodeKind.LIST
147                                      )
148                                  ],
149                              ),
150                              categories.get("categories", []),
151                          )
152          elif (
153              node.kind == NodeKind.LINK
154              and isinstance(node.largs[0][0], str)
155              and node.largs[0][0].startswith("#")
156          ):
157              pos_id = node.largs[0][0].removeprefix("#")
158              return (
159                  pos_id,
160                  clean_node(wxr, None, node).strip(": "),
161                  clean_node(
162                      wxr,
163                      categories,
164                      [
165                          n
166                          for n in list_item_node.children[index + 1 :]
167                          if not (
168                              isinstance(n, WikiNode) and n.kind == NodeKind.LIST
169                          )
170                      ],
171                  ),
172                  categories.get("categories", []),
173              )
174          elif node.kind == NodeKind.ITALIC:
175              for link_node in node.find_child(NodeKind.LINK):
176                  if isinstance(link_node.largs[0][0], str) and link_node.largs[
177                      0
178                  ][0].startswith("#"):
179                      pos_id = link_node.largs[0][0].removeprefix("#")
180                      return (
181                          pos_id,
182                          clean_node(wxr, None, link_node).strip(": "),
183                          clean_node(
184                              wxr,
185                              categories,
186                              [
187                                  n
188                                  for n in list_item_node.children[index + 1 :]
189                                  if not (
190                                      isinstance(n, WikiNode)
191                                      and n.kind == NodeKind.LIST
192                                  )
193                              ],
194                          ).lstrip(") "),
195                          categories.get("categories", []),
196                      )
197              italic_text = clean_node(wxr, None, node)
198              if (
199                  index <= 1  # first node is empty string
200                  and italic_text.startswith("(")
201                  and italic_text.endswith(")")
202              ):
203                  return (
204                      "",
205                      italic_text.strip("() "),
206                      clean_node(
207                          wxr,
208                          categories,
209                          [
210                              n
211                              for n in list_item_node.children[index + 1 :]
212                              if not (
213                                  isinstance(n, WikiNode)
214                                  and n.kind == NodeKind.LIST
215                              )
216                          ],
217                      ),
218                      categories.get("categories", []),
219                  )
220
221

222  def insert_etymology_data(
223      lang_code: str, page_data: list[WordEntry], etymology_dict: EtymologyDict
224  ) -> None:
225      """
226      Insert the etymology data extracted from the level-3 node into each
227      sense dictionary that matches the language and POS.
228      """
229      sense_dict = defaultdict(list)  # group by pos title and id
230      for sense_data in page_data:
231          if sense_data.lang_code == lang_code:
232              sense_dict[sense_data.pos_title].append(sense_data)
233              sense_dict[sense_data.pos_id].append(sense_data)
234              if sense_data.pos_id.endswith("-1"):
235                  # extra ids for the first title
236                  sense_dict[sense_data.pos_title.replace(" ", "_")].append(
237                      sense_data
238                  )
239                  sense_dict[sense_data.pos_id.removesuffix("-1")].append(
240                      sense_data
241                  )
242
243      for pos_id_title, etymology_data in etymology_dict.items():
244          if pos_id_title == ("", ""):  # add to all sense dictionaries
245              for sense_data_list in sense_dict.values():
246                  for sense_data in sense_data_list:
247                      sense_data.etymology_texts = etymology_data.texts
248                      sense_data.categories.extend(etymology_data.categories)
249          else:
250              for pos_key in pos_id_title:
251                  if pos_key in sense_dict:
252                      for sense_data in sense_dict[pos_key]:
253                          sense_data.etymology_texts = etymology_data.texts
254                          sense_data.categories.extend(etymology_data.categories)
255
256

257  def extract_etymology_examples(
258      wxr: WiktextractContext,
259      level_node: LevelNode,
260      base_data: WordEntry,
261  ) -> None:
262      for list_node in level_node.find_child(NodeKind.LIST):
263          for list_item in list_node.find_child(NodeKind.LIST_ITEM):
264              extract_etymology_example_list_item(wxr, list_item, base_data, "")
265
266

267  def extract_etymology_example_list_item(
268      wxr: WiktextractContext,
269      list_item: WikiNode,
270      base_data: WordEntry,
271      note: str,
272  ) -> None:
273      from .gloss import process_exemple_template
274
275      time = ""
276      source = ""
277      example_nodes = []
278      has_exemple_template = False
279      for node in list_item.children:
280          if isinstance(node, TemplateNode):
281              if node.template_name in ["siècle", "circa", "date"]:
282                  time = clean_node(wxr, base_data, node).strip("() ")
283              elif node.template_name == "exemple":
284                  has_exemple_template = True
285                  example_data = process_exemple_template(
286                      wxr, node, base_data, time
287                  )
288                  if example_data.text != "":  [288 ↛ 279] line 288 didn't jump to line 279 because the condition on line 288 was always true
289                      example_data.note = note
290                      base_data.etymology_examples.append(example_data)
291              elif node.template_name == "source":  [291 ↛ 294] line 291 didn't jump to line 294 because the condition on line 291 was always true
292                  source = clean_node(wxr, base_data, node).strip("— ()")
293              else:
294                  example_nodes.append(node)
295          else:
296              example_nodes.append(node)
297
298      if not has_exemple_template:
299          if time == "" and list_item.contain_node(NodeKind.LIST):
300              note = clean_node(
301                  wxr,
302                  base_data,
303                  list(
304                      list_item.invert_find_child(
305                          NodeKind.LIST, include_empty_str=True
306                      )
307                  ),
308              )
309              for next_list in list_item.find_child(NodeKind.LIST):
310                  for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
311                      extract_etymology_example_list_item(
312                          wxr, next_list_item, base_data, note
313                      )
314          elif len(example_nodes) > 0:  [314 ↛ exit] line 314 didn't return from function 'extract_etymology_example_list_item' because the condition on line 314 was always true
315              example_str = clean_node(wxr, base_data, example_nodes)
316              if example_str != "":  [316 ↛ exit] line 316 didn't return from function 'extract_etymology_example_list_item' because the condition on line 316 was always true
317                  example_data = Example(
318                      text=example_str, time=time, ref=source, note=note
319                  )
320                  base_data.etymology_examples.append(example_data)
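The three top-level entry points in this module (extract_etymology, insert_etymology_data, extract_etymology_examples) are driven by the page-level extractor. Below is a minimal sketch, outside this file, of how a caller might wire them together when it reaches an « Étymologie » section. The wrapper name handle_etymology_section and the variable names etym_level_node and page_data are illustrative assumptions based on the signatures above, not the actual call sites, which live elsewhere in the French extractor.

def handle_etymology_section(wxr, etym_level_node, base_data, page_data):
    # Collect etymology texts and categories, keyed by (pos_id, pos_title).
    etymology_dict = extract_etymology(wxr, etym_level_node, base_data)
    # Copy them onto every WordEntry of the same language whose POS id or
    # title matches; the ("", "") key is applied to all entries.
    insert_etymology_data(base_data.lang_code, page_data, etymology_dict)
    # Dated attestation examples listed under the section end up in
    # base_data.etymology_examples.
    extract_etymology_examples(wxr, etym_level_node, base_data)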