Coverage for src/wiktextract/extractor/fr/etymology.py: 96%

166 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1from collections import defaultdict 

2from dataclasses import dataclass, field 

3 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from .models import AttestationData, Example, WordEntry 

15 

# Names of French Wiktionary templates that mark an attestation date;
# they are converted to AttestationData by extract_date_template().
ATTESTATION_TEMPLATES = {"siècle", "circa", "date"}

17 

18 

@dataclass
class EtymologyData:
    """Etymology information collected for one POS section."""

    # Cleaned etymology text lines.
    texts: list[str] = field(default_factory=list)
    # Category names gathered by clean_node() while expanding the text.
    categories: list[str] = field(default_factory=list)
    # Attestation dates extracted from date templates.
    attestations: list[AttestationData] = field(default_factory=list)

24 

25 

# Maps (POS section id, POS title) to the etymology data extracted for that
# part of speech; the key ("", "") holds data not tied to a specific POS.
EtymologyDict = dict[tuple[str, str], EtymologyData]

27 

28 

def extract_etymology(
    wxr: WiktextractContext, level_node: LevelNode, base_data: WordEntry
) -> EtymologyDict:
    """Extract the etymology section and group its texts by POS.

    Returns a dict keyed by ``(pos_id, pos_title)``; the key ``("", "")``
    holds etymology data that is not tied to a specific POS.
    """
    etymology_dict: EtymologyDict = defaultdict(EtymologyData)
    first_sub_level = len(level_node.children)
    pos_id = ""
    pos_title = ""
    for child_index, child in level_node.find_child(
        NodeKind.LIST | LEVEL_KIND_FLAGS, True
    ):
        if child.kind in LEVEL_KIND_FLAGS:
            # Remember where the first sub-section starts; the fallback
            # below only scans the nodes that come before it.
            if child_index < first_sub_level:
                first_sub_level = child_index
        elif child.kind == NodeKind.LIST:
            for etymology_item in child.find_child(NodeKind.LIST_ITEM):
                pos_id, pos_title = extract_etymology_list_item(
                    wxr, etymology_item, etymology_dict, pos_id, pos_title
                )

    if len(etymology_dict) == 0:
        # No etymology list found: use the plain section text instead.
        categories: dict = {}
        attestations = []
        text_nodes = []
        for child in level_node.children[:first_sub_level]:
            if (
                isinstance(child, TemplateNode)
                and child.template_name in ATTESTATION_TEMPLATES
            ):
                attestations.extend(
                    extract_date_template(wxr, base_data, child)
                )
            else:
                text_nodes.append(child)
        etymology_text = clean_node(wxr, categories, text_nodes)
        if len(etymology_text) > 0:
            lines = [
                stripped
                for stripped in map(str.strip, etymology_text.splitlines())
                if stripped
            ]
            etymology_dict[("", "")].texts.extend(lines)
            etymology_dict[("", "")].attestations = attestations
            etymology_dict[(pos_id, pos_title)].categories.extend(
                categories.get("categories", [])
            )

    placeholder = etymology_dict.get(("", ""))
    if placeholder is not None and placeholder.texts == [" "]:
        # remove "ébauche-étym" template placeholder
        del etymology_dict[("", "")]

    return etymology_dict

76 

77 

def _merge_etymology_data(target: EtymologyData, source: EtymologyData) -> None:
    """Append *source*'s texts, categories and attestations to *target*."""
    target.texts.extend(source.texts)
    target.categories.extend(source.categories)
    target.attestations.extend(source.attestations)


def extract_etymology_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    etymology_dict: EtymologyDict,
    pos_id: str,
    pos_title: str,
) -> tuple[str, str]:
    """Process one etymology list item and merge its data into
    ``etymology_dict``.

    If the item starts with a POS link/template, the POS context switches
    to it; otherwise the data is attributed to the current
    ``(pos_id, pos_title)``.  Returns the (possibly updated) POS pair so
    the caller can reuse it for following siblings.
    """
    pos_data = find_pos_in_etymology_list(wxr, list_item)
    if pos_data is not None:
        pos_id, pos_title, etymology_data = pos_data
    else:
        etymology_data = extract_etymology_list_item_nodes(
            wxr, list_item.children
        )
    if len(etymology_data.texts) > 0:
        _merge_etymology_data(
            etymology_dict[(pos_id, pos_title)], etymology_data
        )

    # Recurse into nested lists; they inherit the current POS context.
    for child_list in list_item.find_child(NodeKind.LIST):
        for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_etymology_list_item(
                wxr, child_list_item, etymology_dict, pos_id, pos_title
            )

    return pos_id, pos_title

120 

121 

def find_pos_in_etymology_list(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[str, str, EtymologyData] | None:
    """
    Return a tuple of POS id, POS title and `EtymologyData` if the passed
    list item node starts with an italic POS node or a POS template,
    otherwise return `None`.
    """
    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name == "ébauche-étym":
            # Missing etymology: return a single-space placeholder text
            # that extract_etymology() deletes afterwards.  (Fixed: the
            # old code passed the str " " where list[str] was declared;
            # it only worked because callers extend() a one-char string.)
            return "", "", EtymologyData(texts=[" "])

    for index, node in list_item_node.find_child(
        NodeKind.TEMPLATE | NodeKind.LINK | NodeKind.ITALIC, True
    ):
        if isinstance(node, TemplateNode) and node.template_name in (
            "lien-ancre-étym",
            "laé",
        ):
            # POS anchor template: expand it and look for an italic link
            # of the form "[[#pos_id|title]]".
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            for italic_node in expanded_template.find_child(NodeKind.ITALIC):
                for link_node in italic_node.find_child(NodeKind.LINK):
                    if isinstance(
                        link_node.largs[0][0], str
                    ) and link_node.largs[0][0].startswith("#"):
                        pos_id = link_node.largs[0][0].removeprefix("#")
                        return (
                            pos_id,
                            clean_node(wxr, None, link_node).strip(": "),
                            extract_etymology_list_item_nodes(
                                wxr, list_item_node.children[index + 1 :]
                            ),
                        )
        elif (
            node.kind == NodeKind.LINK
            and isinstance(node.largs[0][0], str)
            and node.largs[0][0].startswith("#")
        ):
            # Bare "[[#pos_id|title]]" anchor link.
            pos_id = node.largs[0][0].removeprefix("#")
            return (
                pos_id,
                clean_node(wxr, None, node).strip(": "),
                extract_etymology_list_item_nodes(
                    wxr, list_item_node.children[index + 1 :]
                ),
            )
        elif node.kind == NodeKind.ITALIC:
            for link_node in node.find_child(NodeKind.LINK):
                if isinstance(link_node.largs[0][0], str) and link_node.largs[
                    0
                ][0].startswith("#"):
                    pos_id = link_node.largs[0][0].removeprefix("#")
                    e_data = extract_etymology_list_item_nodes(
                        wxr, list_item_node.children[index + 1 :]
                    )
                    # Drop the closing parenthesis left over from the
                    # italic "(...)" wrapper.
                    e_data.texts = [t.lstrip(") ") for t in e_data.texts]
                    return (
                        pos_id,
                        clean_node(wxr, None, link_node).strip(": "),
                        e_data,
                    )
            italic_text = clean_node(wxr, None, node)
            if (
                index <= 1  # first node is empty string
                and italic_text.startswith("(")
                and italic_text.endswith(")")
            ):
                # Italic "(title)" without a link: POS title only, no id.
                return (
                    "",
                    italic_text.strip("() "),
                    extract_etymology_list_item_nodes(
                        wxr, list_item_node.children[index + 1 :]
                    ),
                )

198 

199 

def extract_etymology_list_item_nodes(
    wxr: WiktextractContext, nodes: list[WikiNode]
) -> EtymologyData:
    """Build an ``EtymologyData`` from the child nodes of a list item.

    The first date template found becomes the attestation data, nested
    lists are skipped, and the remaining nodes are cleaned into a single
    text string.
    """
    result = EtymologyData()
    categories: dict = {}
    kept_nodes = []
    date_seen = False
    for child in nodes:
        if (
            not date_seen
            and isinstance(child, TemplateNode)
            and child.template_name in ATTESTATION_TEMPLATES
        ):
            result.attestations = extract_date_template(wxr, categories, child)
            date_seen = True
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            continue  # nested lists are handled by the caller
        kept_nodes.append(child)
    text = clean_node(wxr, categories, kept_nodes)
    if text:
        result.texts.append(text)
    result.categories = categories.get("categories", [])
    return result

222 

223 

def _apply_etymology(
    sense_data: WordEntry,
    etymology_data: EtymologyData,
    added_sense: list[WordEntry],
) -> None:
    """Copy etymology fields onto *sense_data* at most once, recording it
    in *added_sense* to avoid double application.

    NOTE(review): ``etymology_data.texts`` is assigned by reference, so
    several entries may share the same list — confirm this is intended.
    """
    if sense_data in added_sense:
        return
    sense_data.etymology_texts = etymology_data.texts
    sense_data.categories.extend(etymology_data.categories)
    sense_data.attestations.extend(etymology_data.attestations)
    added_sense.append(sense_data)


def insert_etymology_data(
    lang_code: str, page_data: list[WordEntry], etymology_dict: EtymologyDict
) -> None:
    """
    Insert list of etymology data extracted from the level 3 node to each sense
    dictionary matches the language and POS.
    """
    sense_dict = defaultdict(list)  # group by pos title and id
    for sense_data in page_data:
        if sense_data.lang_code == lang_code:
            sense_dict[sense_data.pos_title].append(sense_data)
            sense_dict[sense_data.pos_id].append(sense_data)
            if sense_data.pos_id.endswith("-1"):
                # extra ids for the first title
                sense_dict[sense_data.pos_title.replace(" ", "_")].append(
                    sense_data
                )
                sense_dict[sense_data.pos_id.removesuffix("-1")].append(
                    sense_data
                )

    added_sense: list[WordEntry] = []
    for pos_id_title, etymology_data in etymology_dict.items():
        if pos_id_title == ("", ""):  # add to all sense dictionaries
            for sense_data_list in sense_dict.values():
                for sense_data in sense_data_list:
                    _apply_etymology(sense_data, etymology_data, added_sense)
        else:
            # Try both the POS id and the POS title as lookup keys.
            for pos_key in pos_id_title:
                if pos_key in sense_dict:
                    for sense_data in sense_dict[pos_key]:
                        _apply_etymology(
                            sense_data, etymology_data, added_sense
                        )

270 

271 

def extract_etymology_examples(
    wxr: WiktextractContext,
    level_node: LevelNode,
    base_data: WordEntry,
) -> None:
    """Walk every top-level list item of the attestation-examples section."""
    for example_list in level_node.find_child(NodeKind.LIST):
        for item in example_list.find_child(NodeKind.LIST_ITEM):
            extract_etymology_example_list_item(wxr, item, base_data, "")

280 

281 

def extract_etymology_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    base_data: WordEntry,
    note: str,
) -> None:
    """Extract one list item of the etymology-examples section.

    A list item may hold an "exemple" template, a date template followed
    by plain example text with an optional "source" template, or a note
    whose nested list contains the actual examples.  Extracted examples
    are appended to ``base_data.etymology_examples``.
    """
    # Local import — presumably avoids a circular import with .gloss;
    # TODO confirm.
    from .gloss import process_exemple_template

    attestations = []
    source = ""
    example_nodes = []
    has_exemple_template = False
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name in ATTESTATION_TEMPLATES:
                attestations = extract_date_template(wxr, base_data, node)
            elif node.template_name == "exemple":
                # The "exemple" template carries the whole example.
                has_exemple_template = True
                example_data = process_exemple_template(
                    wxr, node, base_data, attestations
                )
                if example_data.text != "":
                    example_data.note = note
                    base_data.etymology_examples.append(example_data)
            elif node.template_name == "source":
                source = clean_node(wxr, base_data, node).strip("— ()")
            else:
                example_nodes.append(node)
        else:
            example_nodes.append(node)

    if not has_exemple_template:
        if len(attestations) == 0 and list_item.contain_node(NodeKind.LIST):
            # No date found and a nested list exists: treat this item's
            # own text as a note and recurse into the nested examples.
            note = clean_node(
                wxr,
                base_data,
                list(
                    list_item.invert_find_child(
                        NodeKind.LIST, include_empty_str=True
                    )
                ),
            )
            for next_list in list_item.find_child(NodeKind.LIST):
                for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                    extract_etymology_example_list_item(
                        wxr, next_list_item, base_data, note
                    )
        elif len(example_nodes) > 0:
            # Plain wikitext example assembled from the remaining nodes.
            example_str = clean_node(wxr, base_data, example_nodes)
            if example_str != "":
                example_data = Example(
                    text=example_str,
                    ref=source,
                    note=note,
                    attestations=attestations,
                )
                base_data.etymology_examples.append(example_data)

339 

340 

def extract_date_template(
    wxr: WiktextractContext,
    word_entry: WordEntry | dict,
    t_node: TemplateNode,
) -> list[AttestationData]:
    """Convert a date template ("siècle", "circa", "date") to attestation
    data.

    *word_entry* is only passed through to ``clean_node`` as the category
    sink; callers in this module pass either a ``WordEntry`` or a plain
    dict, so the annotation accepts both.

    Returns an empty list when the template carries no usable date (blank
    or the "Date à préciser" placeholder).
    """
    date_list = []
    date = clean_node(wxr, word_entry, t_node).strip("()")
    if date not in ["", "Date à préciser"]:
        date_list.append(AttestationData(date=date))
    return date_list