Coverage for src/wiktextract/extractor/fr/etymology.py: 96%

159 statements  

coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

  1  from collections import defaultdict
  2  from dataclasses import dataclass, field
  3
  4  from wikitextprocessor.parser import (
  5      LEVEL_KIND_FLAGS,
  6      LevelNode,
  7      NodeKind,
  8      TemplateNode,
  9      WikiNode,
 10  )
 11
 12  from ...page import clean_node
 13  from ...wxr_context import WiktextractContext
 14  from .models import AttestationData, Example, WordEntry
 15
 16  ATTESTATION_TEMPLATES = {"siècle", "circa", "date"}
 17
 18
 19  @dataclass
 20  class EtymologyData:
 21      texts: list[str] = field(default_factory=list)
 22      categories: list[str] = field(default_factory=list)
 23      attestations: list[AttestationData] = field(default_factory=list)
 24
 25
 26  EtymologyDict = dict[tuple[str, str], EtymologyData]
 27
 28
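An illustrative sketch, not part of the measured file: what an EtymologyDict built from these dataclasses might hold for a page whose etymology list names one POS anchor and also carries a page-wide entry. The anchor id "fr-nom-1", the title "Nom commun" and the texts are hypothetical example values.

from wiktextract.extractor.fr.etymology import EtymologyData, EtymologyDict

example: EtymologyDict = {
    # keyed by (pos_id, pos_title); ("", "") is the page-wide fallback key
    ("fr-nom-1", "Nom commun"): EtymologyData(texts=["Du latin ..."]),
    ("", ""): EtymologyData(texts=["Apparenté au mot ..."]),
}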

 29  def extract_etymology(
 30      wxr: WiktextractContext, level_node: LevelNode, base_data: WordEntry
 31  ) -> EtymologyDict:
 32      etymology_dict: EtymologyDict = defaultdict(EtymologyData)
 33      level_node_index = len(level_node.children)
 34      pos_id = ""
 35      pos_title = ""
 36      for node_index, node in level_node.find_child(
 37          NodeKind.LIST | LEVEL_KIND_FLAGS, True
 38      ):
 39          if node.kind in LEVEL_KIND_FLAGS and node_index < level_node_index:
 40              level_node_index = node_index
 41          elif node.kind == NodeKind.LIST:  [41 ↛ 36: line 41 didn't jump to line 36 because the condition on line 41 was always true]
 42              for etymology_item in node.find_child(NodeKind.LIST_ITEM):
 43                  pos_id, pos_title = extract_etymology_list_item(
 44                      wxr, etymology_item, etymology_dict, pos_id, pos_title
 45                  )
 46
 47      if len(etymology_dict) == 0:
 48          categories = {}
 49          etymology_text = clean_node(
 50              wxr, categories, level_node.children[:level_node_index]
 51          )
 52          if len(etymology_text) > 0:  [52 ↛ 58: line 52 didn't jump to line 58 because the condition on line 52 was always true]
 53              etymology_dict[("", "")].texts.append(etymology_text)
 54              etymology_dict[(pos_id, pos_title)].categories.extend(
 55                  categories.get("categories", [])
 56              )
 57
 58      if ("", "") in etymology_dict and etymology_dict.get(("", "")).texts == [
 59          " "
 60      ]:
 61          # remove "ébauche-étym" template placeholder
 62          del etymology_dict[("", "")]
 63
 64      return etymology_dict
 65
 66
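An illustrative aside, not part of the measured file: the placeholder check on lines 58-62 relies on how the {{ébauche-étym}} marker propagates from find_pos_in_etymology_list (line 121). That helper returns EtymologyData(" ", [], []), i.e. texts is the one-character string " "; extending a list with that string appends the single character, which is exactly the value deleted above.

texts: list[str] = []
texts.extend(" ")      # extending a list with a string iterates its characters
assert texts == [" "]  # the value matched and removed on lines 58-62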

 67  def extract_etymology_list_item(
 68      wxr: WiktextractContext,
 69      list_item: WikiNode,
 70      etymology_dict: EtymologyDict,
 71      pos_id: str,
 72      pos_title: str,
 73  ) -> tuple[str, str]:
 74      etymology_data = find_pos_in_etymology_list(wxr, list_item)
 75      if etymology_data is not None:
 76          pos_id, pos_title, etymology_data = etymology_data
 77          if len(etymology_data.texts) > 0:
 78              etymology_dict[(pos_id, pos_title)].texts.extend(
 79                  etymology_data.texts
 80              )
 81              etymology_dict[(pos_id, pos_title)].categories.extend(
 82                  etymology_data.categories
 83              )
 84              etymology_dict[(pos_id, pos_title)].attestations.extend(
 85                  etymology_data.attestations
 86              )
 87      else:
 88          etymology_data = extract_etymology_list_item_nodes(
 89              wxr, list_item.children
 90          )
 91          if len(etymology_data.texts) > 0:  [91 ↛ 102: line 91 didn't jump to line 102 because the condition on line 91 was always true]
 92              etymology_dict[(pos_id, pos_title)].texts.extend(
 93                  etymology_data.texts
 94              )
 95              etymology_dict[(pos_id, pos_title)].categories.extend(
 96                  etymology_data.categories
 97              )
 98              etymology_dict[(pos_id, pos_title)].attestations.extend(
 99                  etymology_data.attestations
100              )
101
102      for child_list in list_item.find_child(NodeKind.LIST):
103          for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
104              extract_etymology_list_item(
105                  wxr, child_list_item, etymology_dict, pos_id, pos_title
106              )
107
108      return pos_id, pos_title
109
110
111  def find_pos_in_etymology_list(
112      wxr: WiktextractContext, list_item_node: WikiNode
113  ) -> tuple[str, str, EtymologyData] | None:
114      """
115      Return a tuple of POS id, POS title and `EtymologyData` if the passed
116      list item node starts with an italic POS node or a POS template,
117      otherwise return `None`.
118      """
119      for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
120          if template_node.template_name == "ébauche-étym":
121              return "", "", EtymologyData(" ", [], [])  # missing etymology
122
123      for index, node in list_item_node.find_child(
124          NodeKind.TEMPLATE | NodeKind.LINK | NodeKind.ITALIC, True
125      ):
126          if isinstance(node, TemplateNode) and node.template_name in (
127              "lien-ancre-étym",
128              "laé",
129          ):
130              expanded_template = wxr.wtp.parse(
131                  wxr.wtp.node_to_wikitext(node), expand_all=True
132              )
133              for italic_node in expanded_template.find_child(NodeKind.ITALIC):  [133 ↛ 123: line 133 didn't jump to line 123 because the loop on line 133 didn't complete]
134                  for link_node in italic_node.find_child(NodeKind.LINK):  [134 ↛ 133: line 134 didn't jump to line 133 because the loop on line 134 didn't complete]
135                      if isinstance(  [135 ↛ 134: line 135 didn't jump to line 134 because the condition on line 135 was always true]
136                          link_node.largs[0][0], str
137                      ) and link_node.largs[0][0].startswith("#"):
138                          pos_id = link_node.largs[0][0].removeprefix("#")
139                          return (
140                              pos_id,
141                              clean_node(wxr, None, link_node).strip(": "),
142                              extract_etymology_list_item_nodes(
143                                  wxr, list_item_node.children[index + 1 :]
144                              ),
145                          )
146          elif (
147              node.kind == NodeKind.LINK
148              and isinstance(node.largs[0][0], str)
149              and node.largs[0][0].startswith("#")
150          ):
151              pos_id = node.largs[0][0].removeprefix("#")
152              return (
153                  pos_id,
154                  clean_node(wxr, None, node).strip(": "),
155                  extract_etymology_list_item_nodes(
156                      wxr, list_item_node.children[index + 1 :]
157                  ),
158              )
159          elif node.kind == NodeKind.ITALIC:
160              for link_node in node.find_child(NodeKind.LINK):
161                  if isinstance(link_node.largs[0][0], str) and link_node.largs[
162                      0
163                  ][0].startswith("#"):
164                      pos_id = link_node.largs[0][0].removeprefix("#")
165                      e_data = extract_etymology_list_item_nodes(
166                          wxr, list_item_node.children[index + 1 :]
167                      )
168                      e_data.texts = [t.lstrip(") ") for t in e_data.texts]
169                      return (
170                          pos_id,
171                          clean_node(wxr, None, link_node).strip(": "),
172                          e_data,
173                      )
174              italic_text = clean_node(wxr, None, node)
175              if (
176                  index <= 1  # first node is empty string
177                  and italic_text.startswith("(")
178                  and italic_text.endswith(")")
179              ):
180                  return (
181                      "",
182                      italic_text.strip("() "),
183                      extract_etymology_list_item_nodes(
184                          wxr, list_item_node.children[index + 1 :]
185                      ),
186                  )
187
188
189  def extract_etymology_list_item_nodes(
190      wxr: WiktextractContext, nodes: list[WikiNode]
191  ) -> EtymologyData:
192      used_nodes = []
193      cats = {}
194      e_data = EtymologyData()
195      is_first_attest_template = True
196      for node in nodes:
197          if (
198              is_first_attest_template
199              and isinstance(node, TemplateNode)
200              and node.template_name in ATTESTATION_TEMPLATES
201          ):
202              e_data.attestations = extract_date_template(wxr, cats, node)
203              is_first_attest_template = False
204          elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
205              used_nodes.append(node)
206      e_text = clean_node(wxr, cats, used_nodes)
207      if e_text != "":
208          e_data.texts.append(e_text)
209      e_data.categories = cats.get("categories", [])
210      return e_data
211
212
213  def insert_etymology_data(
214      lang_code: str, page_data: list[WordEntry], etymology_dict: EtymologyDict
215  ) -> None:
216      """
217      Insert the etymology data extracted from the level 3 node into each
218      sense dictionary that matches the language and POS.
219      """
220      sense_dict = defaultdict(list)  # group by pos title and id
221      for sense_data in page_data:
222          if sense_data.lang_code == lang_code:
223              sense_dict[sense_data.pos_title].append(sense_data)
224              sense_dict[sense_data.pos_id].append(sense_data)
225              if sense_data.pos_id.endswith("-1"):
226                  # extra ids for the first title
227                  sense_dict[sense_data.pos_title.replace(" ", "_")].append(
228                      sense_data
229                  )
230                  sense_dict[sense_data.pos_id.removesuffix("-1")].append(
231                      sense_data
232                  )
233
234      added_sense = []
235      for pos_id_title, etymology_data in etymology_dict.items():
236          if pos_id_title == ("", ""):  # add to all sense dictionaries
237              for sense_data_list in sense_dict.values():
238                  for sense_data in sense_data_list:
239                      if sense_data not in added_sense:
240                          sense_data.etymology_texts = etymology_data.texts
241                          sense_data.categories.extend(etymology_data.categories)
242                          sense_data.attestations.extend(
243                              etymology_data.attestations
244                          )
245                          added_sense.append(sense_data)
246          else:
247              for pos_key in pos_id_title:
248                  if pos_key in sense_dict:
249                      for sense_data in sense_dict[pos_key]:
250                          if sense_data not in added_sense:
251                              sense_data.etymology_texts = etymology_data.texts
252                              sense_data.categories.extend(
253                                  etymology_data.categories
254                              )
255                              sense_data.attestations.extend(
256                                  etymology_data.attestations
257                              )
258                              added_sense.append(sense_data)
259
260
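An illustrative sketch, not part of the measured file: the extra lookup keys registered above for the first POS section of a title. For a hypothetical entry with pos_title "Nom commun" and pos_id "fr-nom-1" (example values), the same WordEntry becomes reachable under four keys, so an etymology keyed by the bare anchor id or the underscored title still finds it.

pos_title, pos_id = "Nom commun", "fr-nom-1"  # hypothetical example values
keys = [pos_title, pos_id]
if pos_id.endswith("-1"):
    # mirrors the "-1" handling on lines 225-232
    keys += [pos_title.replace(" ", "_"), pos_id.removesuffix("-1")]
print(keys)  # ['Nom commun', 'fr-nom-1', 'Nom_commun', 'fr-nom']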

261  def extract_etymology_examples(
262      wxr: WiktextractContext,
263      level_node: LevelNode,
264      base_data: WordEntry,
265  ) -> None:
266      for list_node in level_node.find_child(NodeKind.LIST):
267          for list_item in list_node.find_child(NodeKind.LIST_ITEM):
268              extract_etymology_example_list_item(wxr, list_item, base_data, "")
269
270
271  def extract_etymology_example_list_item(
272      wxr: WiktextractContext,
273      list_item: WikiNode,
274      base_data: WordEntry,
275      note: str,
276  ) -> None:
277      from .gloss import process_exemple_template
278
279      attestations = []
280      source = ""
281      example_nodes = []
282      has_exemple_template = False
283      for node in list_item.children:
284          if isinstance(node, TemplateNode):
285              if node.template_name in ATTESTATION_TEMPLATES:
286                  attestations = extract_date_template(wxr, base_data, node)
287              elif node.template_name == "exemple":
288                  has_exemple_template = True
289                  example_data = process_exemple_template(
290                      wxr, node, base_data, attestations
291                  )
292                  if example_data.text != "":  [292 ↛ 283: line 292 didn't jump to line 283 because the condition on line 292 was always true]
293                      example_data.note = note
294                      base_data.etymology_examples.append(example_data)
295              elif node.template_name == "source":  [295 ↛ 298: line 295 didn't jump to line 298 because the condition on line 295 was always true]
296                  source = clean_node(wxr, base_data, node).strip("— ()")
297              else:
298                  example_nodes.append(node)
299          else:
300              example_nodes.append(node)
301
302      if not has_exemple_template:
303          if len(attestations) == 0 and list_item.contain_node(NodeKind.LIST):
304              note = clean_node(
305                  wxr,
306                  base_data,
307                  list(
308                      list_item.invert_find_child(
309                          NodeKind.LIST, include_empty_str=True
310                      )
311                  ),
312              )
313              for next_list in list_item.find_child(NodeKind.LIST):
314                  for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
315                      extract_etymology_example_list_item(
316                          wxr, next_list_item, base_data, note
317                      )
318          elif len(example_nodes) > 0:  [318 ↛ exit: line 318 didn't return from function 'extract_etymology_example_list_item' because the condition on line 318 was always true]
319              example_str = clean_node(wxr, base_data, example_nodes)
320              if example_str != "":  [320 ↛ exit: line 320 didn't return from function 'extract_etymology_example_list_item' because the condition on line 320 was always true]
321                  example_data = Example(
322                      text=example_str,
323                      ref=source,
324                      note=note,
325                      attestations=attestations,
326                  )
327                  base_data.etymology_examples.append(example_data)
328
329
330  def extract_date_template(
331      wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
332  ) -> list[AttestationData]:
333      date_list = []
334      date = clean_node(wxr, word_entry, t_node).strip("()")
335      if date not in ["", "Date à préciser"]:  [335 ↛ 337: line 335 didn't jump to line 337 because the condition on line 335 was always true]
336          date_list.append(AttestationData(date=date))
337      return date_list
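An illustrative sketch, not part of the measured file: the date cleanup performed by extract_date_template on a rendered attestation template. The rendered strings below are hypothetical; the real text comes from clean_node on a {{siècle}}, {{circa}} or {{date}} node. Surrounding parentheses are stripped, and the "Date à préciser" placeholder (or an empty rendering) produces no attestation.

for rendered in ["(Vers 1150)", "(Date à préciser)", ""]:
    date = rendered.strip("()")
    if date not in ["", "Date à préciser"]:
        print("attestation date:", date)  # -> "Vers 1150"
    else:
        print("no attestation for:", repr(rendered))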