Coverage for src/wiktextract/extractor/fr/etymology.py: 96%
100 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1from collections import defaultdict
2from dataclasses import dataclass, field
3from typing import Optional
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 LevelNode,
8 NodeKind,
9 TemplateNode,
10 WikiNode,
11)
13from ...page import clean_node
14from ...wxr_context import WiktextractContext
15from .models import WordEntry
@dataclass
class EtymologyData:
    """Etymology content collected for one POS section of a page."""

    # Cleaned etymology paragraphs (one string per list item).
    texts: list[str] = field(default_factory=list)
    # Category names gathered by `clean_node` while cleaning the texts.
    categories: list[str] = field(default_factory=list)


# Maps (POS section id, POS title) -> EtymologyData.
# The key ("", "") means the etymology applies to every POS of the language.
EtymologyDict = dict[tuple[str, str], EtymologyData]
def extract_etymology(
    wxr: WiktextractContext, level_node: LevelNode, base_data: WordEntry
) -> EtymologyDict:
    """Extract etymology texts from a French Wiktionary etymology section.

    Walks the direct children of `level_node`: sub-level nodes either mark
    the end of the etymology content or (for "Attestations historiques")
    hold historical example quotations; list nodes hold the etymology
    items, optionally prefixed by a POS link that scopes the item to one
    POS section.

    Returns an `EtymologyDict` keyed by (POS id, POS title); the key
    ("", "") holds text that applies to all POS sections.
    """
    etymology_dict: EtymologyDict = defaultdict(EtymologyData)
    # Default cut-off: everything before the first sub-level node is
    # etymology content used by the fallback below.
    level_node_index = len(level_node.children)
    pos_id = ""
    pos_title = ""
    for node_index, node in level_node.find_child(
        NodeKind.LIST | LEVEL_KIND_FLAGS, True
    ):
        if node.kind in LEVEL_KIND_FLAGS:
            level_node_index = node_index
            title_text = clean_node(wxr, None, node.largs)
            if title_text == "Attestations historiques":
                extract_etymology_examples(wxr, node, base_data)
        elif node.kind == NodeKind.LIST:
            for etymology_item in node.find_child(NodeKind.LIST_ITEM):
                etymology_data = find_pos_in_etymology_list(wxr, etymology_item)
                if etymology_data is not None:
                    # Item starts with a POS anchor; later plain items
                    # keep attaching to this POS until the next anchor.
                    pos_id, pos_title, etymology_text, categories = (
                        etymology_data
                    )
                    if len(etymology_text) > 0:
                        etymology_dict[(pos_id, pos_title)].texts.append(
                            etymology_text
                        )
                        etymology_dict[(pos_id, pos_title)].categories.extend(
                            categories
                        )
                else:
                    # Plain item: clean the whole list item, collecting
                    # categories into the dict passed to clean_node.
                    categories = {}
                    etymology_text = clean_node(
                        wxr, categories, etymology_item.children
                    )
                    if len(etymology_text) > 0:
                        etymology_dict[(pos_id, pos_title)].texts.append(
                            etymology_text
                        )
                        etymology_dict[(pos_id, pos_title)].categories.extend(
                            categories.get("categories", [])
                        )

    if len(etymology_dict) == 0:
        # No list was found: treat everything before the first sub-level
        # node as one etymology text that applies to every POS.
        categories = {}
        etymology_text = clean_node(
            wxr, categories, level_node.children[:level_node_index]
        )
        if len(etymology_text) > 0:
            etymology_dict[("", "")].texts.append(etymology_text)
            # Fix: categories were previously extended onto the stale
            # (pos_id, pos_title) key while the text went to ("", ""),
            # which could attach categories to a mismatched entry.
            etymology_dict[("", "")].categories.extend(
                categories.get("categories", [])
            )

    if ("", "") in etymology_dict and etymology_dict[("", "")].texts == [" "]:
        # remove "ébauche-étym" template placeholder
        del etymology_dict[("", "")]

    return etymology_dict
def find_pos_in_etymology_list(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> Optional[tuple[str, str, str, list[str]]]:
    """
    Return tuple of POS id, title, etymology text, categories if the passed
    list item node starts with italic POS node or POS template, otherwise
    return `None`.

    The POS anchor may appear in four forms, handled in order below:
    an "ébauche-étym" stub template, a "lien-ancre-étym"/"laé" template
    (expanded, then searched for an italicized "#"-anchor link), a bare
    "#"-anchor link, or an italic node (either wrapping an anchor link or
    a plain parenthesized POS title). In each case the etymology text is
    everything in the list item after the anchor node.
    """
    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name == "ébauche-étym":
            # Placeholder stub: single-space text is removed later by the
            # caller (extract_etymology).
            return ("", "", " ", [])  # missing etymology

    # Filled in by clean_node calls below; "categories" key holds the list.
    categories = {}

    # with_index=True so the etymology text can be sliced out of
    # list_item_node.children after the POS anchor node.
    for index, node in list_item_node.find_child(
        NodeKind.TEMPLATE | NodeKind.LINK | NodeKind.ITALIC, True
    ):
        if isinstance(node, TemplateNode) and node.template_name in (
            "lien-ancre-étym",
            "laé",
        ):
            # Expand the anchor template, then look for the italicized
            # "[[#pos-id|title]]" link it renders.
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            for italic_node in expanded_template.find_child(NodeKind.ITALIC):
                for link_node in italic_node.find_child(NodeKind.LINK):
                    if isinstance(
                        link_node.largs[0][0], str
                    ) and link_node.largs[0][0].startswith("#"):
                        pos_id = link_node.largs[0][0].removeprefix("#")
                        return (
                            pos_id,
                            clean_node(wxr, None, link_node).strip(": "),
                            clean_node(
                                wxr,
                                categories,
                                list_item_node.children[index + 1 :],
                            ),
                            categories.get("categories", []),
                        )
        elif (
            node.kind == NodeKind.LINK
            and isinstance(node.largs[0][0], str)
            and node.largs[0][0].startswith("#")
        ):
            # Bare same-page anchor link: "[[#pos-id|title]]".
            pos_id = node.largs[0][0].removeprefix("#")
            return (
                pos_id,
                clean_node(wxr, None, node).strip(": "),
                clean_node(
                    wxr, categories, list_item_node.children[index + 1 :]
                ),
                categories.get("categories", []),
            )
        elif node.kind == NodeKind.ITALIC:
            # Italic wrapping an anchor link, e.g. "''([[#pos-id|title]])''";
            # the text often continues after a closing parenthesis, hence
            # the lstrip.
            for link_node in node.find_child(NodeKind.LINK):
                if isinstance(link_node.largs[0][0], str) and link_node.largs[
                    0
                ][0].startswith("#"):
                    pos_id = link_node.largs[0][0].removeprefix("#")
                    return (
                        pos_id,
                        clean_node(wxr, None, link_node).strip(": "),
                        clean_node(
                            wxr,
                            categories,
                            list_item_node.children[index + 1 :],
                        ).lstrip(") "),
                        categories.get("categories", []),
                    )
            # Plain italic "(POS title)" with no anchor link: only accept
            # it at the start of the item, and no POS id is available.
            italic_text = clean_node(wxr, None, node)
            if (
                index <= 1  # first node is empty string
                and italic_text.startswith("(")
                and italic_text.endswith(")")
            ):
                return (
                    "",
                    italic_text.strip("() "),
                    clean_node(
                        wxr,
                        categories,
                        list_item_node.children[index + 1 :],
                    ),
                    categories.get("categories", []),
                )
    # No POS anchor found; implicit None — caller treats the item as plain text.
def insert_etymology_data(
    lang_code: str, page_data: list[WordEntry], etymology_dict: EtymologyDict
) -> None:
    """
    Insert list of etymology data extracted from the level 3 node to each sense
    dictionary matches the language and POS.

    Entries are matched either by POS title or POS id; the special key
    ("", "") applies the etymology to every entry of the language.
    """
    sense_dict = defaultdict(list)  # group by pos title and id
    for sense_data in page_data:
        if sense_data.lang_code == lang_code:
            sense_dict[sense_data.pos_title].append(sense_data)
            sense_dict[sense_data.pos_id].append(sense_data)
            if sense_data.pos_id.endswith("-1"):
                # extra ids for the first title
                sense_dict[sense_data.pos_title.replace(" ", "_")].append(
                    sense_data
                )
                sense_dict[sense_data.pos_id.removesuffix("-1")].append(
                    sense_data
                )

    for pos_id_title, etymology_data in etymology_dict.items():
        # Fix: each entry is registered under several keys above, so
        # iterating key-by-key used to call `categories.extend` multiple
        # times for the same entry, duplicating its categories.
        # Deduplicate the matched entries by object identity first
        # (model objects may not be hashable, so key on id()).
        if pos_id_title == ("", ""):  # add to all sense dictionaries
            matched = {
                id(sense_data): sense_data
                for sense_data_list in sense_dict.values()
                for sense_data in sense_data_list
            }
        else:
            matched = {
                id(sense_data): sense_data
                for pos_key in pos_id_title
                for sense_data in sense_dict.get(pos_key, [])
            }
        for sense_data in matched.values():
            sense_data.etymology_texts = etymology_data.texts
            sense_data.categories.extend(etymology_data.categories)
def extract_etymology_examples(
    wxr: WiktextractContext,
    level_node: LevelNode,
    base_data: WordEntry,
) -> None:
    """Collect "Attestations historiques" quotations into `base_data`.

    Each list item may carry a "siècle" template giving the period of the
    attestation, followed by an "exemple" template with the quotation;
    non-empty examples are appended to `base_data.etymology_examples`.
    """
    # Imported here to avoid a circular import with the gloss module.
    from .gloss import process_exemple_template

    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        period = ""
        for tmpl in item.find_child(NodeKind.TEMPLATE):
            name = tmpl.template_name
            if name == "siècle":
                # e.g. "(XIIe siècle)" -> "XIIe siècle"
                period = clean_node(wxr, None, tmpl).strip("() ")
            elif name == "exemple":
                example = process_exemple_template(wxr, tmpl, base_data, period)
                if example.text != "":
                    base_data.etymology_examples.append(example)