Coverage for src/wiktextract/extractor/fr/etymology.py: 96%
166 statements
« prev ^ index » next — report generated by coverage.py v7.10.7 at 2025-10-13 10:14 +0000
1from collections import defaultdict
2from dataclasses import dataclass, field
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import AttestationData, Example, WordEntry
# Template names that render a word's first-attestation date
ATTESTATION_TEMPLATES = {"siècle", "circa", "date"}
@dataclass
class EtymologyData:
    # Etymology paragraphs/list-item texts, one string per item
    texts: list[str] = field(default_factory=list)
    # Category names collected by `clean_node` while expanding templates
    categories: list[str] = field(default_factory=list)
    # Dates extracted from "siècle"/"circa"/"date" templates
    attestations: list[AttestationData] = field(default_factory=list)
# Maps (POS id, POS title) -> etymology data; the ("", "") key holds data
# that applies to every POS of the entry
EtymologyDict = dict[tuple[str, str], EtymologyData]
def extract_etymology(
    wxr: WiktextractContext, level_node: LevelNode, base_data: WordEntry
) -> EtymologyDict:
    """Extract the etymology section and group its text by POS.

    Returns a dict keyed by (POS id, POS title); the ("", "") key holds
    text that applies to every POS of the entry.
    """
    etymology_dict: EtymologyDict = defaultdict(EtymologyData)
    # index of the first child sub-level node; section preamble ends there
    first_sub_level = len(level_node.children)
    pos_id = ""
    pos_title = ""
    for index, child in level_node.find_child(
        NodeKind.LIST | LEVEL_KIND_FLAGS, True
    ):
        if child.kind in LEVEL_KIND_FLAGS:
            if index < first_sub_level:
                first_sub_level = index
        elif child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                pos_id, pos_title = extract_etymology_list_item(
                    wxr, item, etymology_dict, pos_id, pos_title
                )

    if len(etymology_dict) == 0:
        # no etymology list: the section body itself is the etymology text
        categories: dict = {}
        text_nodes = []
        attestations = []
        for child in level_node.children[:first_sub_level]:
            if (
                isinstance(child, TemplateNode)
                and child.template_name in ATTESTATION_TEMPLATES
            ):
                attestations.extend(
                    extract_date_template(wxr, base_data, child)
                )
            else:
                text_nodes.append(child)
        etymology_text = clean_node(wxr, categories, text_nodes)
        if len(etymology_text) > 0:
            shared = etymology_dict[("", "")]
            shared.texts.extend(
                line.strip()
                for line in etymology_text.splitlines()
                if line.strip()
            )
            shared.attestations = attestations
        etymology_dict[(pos_id, pos_title)].categories.extend(
            categories.get("categories", [])
        )

    shared_data = etymology_dict.get(("", ""))
    if shared_data is not None and shared_data.texts == [" "]:
        # remove "ébauche-étym" template placeholder
        del etymology_dict[("", "")]

    return etymology_dict
def extract_etymology_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    etymology_dict: EtymologyDict,
    pos_id: str,
    pos_title: str,
) -> tuple[str, str]:
    """Extract one etymology list item into `etymology_dict`.

    If the item starts with a POS link/template, the following text is
    stored under that POS; otherwise it is stored under the POS inherited
    from the previous sibling item (``pos_id``/``pos_title``).

    Returns the (possibly updated) ``(pos_id, pos_title)`` pair so that
    following sibling list items inherit it.
    """
    pos_data = find_pos_in_etymology_list(wxr, list_item)
    if pos_data is not None:
        pos_id, pos_title, etymology_data = pos_data
    else:
        etymology_data = extract_etymology_list_item_nodes(
            wxr, list_item.children
        )
    # both branches merge into the same POS bucket; do it once (was
    # duplicated verbatim in each branch)
    if len(etymology_data.texts) > 0:
        target = etymology_dict[(pos_id, pos_title)]
        target.texts.extend(etymology_data.texts)
        target.categories.extend(etymology_data.categories)
        target.attestations.extend(etymology_data.attestations)

    # nested list items attach to the POS of their parent item
    for child_list in list_item.find_child(NodeKind.LIST):
        for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_etymology_list_item(
                wxr, child_list_item, etymology_dict, pos_id, pos_title
            )

    return pos_id, pos_title
def find_pos_in_etymology_list(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[str, str, EtymologyData] | None:
    """
    Return a tuple of POS id, POS title and etymology data if the passed
    list item node starts with an italic POS node or a POS template,
    otherwise return `None`.
    """
    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name == "ébauche-étym":
            # missing-etymology placeholder; the single-space text is
            # detected and removed later in `extract_etymology()`.
            # Fix: pass the declared `list[str]` type instead of a bare
            # str (the old str value only worked because `list.extend`
            # iterates a string's characters).
            return "", "", EtymologyData(texts=[" "])

    for index, node in list_item_node.find_child(
        NodeKind.TEMPLATE | NodeKind.LINK | NodeKind.ITALIC, True
    ):
        if isinstance(node, TemplateNode) and node.template_name in (
            "lien-ancre-étym",
            "laé",
        ):
            # anchor template: expand it and look for "''[[#id|title]]''"
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            for italic_node in expanded_template.find_child(NodeKind.ITALIC):
                for link_node in italic_node.find_child(NodeKind.LINK):
                    if isinstance(
                        link_node.largs[0][0], str
                    ) and link_node.largs[0][0].startswith("#"):
                        pos_id = link_node.largs[0][0].removeprefix("#")
                        return (
                            pos_id,
                            clean_node(wxr, None, link_node).strip(": "),
                            extract_etymology_list_item_nodes(
                                wxr, list_item_node.children[index + 1 :]
                            ),
                        )
        elif (
            node.kind == NodeKind.LINK
            and isinstance(node.largs[0][0], str)
            and node.largs[0][0].startswith("#")
        ):
            # plain "[[#id|title]]" link
            pos_id = node.largs[0][0].removeprefix("#")
            return (
                pos_id,
                clean_node(wxr, None, node).strip(": "),
                extract_etymology_list_item_nodes(
                    wxr, list_item_node.children[index + 1 :]
                ),
            )
        elif node.kind == NodeKind.ITALIC:
            for link_node in node.find_child(NodeKind.LINK):
                if isinstance(link_node.largs[0][0], str) and link_node.largs[
                    0
                ][0].startswith("#"):
                    pos_id = link_node.largs[0][0].removeprefix("#")
                    e_data = extract_etymology_list_item_nodes(
                        wxr, list_item_node.children[index + 1 :]
                    )
                    # drop the ")" that closed the italic POS label
                    e_data.texts = [t.lstrip(") ") for t in e_data.texts]
                    return (
                        pos_id,
                        clean_node(wxr, None, link_node).strip(": "),
                        e_data,
                    )
            italic_text = clean_node(wxr, None, node)
            if (
                index <= 1  # first node is empty string
                and italic_text.startswith("(")
                and italic_text.endswith(")")
            ):
                # italic "(title)" label without a link; no POS id known
                return (
                    "",
                    italic_text.strip("() "),
                    extract_etymology_list_item_nodes(
                        wxr, list_item_node.children[index + 1 :]
                    ),
                )
def extract_etymology_list_item_nodes(
    wxr: WiktextractContext, nodes: list[WikiNode]
) -> EtymologyData:
    """Build an EtymologyData from the child nodes of one list item."""
    data = EtymologyData()
    categories: dict = {}
    text_nodes = []
    seen_attest_template = False
    for child in nodes:
        is_attest = (
            isinstance(child, TemplateNode)
            and child.template_name in ATTESTATION_TEMPLATES
        )
        if is_attest and not seen_attest_template:
            # only the first date template counts as the attestation
            data.attestations = extract_date_template(wxr, categories, child)
            seen_attest_template = True
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            continue  # nested lists are handled by the caller
        else:
            text_nodes.append(child)
    text = clean_node(wxr, categories, text_nodes)
    if text != "":
        data.texts.append(text)
    data.categories = categories.get("categories", [])
    return data
def insert_etymology_data(
    lang_code: str, page_data: list[WordEntry], etymology_dict: EtymologyDict
) -> None:
    """
    Attach the etymology data extracted from the level-3 node to every
    entry in `page_data` that matches the language and POS.
    """
    # index matching entries by both POS title and POS id
    sense_dict = defaultdict(list)
    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        sense_dict[entry.pos_title].append(entry)
        sense_dict[entry.pos_id].append(entry)
        if entry.pos_id.endswith("-1"):
            # extra ids for the first title
            sense_dict[entry.pos_title.replace(" ", "_")].append(entry)
            sense_dict[entry.pos_id.removesuffix("-1")].append(entry)

    applied = []  # entries already updated; each gets etymology once

    def _apply(entry, e_data) -> None:
        # copy one EtymologyData onto one word entry
        entry.etymology_texts = e_data.texts
        entry.categories.extend(e_data.categories)
        entry.attestations.extend(e_data.attestations)
        applied.append(entry)

    for pos_id_title, e_data in etymology_dict.items():
        if pos_id_title == ("", ""):  # applies to every entry
            for entries in sense_dict.values():
                for entry in entries:
                    if entry not in applied:
                        _apply(entry, e_data)
        else:
            for pos_key in pos_id_title:
                if pos_key in sense_dict:
                    for entry in sense_dict[pos_key]:
                        if entry not in applied:
                            _apply(entry, e_data)
def extract_etymology_examples(
    wxr: WiktextractContext,
    level_node: LevelNode,
    base_data: WordEntry,
) -> None:
    """Extract the attestation/example lists of an etymology section."""
    for wiki_list in level_node.find_child(NodeKind.LIST):
        for item in wiki_list.find_child(NodeKind.LIST_ITEM):
            extract_etymology_example_list_item(wxr, item, base_data, "")
def extract_etymology_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    base_data: WordEntry,
    note: str,
) -> None:
    """Extract one list item of an etymology attestation/example list.

    `note` is text inherited from a parent list item (e.g. a period
    label); it is copied onto every example created from this item.
    Examples are appended to `base_data.etymology_examples`.
    """
    from .gloss import process_exemple_template

    attestations: list[AttestationData] = []
    source = ""
    example_nodes = []
    has_exemple_template = False
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name in ATTESTATION_TEMPLATES:
                # date template: applies to the examples that follow it
                attestations = extract_date_template(wxr, base_data, node)
            elif node.template_name == "exemple":
                has_exemple_template = True
                example_data = process_exemple_template(
                    wxr, node, base_data, attestations
                )
                if example_data.text != "":
                    example_data.note = note
                    base_data.etymology_examples.append(example_data)
            elif node.template_name == "source":
                # "source" template holds the citation reference
                source = clean_node(wxr, base_data, node).strip("— ()")
            else:
                example_nodes.append(node)
        else:
            example_nodes.append(node)

    if not has_exemple_template:
        if len(attestations) == 0 and list_item.contain_node(NodeKind.LIST):
            # no date on this item but it has child items: use this item's
            # own text as a note and recurse into the child list items
            note = clean_node(
                wxr,
                base_data,
                list(
                    list_item.invert_find_child(
                        NodeKind.LIST, include_empty_str=True
                    )
                ),
            )
            for next_list in list_item.find_child(NodeKind.LIST):
                for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                    extract_etymology_example_list_item(
                        wxr, next_list_item, base_data, note
                    )
        elif len(example_nodes) > 0:
            # plain-text example without an "exemple" template
            example_str = clean_node(wxr, base_data, example_nodes)
            if example_str != "":
                example_data = Example(
                    text=example_str,
                    ref=source,
                    note=note,
                    attestations=attestations,
                )
                base_data.etymology_examples.append(example_data)
def extract_date_template(
    wxr: WiktextractContext,
    word_entry: WordEntry | dict,
    t_node: TemplateNode,
) -> list[AttestationData]:
    """Extract an attestation date template ("siècle", "circa", "date").

    `word_entry` is also called with a plain dict that only collects
    categories (see `extract_etymology_list_item_nodes`), hence the
    widened annotation.  Returns an empty list when the template renders
    empty or as the "Date à préciser" ("date to be specified")
    placeholder.
    """
    date_list = []
    date = clean_node(wxr, word_entry, t_node).strip("()")
    if date not in ["", "Date à préciser"]:
        date_list.append(AttestationData(date=date))
    return date_list