Coverage for src/wiktextract/extractor/fr/etymology.py: 96%
159 statements
« prev ^ index » next — coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from collections import defaultdict
2from dataclasses import dataclass, field
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import AttestationData, Example, WordEntry
# French Wiktionary templates that mark an attestation date at the start
# of an etymology/example item ("siècle" = century, "circa", "date");
# expanded by extract_date_template() below.
ATTESTATION_TEMPLATES = {"siècle", "circa", "date"}
@dataclass
class EtymologyData:
    """Etymology content collected for one (POS id, POS title) pair."""

    # Cleaned etymology text paragraphs.
    texts: list[str] = field(default_factory=list)
    # Category names collected by clean_node() while cleaning the text.
    categories: list[str] = field(default_factory=list)
    # Attestation dates extracted from date templates.
    attestations: list[AttestationData] = field(default_factory=list)
# Maps (POS section id, POS title) -> EtymologyData; the ("", "") key holds
# etymology data that is not tied to a specific POS section.
EtymologyDict = dict[tuple[str, str], EtymologyData]
def extract_etymology(
    wxr: WiktextractContext, level_node: LevelNode, base_data: WordEntry
) -> EtymologyDict:
    """Extract etymology data from an etymology section level node.

    Returns a dict keyed by (POS id, POS title); the ("", "") key holds
    etymology text that applies to all POS sections.
    """
    etymology_dict: EtymologyDict = defaultdict(EtymologyData)
    # Index of the first child sub-level section; only nodes before it
    # belong to this etymology section.
    level_node_index = len(level_node.children)
    pos_id = ""
    pos_title = ""
    for node_index, node in level_node.find_child(
        NodeKind.LIST | LEVEL_KIND_FLAGS, True
    ):
        if node.kind in LEVEL_KIND_FLAGS and node_index < level_node_index:
            level_node_index = node_index
        elif node.kind == NodeKind.LIST:
            for etymology_item in node.find_child(NodeKind.LIST_ITEM):
                # carries the current POS forward across list items
                pos_id, pos_title = extract_etymology_list_item(
                    wxr, etymology_item, etymology_dict, pos_id, pos_title
                )

    if len(etymology_dict) == 0:
        # No usable list items: treat the plain text before the first
        # sub-level section as etymology for all POS sections.
        categories = {}
        etymology_text = clean_node(
            wxr, categories, level_node.children[:level_node_index]
        )
        if len(etymology_text) > 0:
            etymology_dict[("", "")].texts.append(etymology_text)
            # NOTE(review): text is stored under ("", "") but categories
            # under (pos_id, pos_title); the keys differ only if a list
            # item changed pos_id/pos_title without adding any text —
            # confirm this asymmetry is intended.
            etymology_dict[(pos_id, pos_title)].categories.extend(
                categories.get("categories", [])
            )

    if ("", "") in etymology_dict and etymology_dict.get(("", "")).texts == [
        " "
    ]:
        # remove "ébauche-étym" template placeholder
        del etymology_dict[("", "")]

    return etymology_dict
def extract_etymology_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    etymology_dict: EtymologyDict,
    pos_id: str,
    pos_title: str,
) -> tuple[str, str]:
    """Extract one etymology list item into `etymology_dict`.

    If the item starts with a POS link/template, the target POS switches to
    it; otherwise the item's text is stored under the current
    (pos_id, pos_title). Nested lists are processed recursively and inherit
    the (possibly updated) POS.

    Returns the (pos_id, pos_title) in effect after this item, so the
    caller can carry it to following siblings.
    """
    pos_data = find_pos_in_etymology_list(wxr, list_item)
    if pos_data is not None:
        # item starts with an italic POS node or POS template
        pos_id, pos_title, etymology_data = pos_data
    else:
        etymology_data = extract_etymology_list_item_nodes(
            wxr, list_item.children
        )
    # Both branches store the extracted data the same way; only do it when
    # some text was found, to avoid creating empty defaultdict entries.
    if len(etymology_data.texts) > 0:
        target = etymology_dict[(pos_id, pos_title)]
        target.texts.extend(etymology_data.texts)
        target.categories.extend(etymology_data.categories)
        target.attestations.extend(etymology_data.attestations)

    for child_list in list_item.find_child(NodeKind.LIST):
        for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_etymology_list_item(
                wxr, child_list_item, etymology_dict, pos_id, pos_title
            )

    return pos_id, pos_title
def find_pos_in_etymology_list(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[str, str, EtymologyData] | None:
    """
    Return a (POS id, POS title, EtymologyData) tuple if the passed list
    item node starts with an italic POS node or a POS link template,
    otherwise return `None`.
    """
    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name == "ébauche-étym":
            # Missing-etymology placeholder; the single-space text marks it
            # for removal later in extract_etymology(). The original code
            # passed the bare string " " where `texts: list[str]` is
            # declared, which only worked because iterating a one-character
            # string during a later `extend()` yields the same [" "].
            return "", "", EtymologyData([" "], [], [])

    for index, node in list_item_node.find_child(
        NodeKind.TEMPLATE | NodeKind.LINK | NodeKind.ITALIC, True
    ):
        if isinstance(node, TemplateNode) and node.template_name in (
            "lien-ancre-étym",
            "laé",
        ):
            # Expand the anchor-link template and look for an italic
            # "[[#pos-id|title]]" link inside it.
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            for italic_node in expanded_template.find_child(NodeKind.ITALIC):
                for link_node in italic_node.find_child(NodeKind.LINK):
                    if isinstance(
                        link_node.largs[0][0], str
                    ) and link_node.largs[0][0].startswith("#"):
                        pos_id = link_node.largs[0][0].removeprefix("#")
                        return (
                            pos_id,
                            clean_node(wxr, None, link_node).strip(": "),
                            extract_etymology_list_item_nodes(
                                wxr, list_item_node.children[index + 1 :]
                            ),
                        )
        elif (
            node.kind == NodeKind.LINK
            and isinstance(node.largs[0][0], str)
            and node.largs[0][0].startswith("#")
        ):
            # Bare section link "[[#pos-id|title]]".
            pos_id = node.largs[0][0].removeprefix("#")
            return (
                pos_id,
                clean_node(wxr, None, node).strip(": "),
                extract_etymology_list_item_nodes(
                    wxr, list_item_node.children[index + 1 :]
                ),
            )
        elif node.kind == NodeKind.ITALIC:
            for link_node in node.find_child(NodeKind.LINK):
                if isinstance(link_node.largs[0][0], str) and link_node.largs[
                    0
                ][0].startswith("#"):
                    pos_id = link_node.largs[0][0].removeprefix("#")
                    e_data = extract_etymology_list_item_nodes(
                        wxr, list_item_node.children[index + 1 :]
                    )
                    # drop the ") " left over from the italic "(...)" wrapper
                    e_data.texts = [t.lstrip(") ") for t in e_data.texts]
                    return (
                        pos_id,
                        clean_node(wxr, None, link_node).strip(": "),
                        e_data,
                    )
            italic_text = clean_node(wxr, None, node)
            if (
                index <= 1  # first node is empty string
                and italic_text.startswith("(")
                and italic_text.endswith(")")
            ):
                # Italic "(title)" with no link: POS title but no anchor id.
                return (
                    "",
                    italic_text.strip("() "),
                    extract_etymology_list_item_nodes(
                        wxr, list_item_node.children[index + 1 :]
                    ),
                )
def extract_etymology_list_item_nodes(
    wxr: WiktextractContext, nodes: list[WikiNode]
) -> EtymologyData:
    """Clean a list item's child nodes into a single EtymologyData.

    The first attestation-date template (see ATTESTATION_TEMPLATES) becomes
    attestation data instead of text; nested list nodes are skipped (the
    caller recurses into them separately). Everything else is cleaned into
    one text string.
    """
    result = EtymologyData()
    categories: dict = {}
    text_nodes = []
    seen_date_template = False
    for child in nodes:
        is_date_template = (
            isinstance(child, TemplateNode)
            and child.template_name in ATTESTATION_TEMPLATES
        )
        if is_date_template and not seen_date_template:
            # only the first date template counts as the attestation
            result.attestations = extract_date_template(wxr, categories, child)
            seen_date_template = True
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            continue  # child lists handled by the caller
        text_nodes.append(child)
    cleaned = clean_node(wxr, categories, text_nodes)
    if cleaned != "":
        result.texts.append(cleaned)
    result.categories = categories.get("categories", [])
    return result
def insert_etymology_data(
    lang_code: str, page_data: list[WordEntry], etymology_dict: EtymologyDict
) -> None:
    """
    Insert the etymology data extracted from the level 3 node into every
    word entry that matches the language and POS.
    """
    # Group entries under every key they can be matched by (title and id).
    entries_by_key = defaultdict(list)
    for entry in page_data:
        if entry.lang_code != lang_code:
            continue
        entries_by_key[entry.pos_title].append(entry)
        entries_by_key[entry.pos_id].append(entry)
        if entry.pos_id.endswith("-1"):
            # extra ids for the first title
            entries_by_key[entry.pos_title.replace(" ", "_")].append(entry)
            entries_by_key[entry.pos_id.removesuffix("-1")].append(entry)

    processed = []

    def apply_data(entry: WordEntry, data: EtymologyData) -> None:
        # Copy the etymology data into one entry, at most once per entry
        # (an entry appears under several keys in entries_by_key).
        if entry in processed:
            return
        entry.etymology_texts = data.texts
        entry.categories.extend(data.categories)
        entry.attestations.extend(data.attestations)
        processed.append(entry)

    for pos_id_title, etymology_data in etymology_dict.items():
        if pos_id_title == ("", ""):  # applies to all entries
            for entry_list in entries_by_key.values():
                for entry in entry_list:
                    apply_data(entry, etymology_data)
        else:
            for pos_key in pos_id_title:
                for entry in entries_by_key.get(pos_key, []):
                    apply_data(entry, etymology_data)
def extract_etymology_examples(
    wxr: WiktextractContext,
    level_node: LevelNode,
    base_data: WordEntry,
) -> None:
    """Extract etymology example list items from a section level node."""
    for example_list in level_node.find_child(NodeKind.LIST):
        for item in example_list.find_child(NodeKind.LIST_ITEM):
            # top-level items start with an empty note
            extract_etymology_example_list_item(wxr, item, base_data, "")
def extract_etymology_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    base_data: WordEntry,
    note: str,
) -> None:
    """Extract one list item of an etymology examples section.

    A list item is either an "exemple" template, plain example text with
    optional date/"source" templates, or (when it has no date but a child
    list) a note whose child list items are the actual dated examples,
    handled recursively with the note attached.
    """
    # function-level import — presumably avoids a circular import with
    # .gloss; confirm before moving to the top of the file
    from .gloss import process_exemple_template

    attestations = []
    source = ""
    example_nodes = []
    has_exemple_template = False
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name in ATTESTATION_TEMPLATES:
                # date template seen before the example text
                attestations = extract_date_template(wxr, base_data, node)
            elif node.template_name == "exemple":
                has_exemple_template = True
                example_data = process_exemple_template(
                    wxr, node, base_data, attestations
                )
                if example_data.text != "":
                    example_data.note = note
                    base_data.etymology_examples.append(example_data)
            elif node.template_name == "source":
                # "— (Author, Work)" reference; strip the dash and parens
                source = clean_node(wxr, base_data, node).strip("— ()")
            else:
                example_nodes.append(node)
        else:
            example_nodes.append(node)

    if not has_exemple_template:
        if len(attestations) == 0 and list_item.contain_node(NodeKind.LIST):
            # No date on this level: treat this item's own text as a note
            # and recurse into the child list for the dated examples.
            note = clean_node(
                wxr,
                base_data,
                list(
                    list_item.invert_find_child(
                        NodeKind.LIST, include_empty_str=True
                    )
                ),
            )
            for next_list in list_item.find_child(NodeKind.LIST):
                for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                    extract_etymology_example_list_item(
                        wxr, next_list_item, base_data, note
                    )
        elif len(example_nodes) > 0:
            example_str = clean_node(wxr, base_data, example_nodes)
            if example_str != "":
                example_data = Example(
                    text=example_str,
                    ref=source,
                    note=note,
                    attestations=attestations,
                )
                base_data.etymology_examples.append(example_data)
def extract_date_template(
    wxr: WiktextractContext,
    word_entry: WordEntry | dict,
    t_node: TemplateNode,
) -> list[AttestationData]:
    """Expand an attestation-date template into attestation data.

    `word_entry` is passed through to `clean_node()` to collect categories;
    callers pass either a WordEntry (extract_etymology_example_list_item)
    or a plain dict (extract_etymology_list_item_nodes), so the annotation
    accepts both.

    Returns an empty list when the template expands to nothing or to the
    "Date à préciser" placeholder.
    """
    date_list = []
    date = clean_node(wxr, word_entry, t_node).strip("()")
    # skip empty output and the "date to be specified" placeholder
    if date not in ("", "Date à préciser"):
        date_list.append(AttestationData(date=date))
    return date_list