Coverage for src/wiktextract/extractor/fr/etymology.py: 95%
126 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from collections import defaultdict
2from dataclasses import dataclass, field
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import Example, WordEntry
@dataclass
class EtymologyData:
    """Etymology texts and category names collected for one POS section."""

    # Cleaned etymology paragraphs, one string per list item.
    texts: list[str] = field(default_factory=list)
    # Category names gathered by `clean_node` while cleaning the texts.
    categories: list[str] = field(default_factory=list)


# Maps (POS section id, POS title) to its etymology data.  The key
# ("", "") is used for etymology text that belongs to the whole entry
# rather than to one specific POS section.
EtymologyDict = dict[tuple[str, str], EtymologyData]
def extract_etymology(
    wxr: WiktextractContext, level_node: LevelNode, base_data: WordEntry
) -> EtymologyDict:
    """Collect etymology data from an "Étymologie" section node.

    Returns a mapping from (POS id, POS title) to etymology data; the
    key ("", "") holds text that is not tied to a specific POS section.
    """
    etymology_dict: EtymologyDict = defaultdict(EtymologyData)
    first_sub_section = len(level_node.children)
    pos_id = ""
    pos_title = ""
    for child_index, child in level_node.find_child(
        NodeKind.LIST | LEVEL_KIND_FLAGS, True
    ):
        if child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                pos_id, pos_title = extract_etymology_list_item(
                    wxr, item, etymology_dict, pos_id, pos_title
                )
        elif child_index < first_sub_section:
            # Remember where the first sub-section starts, so the plain
            # text before it can serve as a fallback etymology below.
            first_sub_section = child_index

    if not etymology_dict:
        # No etymology lists were found: use the plain text that appears
        # before the first sub-section instead.
        collected: dict = {}
        fallback_text = clean_node(
            wxr, collected, level_node.children[:first_sub_section]
        )
        if fallback_text != "":
            etymology_dict[("", "")].texts.append(fallback_text)
            etymology_dict[(pos_id, pos_title)].categories.extend(
                collected.get("categories", [])
            )

    placeholder = etymology_dict.get(("", ""))
    if placeholder is not None and placeholder.texts == [" "]:
        # remove "ébauche-étym" template placeholder
        del etymology_dict[("", "")]

    return etymology_dict
def extract_etymology_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    etymology_dict: EtymologyDict,
    pos_id: str,
    pos_title: str,
) -> tuple[str, str]:
    """Process one etymology list item and recurse into its nested lists.

    Returns the (possibly updated) POS id and title so that following
    sibling items inherit the most recently seen POS anchor.
    """
    pos_data = find_pos_in_etymology_list(wxr, list_item)
    if pos_data is None:
        # No POS marker at the start of this item: attach its text to
        # the POS key inherited from the caller.
        collected: dict = {}
        item_text = clean_node(
            wxr,
            collected,
            list(
                list_item.invert_find_child(
                    NodeKind.LIST, include_empty_str=True
                )
            ),
        )
        if item_text != "":
            entry = etymology_dict[(pos_id, pos_title)]
            entry.texts.append(item_text)
            entry.categories.extend(collected.get("categories", []))
    else:
        pos_id, pos_title, item_text, cat_names = pos_data
        if item_text != "":
            entry = etymology_dict[(pos_id, pos_title)]
            entry.texts.append(item_text)
            entry.categories.extend(cat_names)

    # Nested list items share this item's POS context.
    for sub_list in list_item.find_child(NodeKind.LIST):
        for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
            extract_etymology_list_item(
                wxr, sub_item, etymology_dict, pos_id, pos_title
            )

    return pos_id, pos_title
def find_pos_in_etymology_list(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[str, str, str, list[str]] | None:
    """
    Return tuple of POS id, title, etymology text, categories if the passed
    list item node starts with italic POS node or POS template, otherwise
    return `None`.
    """

    def _anchor_id(link_node: WikiNode) -> str | None:
        # A POS link targets an in-page anchor like "[[#fr-nom|…]]";
        # return the anchor id without "#", or None for ordinary links.
        target = link_node.largs[0][0]
        if isinstance(target, str) and target.startswith("#"):
            return target.removeprefix("#")
        return None

    def _text_after(categories: dict, index: int) -> str:
        # Clean the text of every node after `index`, skipping nested
        # lists (those are processed recursively by the caller).
        return clean_node(
            wxr,
            categories,
            [
                n
                for n in list_item_node.children[index + 1 :]
                if not (isinstance(n, WikiNode) and n.kind == NodeKind.LIST)
            ],
        )

    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name == "ébauche-étym":
            return ("", "", " ", [])  # missing etymology

    categories: dict = {}

    for index, node in list_item_node.find_child(
        NodeKind.TEMPLATE | NodeKind.LINK | NodeKind.ITALIC, True
    ):
        if isinstance(node, TemplateNode) and node.template_name in (
            "lien-ancre-étym",
            "laé",
        ):
            # The anchor template expands to "''[[#id|title]]''"; pull
            # the POS id and title out of the expanded link.
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            for italic_node in expanded_template.find_child(NodeKind.ITALIC):
                for link_node in italic_node.find_child(NodeKind.LINK):
                    pos_id = _anchor_id(link_node)
                    if pos_id is not None:
                        return (
                            pos_id,
                            clean_node(wxr, None, link_node).strip(": "),
                            _text_after(categories, index),
                            categories.get("categories", []),
                        )
        elif node.kind == NodeKind.LINK:
            pos_id = _anchor_id(node)
            if pos_id is not None:
                return (
                    pos_id,
                    clean_node(wxr, None, node).strip(": "),
                    _text_after(categories, index),
                    categories.get("categories", []),
                )
        elif node.kind == NodeKind.ITALIC:
            for link_node in node.find_child(NodeKind.LINK):
                pos_id = _anchor_id(link_node)
                if pos_id is not None:
                    return (
                        pos_id,
                        clean_node(wxr, None, link_node).strip(": "),
                        # drop the closing ")" left over from the italic
                        # "(…)" wrapper around the POS link
                        _text_after(categories, index).lstrip(") "),
                        categories.get("categories", []),
                    )
            italic_text = clean_node(wxr, None, node)
            if (
                index <= 1  # first node is empty string
                and italic_text.startswith("(")
                and italic_text.endswith(")")
            ):
                # Italic "(Nom)" marker without an anchor link: use the
                # parenthesized text as the POS title, with no id.
                return (
                    "",
                    italic_text.strip("() "),
                    _text_after(categories, index),
                    categories.get("categories", []),
                )

    return None
def insert_etymology_data(
    lang_code: str, page_data: list[WordEntry], etymology_dict: EtymologyDict
) -> None:
    """
    Insert list of etymology data extracted from the level 3 node to each sense
    dictionary matches the language and POS.

    Each entry is registered in `sense_dict` under several alias keys
    (POS title, POS id, and "-1"-less variants for the first section), so
    the same entry can be reachable through more than one matching key.
    Updates are applied at most once per entry to avoid duplicating its
    categories.
    """
    sense_dict = defaultdict(list)  # group by pos title and id
    for sense_data in page_data:
        if sense_data.lang_code == lang_code:
            sense_dict[sense_data.pos_title].append(sense_data)
            sense_dict[sense_data.pos_id].append(sense_data)
            if sense_data.pos_id.endswith("-1"):
                # extra ids for the first title
                sense_dict[sense_data.pos_title.replace(" ", "_")].append(
                    sense_data
                )
                sense_dict[sense_data.pos_id.removesuffix("-1")].append(
                    sense_data
                )

    for pos_id_title, etymology_data in etymology_dict.items():
        updated_ids: set[int] = set()  # entries already updated for this data

        def _apply(sense_data: WordEntry) -> None:
            # Apply once per entry even when it is listed under several
            # alias keys that match this etymology.
            if id(sense_data) in updated_ids:
                return
            updated_ids.add(id(sense_data))
            # copy so entries don't share one mutable list
            sense_data.etymology_texts = list(etymology_data.texts)
            sense_data.categories.extend(etymology_data.categories)

        if pos_id_title == ("", ""):  # add to all sense dictionaries
            for sense_data_list in sense_dict.values():
                for sense_data in sense_data_list:
                    _apply(sense_data)
        else:
            for pos_key in pos_id_title:
                for sense_data in sense_dict.get(pos_key, []):
                    _apply(sense_data)
def extract_etymology_examples(
    wxr: WiktextractContext,
    level_node: LevelNode,
    base_data: WordEntry,
) -> None:
    """Process every top-level attestation list item in the section."""
    for child_list in level_node.find_child(NodeKind.LIST):
        for item in child_list.find_child(NodeKind.LIST_ITEM):
            # top-level items start with no inherited note
            extract_etymology_example_list_item(wxr, item, base_data, "")
def extract_etymology_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    base_data: WordEntry,
    note: str,
) -> None:
    """Extract one attestation list item into `base_data.etymology_examples`.

    The item may contain an "exemple" template (handled by the gloss
    module), or a date template followed by plain quotation text, or
    only introductory text with a nested list of examples (in which case
    the text becomes the `note` passed to the recursive calls).
    """
    # local import — presumably to avoid a circular import with .gloss;
    # TODO(review): confirm
    from .gloss import process_exemple_template

    time = ""
    source = ""
    example_nodes = []
    has_exemple_template = False
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name in ["siècle", "circa", "date"]:
                # date templates like "(Siècle ...)" mark when the word
                # is attested; strip the surrounding parentheses
                time = clean_node(wxr, base_data, node).strip("() ")
            elif node.template_name == "exemple":
                has_exemple_template = True
                example_data = process_exemple_template(
                    wxr, node, base_data, time
                )
                if example_data.text != "":
                    example_data.note = note
                    base_data.etymology_examples.append(example_data)
            elif node.template_name == "source":
                # "source" template renders as "— (Author, Title)"
                source = clean_node(wxr, base_data, node).strip("— ()")
            else:
                example_nodes.append(node)
        else:
            example_nodes.append(node)

    if not has_exemple_template:
        if time == "" and list_item.contain_node(NodeKind.LIST):
            # No date and a nested list: this item's own text is an
            # introductory note shared by the nested example items.
            note = clean_node(
                wxr,
                base_data,
                list(
                    list_item.invert_find_child(
                        NodeKind.LIST, include_empty_str=True
                    )
                ),
            )
            for next_list in list_item.find_child(NodeKind.LIST):
                for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                    extract_etymology_example_list_item(
                        wxr, next_list_item, base_data, note
                    )
        elif len(example_nodes) > 0:
            # Plain-text example: everything that wasn't a recognized
            # template becomes the example text.
            example_str = clean_node(wxr, base_data, example_nodes)
            if example_str != "":
                example_data = Example(
                    text=example_str, time=time, ref=source, note=note
                )
                base_data.etymology_examples.append(example_data)