Coverage for src/wiktextract/extractor/fr/etymology.py: 95%
120 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from collections import defaultdict
2from dataclasses import dataclass, field
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .models import Example, WordEntry
@dataclass
class EtymologyData:
    """Etymology content collected for one part-of-speech section.

    `texts` holds cleaned etymology paragraphs; `categories` holds the
    category names gathered while cleaning those paragraphs.
    """

    texts: list[str] = field(default_factory=list)
    categories: list[str] = field(default_factory=list)


# Maps a (POS id, POS title) pair to its etymology data; the special key
# ("", "") is used for etymology that applies to every POS of the entry.
EtymologyDict = dict[tuple[str, str], EtymologyData]
def extract_etymology(
    wxr: WiktextractContext, level_node: LevelNode, base_data: WordEntry
) -> EtymologyDict:
    """Extract the etymology section, grouping texts by the POS they cover.

    Returns a dict keyed by ``(pos_id, pos_title)``; the key ``("", "")``
    holds etymology text that applies to all POS sections of the entry.
    """
    etymology_dict: EtymologyDict = defaultdict(EtymologyData)
    # Index of the first child sub-level node; etymology content only
    # appears before it.
    level_node_index = len(level_node.children)
    pos_id = ""
    pos_title = ""
    for node_index, node in level_node.find_child(
        NodeKind.LIST | LEVEL_KIND_FLAGS, True
    ):
        if node.kind in LEVEL_KIND_FLAGS and node_index < level_node_index:
            level_node_index = node_index
        elif node.kind == NodeKind.LIST:
            for etymology_item in node.find_child(NodeKind.LIST_ITEM):
                etymology_data = find_pos_in_etymology_list(wxr, etymology_item)
                if etymology_data is not None:
                    pos_id, pos_title, etymology_text, categories = (
                        etymology_data
                    )
                    if len(etymology_text) > 0:
                        etymology_dict[(pos_id, pos_title)].texts.append(
                            etymology_text
                        )
                        etymology_dict[(pos_id, pos_title)].categories.extend(
                            categories
                        )
                else:
                    # List item without a leading POS marker: attach its text
                    # to the most recently seen POS (or ("", "") if none yet).
                    categories = {}
                    etymology_text = clean_node(
                        wxr, categories, etymology_item.children
                    )
                    if len(etymology_text) > 0:
                        etymology_dict[(pos_id, pos_title)].texts.append(
                            etymology_text
                        )
                        etymology_dict[(pos_id, pos_title)].categories.extend(
                            categories.get("categories", [])
                        )

    if len(etymology_dict) == 0:
        # No etymology list: use the plain text above the first sub-level
        # node as etymology for all POS sections.
        categories = {}
        etymology_text = clean_node(
            wxr, categories, level_node.children[:level_node_index]
        )
        if len(etymology_text) > 0:
            etymology_dict[("", "")].texts.append(etymology_text)
            # Fix: categories must land on the same ("", "") entry the text
            # went to. The previous code extended (pos_id, pos_title), which
            # could create a spurious texts-less entry when a POS marker had
            # been seen earlier in the loop.
            etymology_dict[("", "")].categories.extend(
                categories.get("categories", [])
            )

    if ("", "") in etymology_dict and etymology_dict[("", "")].texts == [" "]:
        # remove "ébauche-étym" template placeholder
        del etymology_dict[("", "")]

    return etymology_dict
def find_pos_in_etymology_list(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[str, str, str, list[str]] | None:
    """
    Return tuple of POS id, title, etymology text, categories if the passed
    list item node starts with italic POS node or POS template, otherwise
    return `None`.
    """
    # Placeholder template marking a missing etymology; the single-space
    # text is detected and removed by the caller.
    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name == "ébauche-étym":
            return ("", "", " ", [])  # missing etymology

    # Filled by clean_node() below under the "categories" key.
    categories: dict[str, list[str]] = {}

    for index, node in list_item_node.find_child(
        NodeKind.TEMPLATE | NodeKind.LINK | NodeKind.ITALIC, True
    ):
        if isinstance(node, TemplateNode) and node.template_name in (
            "lien-ancre-étym",
            "laé",
        ):
            # POS anchor template: expand it and look for an italicized
            # anchor link ("#pos-id") that identifies the POS section.
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            for italic_node in expanded_template.find_child(NodeKind.ITALIC):
                for link_node in italic_node.find_child(NodeKind.LINK):
                    if isinstance(
                        link_node.largs[0][0], str
                    ) and link_node.largs[0][0].startswith("#"):
                        pos_id = link_node.largs[0][0].removeprefix("#")
                        # Everything after the marker node is the etymology
                        # text itself.
                        return (
                            pos_id,
                            clean_node(wxr, None, link_node).strip(": "),
                            clean_node(
                                wxr,
                                categories,
                                list_item_node.children[index + 1 :],
                            ),
                            categories.get("categories", []),
                        )
        elif (
            node.kind == NodeKind.LINK
            and isinstance(node.largs[0][0], str)
            and node.largs[0][0].startswith("#")
        ):
            # Bare anchor link (not wrapped in a template or italics).
            pos_id = node.largs[0][0].removeprefix("#")
            return (
                pos_id,
                clean_node(wxr, None, node).strip(": "),
                clean_node(
                    wxr, categories, list_item_node.children[index + 1 :]
                ),
                categories.get("categories", []),
            )
        elif node.kind == NodeKind.ITALIC:
            # Italic node may wrap an anchor link, e.g. "''([[#fr-nom|Nom]])''";
            # the trailing text then starts with the closing parenthesis,
            # hence the lstrip(") ").
            for link_node in node.find_child(NodeKind.LINK):
                if isinstance(link_node.largs[0][0], str) and link_node.largs[
                    0
                ][0].startswith("#"):
                    pos_id = link_node.largs[0][0].removeprefix("#")
                    return (
                        pos_id,
                        clean_node(wxr, None, link_node).strip(": "),
                        clean_node(
                            wxr,
                            categories,
                            list_item_node.children[index + 1 :],
                        ).lstrip(") "),
                        categories.get("categories", []),
                    )
            # Plain italic "(Title)" with no anchor: POS title only, no id.
            italic_text = clean_node(wxr, None, node)
            if (
                index <= 1  # first node is empty string
                and italic_text.startswith("(")
                and italic_text.endswith(")")
            ):
                return (
                    "",
                    italic_text.strip("() "),
                    clean_node(
                        wxr,
                        categories,
                        list_item_node.children[index + 1 :],
                    ),
                    categories.get("categories", []),
                )
    # Implicit None: no POS marker found at the start of the list item.
def insert_etymology_data(
    lang_code: str, page_data: list[WordEntry], etymology_dict: EtymologyDict
) -> None:
    """
    Insert list of etymology data extracted from the level 3 node to each sense
    dictionary matches the language and POS.
    """
    sense_dict = defaultdict(list)  # group by pos title and id
    for sense_data in page_data:
        if sense_data.lang_code == lang_code:
            sense_dict[sense_data.pos_title].append(sense_data)
            sense_dict[sense_data.pos_id].append(sense_data)
            if sense_data.pos_id.endswith("-1"):
                # extra ids for the first title
                sense_dict[sense_data.pos_title.replace(" ", "_")].append(
                    sense_data
                )
                sense_dict[sense_data.pos_id.removesuffix("-1")].append(
                    sense_data
                )

    for pos_id_title, etymology_data in etymology_dict.items():
        # Collect target entries first, then dedupe by identity: each
        # WordEntry is registered under several sense_dict keys, and
        # extending its categories once per key duplicated them.
        if pos_id_title == ("", ""):  # add to all sense dictionaries
            targets = [
                sense_data
                for sense_data_list in sense_dict.values()
                for sense_data in sense_data_list
            ]
        else:
            targets = [
                sense_data
                for pos_key in pos_id_title
                if pos_key in sense_dict
                for sense_data in sense_dict[pos_key]
            ]
        seen_ids: set[int] = set()
        for sense_data in targets:
            if id(sense_data) in seen_ids:
                continue
            seen_ids.add(id(sense_data))
            # Copy the list so entries don't share one mutable object.
            sense_data.etymology_texts = list(etymology_data.texts)
            sense_data.categories.extend(etymology_data.categories)
def extract_etymology_examples(
    wxr: WiktextractContext,
    level_node: LevelNode,
    base_data: WordEntry,
) -> None:
    """Process every list item under the attestation level node.

    Each top-level list item is delegated to
    `extract_etymology_example_list_item` with an empty inherited note.
    """
    for example_list in level_node.find_child(NodeKind.LIST):
        for item in example_list.find_child(NodeKind.LIST_ITEM):
            extract_etymology_example_list_item(wxr, item, base_data, "")
def extract_etymology_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    base_data: WordEntry,
    note: str,
) -> None:
    """Extract one attestation-list item into `base_data.etymology_examples`.

    `note` is text inherited from a parent list item (e.g. a heading line
    above a nested list); it is attached to every example found here.
    Recurses into nested lists when the item itself carries no date.
    """
    # Local import avoids a circular import with the gloss module.
    from .gloss import process_exemple_template

    time = ""
    source = ""
    example_nodes = []
    has_exemple_template = False
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name in ["siècle", "circa", "date"]:
                # Date templates seen before an example set its `time`.
                time = clean_node(wxr, base_data, node).strip("() ")
            elif node.template_name == "exemple":
                # Structured example template; handled by the gloss module.
                has_exemple_template = True
                example_data = process_exemple_template(
                    wxr, node, base_data, time
                )
                if example_data.text != "":
                    example_data.note = note
                    base_data.etymology_examples.append(example_data)
            elif node.template_name == "source":
                source = clean_node(wxr, base_data, node).strip("— ()")
            else:
                # Unknown templates become part of the plain example text.
                example_nodes.append(node)
        else:
            example_nodes.append(node)

    if not has_exemple_template:
        if time == "" and list_item.contain_node(NodeKind.LIST):
            # No date and a nested list: treat this item's own text as a
            # note and recurse into the nested list items.
            note = clean_node(
                wxr, base_data, list(list_item.invert_find_child(NodeKind.LIST))
            )
            for next_list in list_item.find_child(NodeKind.LIST):
                for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                    extract_etymology_example_list_item(
                        wxr, next_list_item, base_data, note
                    )
        elif len(example_nodes) > 0:
            # Plain-text example built from the non-template children.
            example_str = clean_node(wxr, base_data, example_nodes)
            if example_str != "":
                example_data = Example(
                    text=example_str, time=time, ref=source, note=note
                )
                base_data.etymology_examples.append(example_data)