Coverage for src/wiktextract/extractor/fr/etymology.py: 96%
100 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1from collections import defaultdict
2from dataclasses import dataclass, field
3from typing import Optional
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 LevelNode,
8 NodeKind,
9 TemplateNode,
10 WikiNode,
11)
13from ...page import clean_node
14from ...wxr_context import WiktextractContext
15from .models import WordEntry
@dataclass
class EtymologyData:
    """Etymology content collected for one POS section of a page."""

    # Cleaned etymology paragraphs (one string per list item).
    texts: list[str] = field(default_factory=list)
    # Category names gathered by `clean_node` while cleaning the texts.
    categories: list[str] = field(default_factory=list)


# Maps (POS section id, POS title) -> EtymologyData.
# The key ("", "") means the etymology applies to every POS of the language.
EtymologyDict = dict[tuple[str, str], EtymologyData]
def extract_etymology(
    wxr: WiktextractContext, level_node: LevelNode, base_data: WordEntry
) -> EtymologyDict:
    """Extract etymology texts from a French Wiktionary etymology section.

    Walks the direct children of `level_node`: sub-level nodes either mark
    the end of the etymology content or (for "Attestations historiques")
    hold historical example quotations; list nodes hold the etymology
    items, optionally prefixed by a POS link that scopes the item to one
    POS section.

    Returns an `EtymologyDict` keyed by (POS id, POS title); the key
    ("", "") holds text that applies to all POS sections.
    """
    etymology_dict: EtymologyDict = defaultdict(EtymologyData)
    # Default cut-off: everything before the first sub-level node is
    # etymology content used by the fallback below.
    level_node_index = len(level_node.children)
    pos_id = ""
    pos_title = ""
    for node_index, node in level_node.find_child(
        NodeKind.LIST | LEVEL_KIND_FLAGS, True
    ):
        if node.kind in LEVEL_KIND_FLAGS:
            level_node_index = node_index
            title_text = clean_node(wxr, None, node.largs)
            if title_text == "Attestations historiques":
                extract_etymology_examples(wxr, node, base_data)
        elif node.kind == NodeKind.LIST:
            for etymology_item in node.find_child(NodeKind.LIST_ITEM):
                etymology_data = find_pos_in_etymology_list(wxr, etymology_item)
                if etymology_data is not None:
                    # Item starts with a POS anchor; later plain items
                    # keep attaching to this POS until the next anchor.
                    pos_id, pos_title, etymology_text, categories = (
                        etymology_data
                    )
                    if len(etymology_text) > 0:
                        etymology_dict[(pos_id, pos_title)].texts.append(
                            etymology_text
                        )
                        etymology_dict[(pos_id, pos_title)].categories.extend(
                            categories
                        )
                else:
                    # Plain item: clean the whole list item, collecting
                    # categories into the dict passed to clean_node.
                    categories = {}
                    etymology_text = clean_node(
                        wxr, categories, etymology_item.children
                    )
                    if len(etymology_text) > 0:
                        etymology_dict[(pos_id, pos_title)].texts.append(
                            etymology_text
                        )
                        etymology_dict[(pos_id, pos_title)].categories.extend(
                            categories.get("categories", [])
                        )

    if len(etymology_dict) == 0:
        # No list was found: treat everything before the first sub-level
        # node as one etymology text that applies to every POS.
        categories = {}
        etymology_text = clean_node(
            wxr, categories, level_node.children[:level_node_index]
        )
        if len(etymology_text) > 0:
            etymology_dict[("", "")].texts.append(etymology_text)
            # Fix: categories were previously extended onto the stale
            # (pos_id, pos_title) key while the text went to ("", ""),
            # which could attach categories to a mismatched entry.
            etymology_dict[("", "")].categories.extend(
                categories.get("categories", [])
            )

    if ("", "") in etymology_dict and etymology_dict[("", "")].texts == [" "]:
        # remove "ébauche-étym" template placeholder
        del etymology_dict[("", "")]

    return etymology_dict
def find_pos_in_etymology_list(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> Optional[tuple[str, str, str, list[str]]]:
    """
    Return tuple of POS id, title, etymology text, categories if the passed
    list item node starts with italic POS node or POS template, otherwise
    return `None`.

    The POS anchor may appear in four forms, handled in order below:
    an "ébauche-étym" stub template, a "lien-ancre-étym"/"laé" template
    (expanded, then searched for an italicized "#"-anchor link), a bare
    "#"-anchor link, or an italic node (either wrapping an anchor link or
    a plain parenthesized POS title). In each case the etymology text is
    everything in the list item after the anchor node.
    """
    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name == "ébauche-étym":
            # Placeholder stub: single-space text is removed later by the
            # caller (extract_etymology).
            return ("", "", " ", [])  # missing etymology

    # Filled in by clean_node calls below; "categories" key holds the list.
    categories = {}

    # with_index=True so the etymology text can be sliced out of
    # list_item_node.children after the POS anchor node.
    for index, node in list_item_node.find_child(
        NodeKind.TEMPLATE | NodeKind.LINK | NodeKind.ITALIC, True
    ):
        if isinstance(node, TemplateNode) and node.template_name in (
            "lien-ancre-étym",
            "laé",
        ):
            # Expand the anchor template, then look for the italicized
            # "[[#pos-id|title]]" link it renders.
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            for italic_node in expanded_template.find_child(NodeKind.ITALIC):
                for link_node in italic_node.find_child(NodeKind.LINK):
                    if isinstance(
                        link_node.largs[0][0], str
                    ) and link_node.largs[0][0].startswith("#"):
                        pos_id = link_node.largs[0][0].removeprefix("#")
                        return (
                            pos_id,
                            clean_node(wxr, None, link_node).strip(": "),
                            clean_node(
                                wxr,
                                categories,
                                list_item_node.children[index + 1 :],
                            ),
                            categories.get("categories", []),
                        )
        elif (
            node.kind == NodeKind.LINK
            and isinstance(node.largs[0][0], str)
            and node.largs[0][0].startswith("#")
        ):
            # Bare same-page anchor link: "[[#pos-id|title]]".
            pos_id = node.largs[0][0].removeprefix("#")
            return (
                pos_id,
                clean_node(wxr, None, node).strip(": "),
                clean_node(
                    wxr, categories, list_item_node.children[index + 1 :]
                ),
                categories.get("categories", []),
            )
        elif node.kind == NodeKind.ITALIC:
            # Italic wrapping an anchor link, e.g. "''([[#pos-id|title]])''";
            # the text often continues after a closing parenthesis, hence
            # the lstrip.
            for link_node in node.find_child(NodeKind.LINK):
                if isinstance(link_node.largs[0][0], str) and link_node.largs[
                    0
                ][0].startswith("#"):
                    pos_id = link_node.largs[0][0].removeprefix("#")
                    return (
                        pos_id,
                        clean_node(wxr, None, link_node).strip(": "),
                        clean_node(
                            wxr,
                            categories,
                            list_item_node.children[index + 1 :],
                        ).lstrip(") "),
                        categories.get("categories", []),
                    )
            # Plain italic "(POS title)" with no anchor link: only accept
            # it at the start of the item, and no POS id is available.
            italic_text = clean_node(wxr, None, node)
            if (
                index <= 1  # first node is empty string
                and italic_text.startswith("(")
                and italic_text.endswith(")")
            ):
                return (
                    "",
                    italic_text.strip("() "),
                    clean_node(
                        wxr,
                        categories,
                        list_item_node.children[index + 1 :],
                    ),
                    categories.get("categories", []),
                )
    # No POS anchor found; implicit None — caller treats the item as plain text.
def insert_etymology_data(
    lang_code: str, page_data: list[WordEntry], etymology_dict: EtymologyDict
) -> None:
    """
    Insert list of etymology data extracted from the level 3 node to each sense
    dictionary matches the language and POS.

    Entries are matched either by POS title or POS id; the special key
    ("", "") applies the etymology to every entry of the language.
    """
    sense_dict = defaultdict(list)  # group by pos title and id
    for sense_data in page_data:
        if sense_data.lang_code == lang_code:
            sense_dict[sense_data.pos_title].append(sense_data)
            sense_dict[sense_data.pos_id].append(sense_data)
            if sense_data.pos_id.endswith("-1"):
                # extra ids for the first title
                sense_dict[sense_data.pos_title.replace(" ", "_")].append(
                    sense_data
                )
                sense_dict[sense_data.pos_id.removesuffix("-1")].append(
                    sense_data
                )

    for pos_id_title, etymology_data in etymology_dict.items():
        # Fix: each entry is registered under several keys above, so
        # iterating key-by-key used to call `categories.extend` multiple
        # times for the same entry, duplicating its categories.
        # Deduplicate the matched entries by object identity first
        # (model objects may not be hashable, so key on id()).
        if pos_id_title == ("", ""):  # add to all sense dictionaries
            matched = {
                id(sense_data): sense_data
                for sense_data_list in sense_dict.values()
                for sense_data in sense_data_list
            }
        else:
            matched = {
                id(sense_data): sense_data
                for pos_key in pos_id_title
                for sense_data in sense_dict.get(pos_key, [])
            }
        for sense_data in matched.values():
            sense_data.etymology_texts = etymology_data.texts
            sense_data.categories.extend(etymology_data.categories)
def extract_etymology_examples(
    wxr: WiktextractContext,
    level_node: LevelNode,
    base_data: WordEntry,
) -> None:
    """Collect "Attestations historiques" quotations into `base_data`.

    Each list item may carry a "siècle" template giving the period of the
    attestation, followed by an "exemple" template with the quotation;
    non-empty examples are appended to `base_data.etymology_examples`.
    """
    # Imported here to avoid a circular import with the gloss module.
    from .gloss import process_exemple_template

    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        period = ""
        for tmpl in item.find_child(NodeKind.TEMPLATE):
            name = tmpl.template_name
            if name == "siècle":
                # e.g. "(XIIe siècle)" -> "XIIe siècle"
                period = clean_node(wxr, None, tmpl).strip("() ")
            elif name == "exemple":
                example = process_exemple_template(wxr, tmpl, base_data, period)
                if example.text != "":
                    base_data.etymology_examples.append(example)