Coverage for src/wiktextract/extractor/fr/linkage.py: 93%
148 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import capture_text_in_parentheses
8from .models import Form, Linkage, WordEntry
9from .section_types import LINKAGE_SECTIONS, LINKAGE_TAGS
10from .tags import translate_raw_tags
def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    section_type: str,
) -> None:
    """Route a linkage-like section to the extractor matching its type.

    - "dérivés autres langues" sections are handled by
      `process_derives_autres_list`.
    - "anagrammes" sections expand each "voir anagrammes" template and add
      the resulting anagrams to every entry in `page_data` whose language
      code equals that of the last entry.
    - Any other section type is looked up in `LINKAGE_SECTIONS` and
      extracted into the last entry of `page_data`.
    """
    if section_type == "dérivés autres langues":
        process_derives_autres_list(wxr, page_data, level_node)
        return

    if section_type != "anagrammes":
        extract_linkage_section(
            wxr,
            page_data[-1],
            level_node,
            LINKAGE_SECTIONS[section_type],
            LINKAGE_TAGS.get(section_type, []),
        )
        return

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name != "voir anagrammes":
            continue
        anagrams = process_voir_anagrammes_template(wxr, template)
        for entry in page_data:
            # Only entries in the same language as the newest entry
            # receive the anagrams.
            if entry.lang_code != page_data[-1].lang_code:
                continue
            entry.anagrams.extend(anagrams)
def process_derives_autres_list(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
    """Extract the "derived terms in other languages" list.

    Walks every list item below `level_node`. An "L" template sets the
    language code and name that apply to the words following it in the same
    list item; each plain link or link template ("l", "lien", "zh-lien",
    "zh-lien-t") then becomes one `Linkage` appended to the `derived` list
    of the last entry in `page_data`.
    """
    link_templates = ("l", "lien", "zh-lien", "zh-lien-t")
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        # Language context is reset for every list item.
        lang_code = ""
        lang_name = ""
        for node in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
            if isinstance(node, TemplateNode):
                if node.template_name == "L":
                    # Default to "" and normalise through clean_node so a
                    # missing or non-string first parameter never reaches
                    # the Linkage model as None.
                    lang_code = clean_node(
                        wxr, None, node.template_parameters.get(1, "")
                    )
                    lang_name = clean_node(wxr, None, node)
                elif node.template_name in link_templates:
                    linkage_data = Linkage(
                        lang_code=lang_code, lang=lang_name, word=""
                    )
                    process_linkage_template(wxr, node, linkage_data)
                    page_data[-1].derived.append(linkage_data)
            elif node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                page_data[-1].derived.append(
                    Linkage(lang_code=lang_code, lang=lang_name, word=word)
                )
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    section_tags: "list[str] | None" = None,
) -> None:
    """Extract all linkage list items found directly under `level_node`.

    The children are scanned in order while tracking the current sense text
    and sense index, which can be set by:

    - a "(" template (parameter 1: sense text, parameter 2: sense index),
    - a bold or italic node (sense text only),
    - a ";" or ":" description list whose item text may end in "(N)" or
      "(sens N)", giving the sense index.

    Items of any other list are passed to `extract_linkage_list_item`
    together with the sense text and index current at that point.

    `section_tags` defaults to no tags; a `None` sentinel is used instead of
    a shared mutable `[]` default.
    """
    if section_tags is None:
        section_tags = []
    # Trailing "(N)" / "(sens N)" marker in description-list sense text.
    index_pattern = re.compile(r"\s*\((?:sens\s*)?(\d+)\)$")
    sense_text = ""
    sense_index = 0
    for node in level_node.children:
        if isinstance(node, TemplateNode) and node.template_name == "(":
            new_sense_text = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
            # An empty first parameter keeps the previous sense text.
            if new_sense_text != "":
                sense_text = new_sense_text
            sense_index_text = node.template_parameters.get(2, "0")
            if (
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
        elif (
            isinstance(node, WikiNode)
            and node.kind in NodeKind.BOLD | NodeKind.ITALIC
        ):
            sense_text = clean_node(wxr, None, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # sense could also be in ";" description list
            if node.sarg in [";", ":"]:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    sense_text = clean_node(wxr, None, list_item.children)
                    m = index_pattern.search(sense_text)
                    if m is not None:
                        sense_text = index_pattern.sub("", sense_text)
                        sense_index = int(m.group(1))
            else:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_linkage_list_item(
                        wxr,
                        word_entry,
                        list_item,
                        linkage_type,
                        section_tags,
                        sense_text,
                        sense_index,
                    )
def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    section_tags: list[str],
    sense: str,
    sense_index: int,
):
    """Extract the linkage word(s) of one list item, then of its sub-lists.

    The children of `list_item` are scanned in order while filling a
    `Linkage` object:

    - link templates ("l", "lien", "zh-lien", "zh-lien-t") and plain links
      set the word,
    - a "cf" template aborts the whole list item (nothing more is added and
      nested lists are not visited),
    - italic nodes provide the word, a sense index, a raw tag or the sense
      text depending on context,
    - "réf" templates and nested list nodes are skipped in this pass,
    - remaining text is inspected for separators (",", "/", "(ou"),
      parenthesised tags (possibly spread over several nodes), a "—"
      translation or a ":" sense text.

    Each completed word is stored through `add_linkage_data`. Afterwards,
    every item of every nested list is processed recursively with the same
    tags, sense and sense index.
    """

    def fresh_linkage():
        # Blank record carrying the section-level defaults.
        return Linkage(
            word="", tags=section_tags, sense=sense, sense_index=sense_index
        )

    link_templates = ("l", "lien", "zh-lien", "zh-lien-t")
    word_separators = {",", "/", "(ou"}

    current = fresh_linkage()
    pending_tag = ""
    inside_bracket = False

    for position, child in enumerate(list_item.children):
        template_name = (
            child.template_name if isinstance(child, TemplateNode) else None
        )
        node_kind = child.kind if isinstance(child, WikiNode) else None

        if template_name in link_templates:
            process_linkage_template(wxr, child, current)
            continue

        if template_name == "cf":
            return

        if node_kind == NodeKind.LINK and not inside_bracket:
            current.word = clean_node(wxr, None, child)
            continue

        if node_kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, child).strip("()")
            if italic_text != "":
                if len(list(list_item.filter_empty_str_child())) == 1:
                    # The italic node is the only real child: it is the word.
                    current.word = italic_text
                elif italic_text.isdecimal():
                    current.sense_index = int(italic_text)
                elif inside_bracket:
                    current.raw_tags.append(italic_text)
                else:
                    current.sense = italic_text
            continue

        if template_name == "réf" or node_kind == NodeKind.LIST:
            continue

        # Everything else is treated as text (plain string or cleaned node).
        if isinstance(child, str):
            text = child
        else:
            text = clean_node(wxr, word_entry, child)
        stripped = text.strip()

        if stripped in word_separators and current.word != "":
            # list item has more than one word
            add_linkage_data(word_entry, linkage_type, current)
            current = fresh_linkage()
            continue

        opens_bracket = stripped.startswith("(")
        closes_bracket = stripped.endswith(")")
        if opens_bracket and not closes_bracket:
            # Start of a parenthesised tag that continues in later nodes.
            pending_tag = text
            inside_bracket = True
            continue
        if closes_bracket and not opens_bracket:
            # End of the parenthesised tag: merge with the buffered part.
            text = pending_tag + text
            stripped = text.strip()
            pending_tag = ""
            inside_bracket = False
        elif len(pending_tag) > 0:
            pending_tag += text
            continue

        if stripped.startswith("—"):
            # The remainder of the item (nested lists excluded) is the
            # translation.
            remaining_nodes = list(
                list_item.invert_find_child(NodeKind.LIST, True)
            )[position:]
            current.translation = clean_node(
                wxr, None, remaining_nodes
            ).strip("— \n")
            break

        if stripped.startswith(":"):
            current.sense = stripped.removeprefix(":").strip()
        else:
            captured_tags, _ = capture_text_in_parentheses(text)
            for captured in captured_tags:
                if captured.isdecimal():
                    current.sense_index = int(captured)
                else:
                    current.raw_tags.append(captured)

    if len(current.word) > 0:
        add_linkage_data(word_entry, linkage_type, current)

    for nested_list in list_item.find_child(NodeKind.LIST):
        for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
            extract_linkage_list_item(
                wxr,
                word_entry,
                nested_item,
                linkage_type,
                section_tags,
                sense,
                sense_index,
            )
def add_linkage_data(
    word_entry: WordEntry, l_type: str, l_data: Linkage
) -> None:
    """Store one linkage record on `word_entry`.

    Records with an empty word are dropped. Otherwise the raw tags are
    translated, and the record is appended to the attribute of `word_entry`
    named by `l_type`. For the "forms" type the record is first converted
    to a `Form` object.
    """
    if l_data.word == "":
        return

    translate_raw_tags(l_data)

    if l_type != "forms":
        getattr(word_entry, l_type).append(l_data)
        return

    form_data = Form(
        form=l_data.word,
        tags=l_data.tags,
        raw_tags=l_data.raw_tags,
        roman=l_data.roman,
        sense=l_data.sense,
        sense_index=l_data.sense_index,
    )
    word_entry.forms.append(form_data)
def process_linkage_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill `linkage_data` from a link template.

    "lien" and "l" templates go to `process_lien_template`; templates whose
    name starts with "zh-lien" go to `process_zh_lien_template`. Any other
    template leaves `linkage_data` untouched.
    """
    name = node.template_name
    if name in ("lien", "l"):
        process_lien_template(wxr, node, linkage_data)
        return
    if name.startswith("zh-lien"):
        process_zh_lien_template(wxr, node, linkage_data)
def process_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill `linkage_data` from a "lien"/"l" link template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:lien

    The word is taken from the "dif" parameter when present, otherwise from
    the first positional parameter. "tr" is stored as the romanization and
    "sens" as the translation when those parameters are given.
    """
    params = node.template_parameters
    # Fall back to "" (as the other template handlers in this file do) so
    # clean_node never receives None when the template has no word.
    linkage_data.word = clean_node(
        wxr, None, params.get("dif", params.get(1, ""))
    )
    if "tr" in params:
        linkage_data.roman = clean_node(wxr, None, params.get("tr"))
    if "sens" in params:
        linkage_data.translation = clean_node(wxr, None, params.get("sens"))
def process_zh_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill `linkage_data` from a "zh-lien" style template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:zh-lien

    Positional parameter 1 is the word, 2 the pinyin (stored as `roman`)
    and 3 the traditional form (stored as `alt` when not empty).
    """
    params = node.template_parameters
    # Parameter 1 now defaults to "" like parameters 2 and 3, so a template
    # without a word does not hand None to clean_node.
    linkage_data.word = clean_node(wxr, None, params.get(1, ""))
    linkage_data.roman = clean_node(wxr, None, params.get(2, ""))  # pinyin
    traditional_form = clean_node(wxr, None, params.get(3, ""))
    if len(traditional_form) > 0:
        linkage_data.alt = traditional_form
def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    """Expand a "voir anagrammes" template and collect its linked words.

    Template documentation:
    https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes

    The template is converted back to wikitext, parsed with full expansion,
    and every non-empty link text found directly inside a list item of the
    result becomes one `Linkage`.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(node)
    expanded_tree = wxr.wtp.parse(template_wikitext, expand_all=True)
    return [
        Linkage(word=anagram)
        for item in expanded_tree.find_child_recursively(NodeKind.LIST_ITEM)
        for link in item.find_child(NodeKind.LINK)
        if len(anagram := clean_node(wxr, None, link)) > 0
    ]