Coverage for src/wiktextract/extractor/fr/linkage.py: 94%
136 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import re
3from wikitextprocessor import NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import capture_text_in_parentheses
8from .models import Form, Linkage, WordEntry
9from .section_types import LINKAGE_SECTIONS, LINKAGE_TAGS
10from .tags import translate_raw_tags
def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    section_type: str,
) -> None:
    """Dispatch a linkage section to the extractor matching its type.

    * ``"dérivés autres langues"`` is handled by
      ``process_derives_autres_list``.
    * ``"anagrammes"`` reads every ``voir anagrammes`` template directly
      under ``level_node`` and adds the resulting anagrams to each entry of
      ``page_data`` that has the same ``lang_code`` as the last entry.
    * Any other section type is handled by ``process_linkage_list``, using
      the target field from ``LINKAGE_SECTIONS`` and the optional tags from
      ``LINKAGE_TAGS``.
    """
    if section_type == "dérivés autres langues":
        process_derives_autres_list(wxr, page_data, level_node)
        return

    if section_type != "anagrammes":
        process_linkage_list(
            wxr,
            page_data,
            level_node,
            LINKAGE_SECTIONS[section_type],
            LINKAGE_TAGS.get(section_type, []),
        )
        return

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name != "voir anagrammes":
            continue
        anagrams = process_voir_anagrammes_template(wxr, template)
        for entry in page_data:
            # Only entries in the same language as the newest entry
            # receive the anagrams.
            if entry.lang_code == page_data[-1].lang_code:
                entry.anagrams.extend(anagrams)
def process_derives_autres_list(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
) -> None:
    """Extract the "derived terms in other languages" list.

    Every list item found below ``level_node`` is scanned for templates and
    links. An ``{{L}}`` template sets the language code (its first
    parameter) and the language name (its cleaned text) for what follows in
    the same list item. Each plain link, and each ``l``/``lien``/
    ``zh-lien``/``zh-lien-t`` template, then becomes one ``Linkage``
    appended to ``page_data[-1].derived``.
    """
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        lang_code = ""
        lang_name = ""
        for node in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
            if isinstance(node, TemplateNode) and node.template_name == "L":
                # Clean the parameter and default it to "" so that a bare
                # ``{{L}}`` (or a non-string argument) never stores a
                # non-string value in ``lang_code``.
                lang_code = clean_node(
                    wxr, None, node.template_parameters.get(1, "")
                )
                lang_name = clean_node(wxr, None, node)
            elif node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                page_data[-1].derived.append(
                    Linkage(lang_code=lang_code, lang=lang_name, word=word)
                )
            elif isinstance(node, TemplateNode) and node.template_name in [
                "l",
                "lien",
                "zh-lien",
                "zh-lien-t",
            ]:
                linkage_data = Linkage(
                    lang_code=lang_code, lang=lang_name, word=""
                )
                process_linkage_template(wxr, node, linkage_data)
                page_data[-1].derived.append(linkage_data)
def process_linkage_list(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    linkage_type: str,
    section_tags: "list[str] | None" = None,
) -> None:
    """Extract linkage words from the lists below ``level_node``.

    List items and templates are visited recursively. Two kinds of nodes
    only update the "current sense" state shared by the following items:

    * the ``{{(}}`` table-start template (parameter 1: sense text,
      parameter 2: sense index);
    * ``;`` / ``:`` list items, whose text is the sense, optionally ending
      in ``(N)`` or ``(sens N)``.

    Every other node is scanned child by child to build one or more
    ``Linkage`` objects, which are stored through ``add_linkage_data`` on
    ``page_data[-1]`` under ``linkage_type``.

    ``section_tags`` are the tags given to every created ``Linkage``;
    omitted or ``None`` means no tags.
    """
    # A ``None`` default replaces the former shared mutable ``[]`` default.
    if section_tags is None:
        section_tags = []

    sense_text = ""
    sense_index = 0
    for template_or_list_node in level_node.find_child_recursively(
        NodeKind.LIST_ITEM | NodeKind.TEMPLATE
    ):
        # list table start template: https://fr.wiktionary.org/wiki/Modèle:(
        if (
            isinstance(template_or_list_node, TemplateNode)
            and template_or_list_node.template_name == "("
        ):
            sense_text = clean_node(
                wxr, None, template_or_list_node.template_parameters.get(1, "")
            )
            sense_index_text = template_or_list_node.template_parameters.get(
                2, "0"
            )
            if (
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
            continue
        # sense could also be in ";" description list
        if (
            template_or_list_node.kind == NodeKind.LIST_ITEM
            and template_or_list_node.sarg in {";", ":"}
        ):
            sense_text = clean_node(wxr, None, template_or_list_node.children)
            index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"
            m = re.search(index_pattern, sense_text)
            if m is not None:
                sense_text = re.sub(index_pattern, "", sense_text)
                sense_index = int(m.group(1))
            continue

        # Each Linkage gets its own copy of the tags so that later changes
        # to one entry's tags cannot leak into the caller's list.
        linkage_data = Linkage(word="", tags=list(section_tags))
        if len(sense_text) > 0:
            linkage_data.sense = sense_text
        if sense_index != 0:
            linkage_data.sense_index = sense_index
        pending_tag = ""
        inside_bracket = False
        # Children with nested lists removed. Built once, because the "—"
        # branch below needs to slice the same sequence by index.
        children = list(
            template_or_list_node.invert_find_child(NodeKind.LIST, True)
        )
        for index, child_node in enumerate(children):
            if isinstance(
                child_node, TemplateNode
            ) and child_node.template_name in [
                "l",
                "lien",
                "zh-lien",
                "zh-lien-t",
            ]:
                process_linkage_template(wxr, child_node, linkage_data)
            elif (
                isinstance(child_node, WikiNode)
                and child_node.kind == NodeKind.LINK
                and not inside_bracket
            ):
                # Links inside an unclosed "(" belong to the tag text, not
                # to the linked word.
                linkage_data.word = clean_node(wxr, None, child_node)
            elif (
                isinstance(child_node, WikiNode)
                and child_node.kind == NodeKind.ITALIC
            ):
                current_sense = clean_node(wxr, None, child_node).strip("()")
                if (
                    len(list(template_or_list_node.filter_empty_str_child()))
                    == 1
                ):
                    # The italic node is the only non-empty child, so it is
                    # the word itself.
                    linkage_data.word = current_sense
                elif current_sense.isdecimal():
                    linkage_data.sense_index = int(current_sense)
                else:
                    linkage_data.sense = current_sense
            elif (
                isinstance(child_node, TemplateNode)
                and child_node.template_name == "réf"
            ):
                continue
            else:
                tag_text = (
                    child_node
                    if isinstance(child_node, str)
                    else clean_node(wxr, page_data[-1], child_node)
                )
                stripped = tag_text.strip()
                if stripped in {",", "/", "(ou"} and linkage_data.word != "":
                    # list item has more than one word
                    add_linkage_data(page_data[-1], linkage_type, linkage_data)
                    # NOTE(review): the follow-up word starts without the
                    # current sense / sense_index; confirm this is intended.
                    linkage_data = Linkage(word="", tags=list(section_tags))
                    continue

                opens_bracket = stripped.startswith("(")
                closes_bracket = stripped.endswith(")")
                if opens_bracket and not closes_bracket:
                    # Start of a parenthesised tag that spans several nodes.
                    pending_tag = tag_text
                    inside_bracket = True
                    continue
                elif not opens_bracket and closes_bracket:
                    # End of the parenthesised tag: join the buffered text.
                    tag_text = pending_tag + tag_text
                    stripped = tag_text.strip()
                    pending_tag = ""
                    inside_bracket = False
                elif len(pending_tag) > 0:
                    pending_tag += tag_text
                    continue

                if stripped.startswith("—"):
                    # Everything from the dash to the end of the item is the
                    # translation; no further children are processed.
                    linkage_data.translation = clean_node(
                        wxr, None, children[index:]
                    ).strip("— ")
                    break
                elif stripped.startswith(":"):
                    # NOTE(review): this also overwrites the sense text that
                    # later list items inherit; confirm this is intended.
                    sense_text = stripped.removeprefix(":").strip()
                    linkage_data.sense = sense_text
                else:
                    tags, _ = capture_text_in_parentheses(tag_text)
                    for tag in tags:
                        if tag.isdecimal():
                            linkage_data.sense_index = int(tag)
                        else:
                            linkage_data.raw_tags.append(tag)

        if len(linkage_data.word) > 0:
            add_linkage_data(page_data[-1], linkage_type, linkage_data)
def add_linkage_data(
    word_entry: WordEntry, l_type: str, l_data: Linkage
) -> None:
    """Store ``l_data`` on ``word_entry`` under the field named ``l_type``.

    Data with an empty word is ignored. Raw tags are translated first via
    ``translate_raw_tags``. For ``l_type == "forms"`` the linkage is
    converted to a ``Form`` and appended to ``word_entry.forms``; for any
    other type ``l_data`` itself is appended to the attribute of that name.
    """
    if l_data.word == "":
        return

    translate_raw_tags(l_data)

    if l_type != "forms":
        getattr(word_entry, l_type).append(l_data)
        return

    form = Form(
        form=l_data.word,
        tags=l_data.tags,
        raw_tags=l_data.raw_tags,
        roman=l_data.roman,
        sense=l_data.sense,
        sense_index=l_data.sense_index,
    )
    word_entry.forms.append(form)
def process_linkage_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Route a link template to the handler for its template family.

    ``lien`` / ``l`` go to ``process_lien_template``; names starting with
    ``zh-lien`` go to ``process_zh_lien_template``. Any other template
    leaves ``linkage_data`` untouched.
    """
    name = node.template_name
    if name in ("lien", "l"):
        process_lien_template(wxr, node, linkage_data)
        return
    if name.startswith("zh-lien"):
        process_zh_lien_template(wxr, node, linkage_data)
def process_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill ``linkage_data`` from a ``{{lien}}`` / ``{{l}}`` template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:lien

    * ``word`` comes from the ``dif`` parameter when present, otherwise
      from the first positional parameter (empty string if both are
      missing).
    * ``roman`` is set from ``tr`` when that parameter is present.
    * ``translation`` is set from ``sens`` when that parameter is present.
    """
    params = node.template_parameters
    # Default to "" so clean_node never receives None when the template has
    # neither ``dif`` nor a first positional parameter.
    linkage_data.word = clean_node(
        wxr, None, params.get("dif", params.get(1, ""))
    )
    if "tr" in params:
        linkage_data.roman = clean_node(wxr, None, params["tr"])
    if "sens" in params:
        linkage_data.translation = clean_node(wxr, None, params["sens"])
def process_zh_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill ``linkage_data`` from a ``{{zh-lien}}``-family template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:zh-lien

    Positional parameter 1 becomes ``word``, parameter 2 (pinyin) becomes
    ``roman``, and parameter 3 (the traditional form) is stored in ``alt``
    when it is not empty.
    """
    params = node.template_parameters
    # All three parameters default to "" so a missing one is cleaned to an
    # empty string instead of passing None to clean_node.
    linkage_data.word = clean_node(wxr, None, params.get(1, ""))
    linkage_data.roman = clean_node(wxr, None, params.get(2, ""))  # pinyin
    traditional_form = clean_node(wxr, None, params.get(3, ""))
    if len(traditional_form) > 0:
        linkage_data.alt = traditional_form
def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    """Expand a ``{{voir anagrammes}}`` template and collect its anagrams.

    Template documentation:
    https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes

    The template is converted back to wikitext, fully expanded and parsed.
    Every link that is a direct child of a list item in the result becomes
    one ``Linkage``; links whose cleaned text is empty are skipped.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)

    anagrams: list[Linkage] = []
    for item in expanded_root.find_child_recursively(NodeKind.LIST_ITEM):
        link_texts = (
            clean_node(wxr, None, link)
            for link in item.find_child(NodeKind.LINK)
        )
        anagrams.extend(
            Linkage(word=text) for text in link_texts if text != ""
        )
    return anagrams