Coverage for src/wiktextract/extractor/fr/linkage.py: 95%
153 statements
coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
import re

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from ..share import capture_text_in_parentheses
from .models import Form, Linkage, WordEntry
from .section_types import LINKAGE_SECTIONS, LINKAGE_TAGS
from .tags import translate_raw_tags


def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    section_type: str,
) -> None:
    if section_type == "anagrammes":
        for node in level_node.find_child(NodeKind.TEMPLATE):
            if node.template_name == "voir anagrammes":  # coverage: condition always true
                anagram_list = process_voir_anagrammes_template(wxr, node)
                for data in page_data:
                    if data.lang_code == page_data[-1].lang_code:  # coverage: condition always true
                        data.anagrams.extend(anagram_list)
    else:
        extract_linkage_section(
            wxr,
            page_data[-1],
            level_node,
            LINKAGE_SECTIONS[section_type],
            LINKAGE_TAGS.get(section_type, []),
        )
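# Illustrative note (descriptive comment, not from the original source):
# extract_linkage() is the section-level dispatcher. For a section such as
# "Synonymes" it is expected to be called with section_type="synonymes",
# which LINKAGE_SECTIONS maps to the name of a list attribute on WordEntry;
# "anagrammes" sections are instead handled through the "voir anagrammes"
# template. The exact keys and values in LINKAGE_SECTIONS are assumptions
# here, not shown in this file.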


def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    section_tags: list[str] = [],
):
    sense_text = ""
    sense_index = 0
    for node in level_node.children:
        if isinstance(node, TemplateNode) and node.template_name == "(":
            new_sense_text = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
            if new_sense_text != "":  # coverage: condition always true
                sense_text = new_sense_text
            sense_index_text = node.template_parameters.get(2, "0")
            if (  # coverage: condition always true
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
        elif (
            isinstance(node, WikiNode)
            and node.kind in NodeKind.BOLD | NodeKind.ITALIC
        ):
            sense_text = clean_node(wxr, None, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # sense could also be in ";" description list
            if node.sarg in [";", ":"]:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    sense_text = clean_node(wxr, None, list_item.children)
                    index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"
                    m = re.search(index_pattern, sense_text)
                    if m is not None:  # coverage: condition always true
                        sense_text = re.sub(index_pattern, "", sense_text)
                        sense_index = int(m.group(1))
            else:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_linkage_list_item(
                        wxr,
                        word_entry,
                        list_item,
                        linkage_type,
                        section_tags,
                        sense_text,
                        sense_index,
                    )
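# Illustrative sketch (hypothetical wikitext, not taken from the original
# source): a section body this parser handles could look roughly like
#   {{(|Animal|1}}
#   * [[chien]]
#   {{)}}
# where the "(" template supplies sense_text="Animal" and sense_index=1, and
# each "*" list item is forwarded to extract_linkage_list_item(). Only the
# handling of the "(" parameters, bold/italic sense labels and list nodes is
# taken from the code; the exact layout above is an assumption.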


def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    section_tags: list[str],
    sense: str,
    sense_index: int,
):
    linkage_data = Linkage(
        word="", tags=section_tags, sense=sense, sense_index=sense_index
    )
    pending_tag = ""
    inside_bracket = False
    for index, child_node in enumerate(list_item.children):
        if isinstance(
            child_node, TemplateNode
        ) and child_node.template_name in [
            "l",
            "lien",
            "zh-lien",
            "zh-lien-t",
        ]:
            process_linkage_template(wxr, child_node, linkage_data)
        elif (
            isinstance(child_node, TemplateNode)
            and child_node.template_name == "zh-l"
        ):
            for l_data in extract_zh_l_template(
                wxr, child_node, section_tags, sense, sense_index
            ):
                add_linkage_data(word_entry, linkage_type, l_data)
        elif (
            isinstance(child_node, TemplateNode)
            and child_node.template_name == "cf"
        ):
            return
        elif (
            isinstance(child_node, WikiNode)
            and child_node.kind == NodeKind.LINK
            and not inside_bracket
        ):
            linkage_data.word = clean_node(wxr, None, child_node)
        elif (
            isinstance(child_node, WikiNode)
            and child_node.kind == NodeKind.ITALIC
        ):
            italic_text = clean_node(wxr, None, child_node).strip("()")
            if italic_text == "":  # coverage: condition never true
                continue
            elif len(list(list_item.filter_empty_str_child())) == 1:
                linkage_data.word = italic_text
            elif italic_text.isdecimal():  # coverage: condition never true
                linkage_data.sense_index = int(italic_text)
            elif inside_bracket:
                linkage_data.raw_tags.append(italic_text)
            else:
                linkage_data.sense = italic_text
        elif (
            isinstance(child_node, TemplateNode)
            and child_node.template_name == "réf"
        ) or (
            isinstance(child_node, WikiNode)
            and child_node.kind == NodeKind.LIST
        ):
            continue
        else:
            tag_text = (
                child_node
                if isinstance(child_node, str)
                else clean_node(wxr, word_entry, child_node)
            )
            if (
                tag_text.strip() in {",", "/", "(ou"}
                and linkage_data.word != ""
            ):
                # list item has more than one word
                add_linkage_data(word_entry, linkage_type, linkage_data)
                linkage_data = Linkage(
                    word="",
                    tags=section_tags,
                    sense=sense,
                    sense_index=sense_index,
                )
                continue
            if tag_text.strip().startswith(
                "("
            ) and not tag_text.strip().endswith(")"):
                pending_tag = tag_text
                inside_bracket = True
                continue
            elif not tag_text.strip().startswith(
                "("
            ) and tag_text.strip().endswith(")"):
                tag_text = pending_tag + tag_text
                pending_tag = ""
                inside_bracket = False
            elif len(pending_tag) > 0:
                pending_tag += tag_text
                continue

            if tag_text.strip().startswith("—"):
                linkage_data.translation = clean_node(
                    wxr,
                    None,
                    list(list_item.invert_find_child(NodeKind.LIST, True))[
                        index:
                    ],
                ).strip("— \n")
                break
            elif tag_text.lstrip().startswith(":"):
                linkage_data.sense = clean_node(
                    wxr,
                    None,
                    [tag_text.lstrip().removeprefix(":").lstrip()]
                    + [
                        n
                        for n in list_item.children[index + 1 :]
                        if not (
                            (
                                isinstance(n, TemplateNode)
                                and n.template_name == "réf"
                            )
                            or (
                                isinstance(n, WikiNode)
                                and n.kind == NodeKind.LIST
                            )
                        )
                    ],
                )
                break
            else:
                tags, _ = capture_text_in_parentheses(tag_text)
                for tag in tags:
                    if tag.isdecimal():
                        linkage_data.sense_index = int(tag)
                    else:
                        linkage_data.raw_tags.append(tag)

    if len(linkage_data.word) > 0:
        add_linkage_data(word_entry, linkage_type, linkage_data)
    for child_list in list_item.find_child(NodeKind.LIST):
        for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_linkage_list_item(
                wxr,
                word_entry,
                child_list_item,
                linkage_type,
                section_tags,
                sense,
                sense_index,
            )
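# Illustrative sketch (hypothetical list item, not from the original source):
# an item such as
#   * [[chien]], [[toutou]] (Familier) (1)
# is expected to yield one Linkage per linked word (the "," text node triggers
# the split), while parenthesised plain text is routed through
# capture_text_in_parentheses(): "Familier" becomes a raw tag of the word it
# follows and the decimal "1" becomes its sense_index. A trailing "— gloss"
# segment would be stored as a translation instead.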


def add_linkage_data(
    word_entry: WordEntry, l_type: str, l_data: Linkage
) -> None:
    if l_data.word == "":  # coverage: condition never true
        return
    translate_raw_tags(l_data)
    if l_type == "forms":
        word_entry.forms.append(
            Form(
                form=l_data.word,
                tags=l_data.tags,
                raw_tags=l_data.raw_tags,
                roman=l_data.roman,
                sense=l_data.sense,
                sense_index=l_data.sense_index,
            )
        )
    else:
        getattr(word_entry, l_type).append(l_data)
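# Note (descriptive comment, not from the original source): l_type is assumed
# to be the name of a list attribute on WordEntry as resolved by
# LINKAGE_SECTIONS (for example "synonyms"); the special value "forms"
# converts the Linkage into a Form entry instead of appending it via getattr().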


def process_linkage_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    if node.template_name in ["lien", "l"]:
        process_lien_template(wxr, node, linkage_data)
    elif node.template_name.startswith("zh-lien"):  # coverage: condition always true
        process_zh_lien_template(wxr, node, linkage_data)


def process_lien_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    # link word template: https://fr.wiktionary.org/wiki/Modèle:lien
    ruby, without_ruby = extract_ruby(
        wxr,
        wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                node.template_parameters.get(
                    "dif", node.template_parameters.get(1)
                )
            ),
            expand_all=True,
        ),
    )
    linkage_data.word = clean_node(wxr, None, without_ruby)
    linkage_data.ruby = ruby
    linkage_data.roman = clean_node(
        wxr, None, node.template_parameters.get("tr", "")
    )
    linkage_data.translation = clean_node(
        wxr, None, node.template_parameters.get("sens", "")
    )
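# Illustrative sketch (hypothetical parameters, not from the original source):
# for a call like {{lien|犬|ja|tr=inu|sens=chien}} this parser is expected to
# set word="犬", roman="inu" (from "tr") and translation="chien" (from "sens");
# "dif", when present, replaces parameter 1 as the displayed form and may
# carry ruby annotations, which extract_ruby() splits off into linkage.ruby.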


def process_zh_lien_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    # https://fr.wiktionary.org/wiki/Modèle:zh-lien
    linkage_data.word = clean_node(wxr, None, node.template_parameters.get(1))
    linkage_data.roman = clean_node(
        wxr, None, node.template_parameters.get(2, "")
    )  # pinyin
    traditional_form = clean_node(
        wxr, None, node.template_parameters.get(3, "")
    )
    if len(traditional_form) > 0:
        linkage_data.alt = traditional_form
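# Illustrative sketch (hypothetical parameters, not from the original source):
# {{zh-lien|爱|ài|愛}} is expected to produce word="爱", roman="ài" (pinyin)
# and alt="愛" (traditional form); when the third parameter is absent, alt is
# simply left unset.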


def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    # https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes
    results = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        for link_node in list_item.find_child(NodeKind.LINK):
            word = clean_node(wxr, None, link_node)
            if len(word) > 0:  # coverage: condition always true
                results.append(Linkage(word=word))
    return results
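# Note (descriptive comment, not from the original source): the template is
# expanded first so that the anagram links it generates become ordinary LINK
# nodes in list items; only those links are kept, each as a bare Linkage(word=...).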


def extract_zh_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str] = [],
    sense: str = "",
    sense_index: int = 0,
) -> list[Linkage]:
    # https://fr.wiktionary.org/wiki/Modèle:zh-l
    roman = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    new_sense = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
    if new_sense != "":
        sense = new_sense
    l_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        word = clean_node(wxr, None, span_tag)
        if word != "":  # coverage: condition always true
            l_data = Linkage(
                word=word,
                sense=sense,
                sense_index=sense_index,
                raw_tags=raw_tags,
                roman=roman,
            )
            translate_raw_tags(l_data)
            l_list.append(l_data)
    if len(l_list) == 2:
        for index, l_data in enumerate(l_list):
            if index == 0:
                l_data.tags.append("Traditional-Chinese")
            else:
                l_data.tags.append("Simplified-Chinese")
    return l_list
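# Illustrative sketch (an assumption about the expanded template output, not
# taken from the original source): {{zh-l|愛}} typically expands to HTML
# containing one <span lang="zh"> per written form; when exactly two spans are
# found, the code above assumes they are the traditional and simplified forms
# in that order and tags them "Traditional-Chinese" and "Simplified-Chinese".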