Coverage for src/wiktextract/extractor/fr/linkage.py: 94%
152 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-10-17 05:52 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..ruby import extract_ruby
8from ..share import capture_text_in_parentheses
9from .models import Form, Linkage, WordEntry
10from .section_types import LINKAGE_SECTIONS, LINKAGE_TAGS
11from .tags import translate_raw_tags
def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    section_type: str,
) -> None:
    """Extract one linkage section of a French Wiktionary page.

    Every section type except ``"anagrammes"`` is forwarded to
    ``extract_linkage_section`` for the last entry of ``page_data``, with
    ``LINKAGE_SECTIONS[section_type]`` as the linkage type and
    ``LINKAGE_TAGS.get(section_type, [])`` as the section tags.

    For ``"anagrammes"``, each ``voir anagrammes`` template returned by
    ``level_node.find_child`` is processed, and the resulting anagrams are
    added to every entry whose ``lang_code`` equals that of the last entry.
    """
    if section_type != "anagrammes":
        extract_linkage_section(
            wxr,
            page_data[-1],
            level_node,
            LINKAGE_SECTIONS[section_type],
            LINKAGE_TAGS.get(section_type, []),
        )
        return

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name != "voir anagrammes":
            continue
        anagrams = process_voir_anagrammes_template(wxr, t_node)
        for entry in page_data:
            # Only entries in the same language as the newest entry
            # receive the anagram list.
            if entry.lang_code == page_data[-1].lang_code:
                entry.anagrams.extend(anagrams)
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    section_tags: list[str] | None = None,
):
    """Extract the linkage lists found directly under ``level_node``.

    The children of ``level_node`` are visited in order. A running sense
    text and sense index are updated from:

    * a ``(`` template: parameter 1 is the sense text (kept only when it is
      non-empty), parameter 2 the sense index when it is a decimal string;
    * a bold or italic node: its cleaned text becomes the sense text;
    * a ``;`` or ``:`` list: each item's cleaned text becomes the sense
      text, and a trailing ``(N)`` or ``(sens N)`` is removed from it and
      used as the sense index.

    Items of any other list are passed to ``extract_linkage_list_item``
    together with the sense text and index current at that point.

    Args:
        wxr: Extraction context.
        word_entry: Entry that receives the extracted data.
        level_node: Section node whose children are scanned.
        linkage_type: Forwarded to ``extract_linkage_list_item``.
        section_tags: Tags forwarded for every item of this section;
            ``None`` (the default) means no tags.
    """
    # Use a fresh list per call instead of one mutable default object
    # shared by every call that omits the argument.
    if section_tags is None:
        section_tags = []
    # Matches a trailing "(N)" or "(sens N)" sense index.
    index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"
    sense_text = ""
    sense_index = 0
    for node in level_node.children:
        if isinstance(node, TemplateNode) and node.template_name == "(":
            new_sense_text = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
            if new_sense_text != "":
                sense_text = new_sense_text
            sense_index_text = node.template_parameters.get(2, "0")
            if (
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
        elif (
            isinstance(node, WikiNode)
            and node.kind in NodeKind.BOLD | NodeKind.ITALIC
        ):
            sense_text = clean_node(wxr, None, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # sense could also be in ";" description list
            if node.sarg in [";", ":"]:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    sense_text = clean_node(wxr, None, list_item.children)
                    m = re.search(index_pattern, sense_text)
                    if m is not None:
                        sense_text = re.sub(index_pattern, "", sense_text)
                        sense_index = int(m.group(1))
            else:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_linkage_list_item(
                        wxr,
                        word_entry,
                        list_item,
                        linkage_type,
                        section_tags,
                        sense_text,
                        sense_index,
                    )
def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    section_tags: list[str],
    sense: str,
    sense_index: int,
):
    """Extract the linkage entries written in one list item.

    The children of ``list_item`` are scanned in order while one
    ``Linkage`` is being filled in:

    * ``l``/``lien``/``zh-lien``/``zh-lien-t`` templates, and link nodes
      met outside an open parenthesis, set the word;
    * a ``zh-l`` template is expanded straight into the
      ``linkage_type`` attribute of ``word_entry``;
    * a ``cf`` template makes the function return immediately;
    * an italic node supplies the word, a sense index, a raw tag or the
      sense, depending on its text and on the scan state;
    * ``réf`` templates and nested lists are skipped by the scan;
    * any other child is handled as text: ``,``, ``/`` or ``(ou`` after a
      word closes the current entry, a leading ``—`` starts the
      translation, a leading ``:`` gives the sense, and parenthesised
      text yields a sense index or raw tags.

    Completed entries are stored with ``add_linkage_data``. Nested lists
    are processed afterwards by recursion, with the same section tags,
    sense and sense index.
    """

    def blank_linkage() -> Linkage:
        # Every entry of the item starts from the section-level values.
        return Linkage(
            word="", tags=section_tags, sense=sense, sense_index=sense_index
        )

    link_templates = ("l", "lien", "zh-lien", "zh-lien-t")
    separators = {",", "/", "(ou"}

    current = blank_linkage()
    open_tag_text = ""  # text gathered since an unclosed "("
    in_parens = False
    for position, child in enumerate(list_item.children):
        is_template = isinstance(child, TemplateNode)
        is_wiki_node = isinstance(child, WikiNode)

        if is_template and child.template_name in link_templates:
            process_linkage_template(wxr, child, current)
            continue

        if is_template and child.template_name == "zh-l":
            getattr(word_entry, linkage_type).extend(
                extract_zh_l_template(
                    wxr, child, section_tags, sense, sense_index
                )
            )
            continue

        if is_template and child.template_name == "cf":
            return

        if is_wiki_node and child.kind == NodeKind.LINK and not in_parens:
            current.word = clean_node(wxr, None, child)
            continue

        if is_wiki_node and child.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, child).strip("()")
            if italic_text == "":
                pass
            elif len(list(list_item.filter_empty_str_child())) == 1:
                # The italic node is the only non-empty child of the item.
                current.word = italic_text
            elif italic_text.isdecimal():
                current.sense_index = int(italic_text)
            elif in_parens:
                current.raw_tags.append(italic_text)
            else:
                current.sense = italic_text
            continue

        if (is_template and child.template_name == "réf") or (
            is_wiki_node and child.kind == NodeKind.LIST
        ):
            continue

        # Everything else is handled through its text form.
        if isinstance(child, str):
            tag_text = child
        else:
            tag_text = clean_node(wxr, word_entry, child)
        stripped = tag_text.strip()

        if stripped in separators and current.word != "":
            # list item has more than one word
            add_linkage_data(word_entry, linkage_type, current)
            current = blank_linkage()
            continue

        opens_paren = stripped.startswith("(")
        closes_paren = stripped.endswith(")")
        if opens_paren and not closes_paren:
            open_tag_text = tag_text
            in_parens = True
            continue
        elif not opens_paren and closes_paren:
            tag_text = open_tag_text + tag_text
            open_tag_text = ""
            in_parens = False
            # The merged text is what the checks below must look at.
            stripped = tag_text.strip()
        elif len(open_tag_text) > 0:
            open_tag_text += tag_text
            continue

        if stripped.startswith("—"):
            remaining = list(list_item.invert_find_child(NodeKind.LIST, True))[
                position:
            ]
            current.translation = clean_node(wxr, None, remaining).strip(
                "— \n"
            )
            break
        elif stripped.startswith(":"):
            current.sense = stripped.removeprefix(":").strip()
        else:
            found_tags, _ = capture_text_in_parentheses(tag_text)
            for found_tag in found_tags:
                if found_tag.isdecimal():
                    current.sense_index = int(found_tag)
                else:
                    current.raw_tags.append(found_tag)

    if len(current.word) > 0:
        add_linkage_data(word_entry, linkage_type, current)
    for nested_list in list_item.find_child(NodeKind.LIST):
        for nested_item in nested_list.find_child(NodeKind.LIST_ITEM):
            extract_linkage_list_item(
                wxr,
                word_entry,
                nested_item,
                linkage_type,
                section_tags,
                sense,
                sense_index,
            )
def add_linkage_data(
    word_entry: WordEntry, l_type: str, l_data: Linkage
) -> None:
    """Store ``l_data`` on ``word_entry`` under the attribute ``l_type``.

    Data with an empty word is ignored. Otherwise ``translate_raw_tags`` is
    applied to ``l_data`` first. For ``l_type == "forms"`` the data is
    converted to a ``Form`` and appended to ``word_entry.forms``; for any
    other type ``l_data`` itself is appended to the attribute named
    ``l_type``.
    """
    if l_data.word == "":
        return
    translate_raw_tags(l_data)

    if l_type != "forms":
        getattr(word_entry, l_type).append(l_data)
        return

    # "forms" holds Form objects, so copy the relevant fields over.
    form = Form(
        form=l_data.word,
        tags=l_data.tags,
        raw_tags=l_data.raw_tags,
        roman=l_data.roman,
        sense=l_data.sense,
        sense_index=l_data.sense_index,
    )
    word_entry.forms.append(form)
def process_linkage_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    """Fill ``linkage_data`` from a link template, dispatching on its name.

    ``lien`` and ``l`` go to ``process_lien_template``; names starting
    with ``zh-lien`` go to ``process_zh_lien_template``. Any other
    template leaves ``linkage_data`` untouched.
    """
    name = node.template_name
    if name in ["lien", "l"]:
        process_lien_template(wxr, node, linkage_data)
        return
    if name.startswith("zh-lien"):
        process_zh_lien_template(wxr, node, linkage_data)
def process_lien_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    """Fill ``linkage_data`` from a ``lien``/``l`` template.

    The word comes from the ``dif`` parameter when present, otherwise from
    parameter 1. That value is re-parsed with full expansion and passed
    through ``extract_ruby``; the ruby part is stored in ``ruby`` and the
    cleaned remainder in ``word``. ``tr`` becomes ``roman`` and ``sens``
    becomes ``translation``, each defaulting to an empty string.
    """
    # link word template: https://fr.wiktionary.org/wiki/Modèle:lien
    params = node.template_parameters
    # Default to "" like every other parameter lookup in this module, so a
    # template with neither "dif" nor parameter 1 gives an empty word
    # instead of handing None to node_to_wikitext().
    displayed = params.get("dif", params.get(1, ""))
    ruby, without_ruby = extract_ruby(
        wxr,
        wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(displayed),
            expand_all=True,
        ),
    )
    linkage_data.word = clean_node(wxr, None, without_ruby)
    linkage_data.ruby = ruby
    linkage_data.roman = clean_node(wxr, None, params.get("tr", ""))
    linkage_data.translation = clean_node(wxr, None, params.get("sens", ""))
def process_zh_lien_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    """Fill ``linkage_data`` from a ``zh-lien`` template.

    Parameter 1 is the word and parameter 2 the romanization. Parameter 3,
    when non-empty, is stored in ``alt``.
    """
    # https://fr.wiktionary.org/wiki/Modèle:zh-lien
    params = node.template_parameters
    # Default to "" like the other lookups, so a template without
    # parameter 1 gives an empty word instead of passing None on.
    linkage_data.word = clean_node(wxr, None, params.get(1, ""))
    linkage_data.roman = clean_node(wxr, None, params.get(2, ""))  # pinyin
    traditional_form = clean_node(wxr, None, params.get(3, ""))
    if len(traditional_form) > 0:
        linkage_data.alt = traditional_form
def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    """Return one ``Linkage`` per non-empty link in the expanded template.

    The template is converted back to wikitext and re-parsed with full
    expansion. Every link node that ``find_child`` yields for a list item
    of the expanded tree contributes its cleaned text as a word.
    """
    # https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes
    wikitext = wxr.wtp.node_to_wikitext(node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    # Lazy generator: each link is cleaned just before its Linkage is built.
    cleaned_words = (
        clean_node(wxr, None, link)
        for item in expanded_root.find_child_recursively(NodeKind.LIST_ITEM)
        for link in item.find_child(NodeKind.LINK)
    )
    return [Linkage(word=text) for text in cleaned_words if len(text) > 0]
def extract_zh_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str] | None = None,
    sense: str = "",
    sense_index: int = 0,
) -> list[Linkage]:
    """Build ``Linkage`` entries from a ``zh-l`` template.

    Parameter 2 is the romanization. Parameter 3, when non-empty, overrides
    ``sense``. The template is expanded, and every ``span`` element with
    ``lang="zh"`` that has non-empty text gives one entry. When exactly two
    entries are found, the first is tagged ``Traditional-Chinese`` and the
    second ``Simplified-Chinese``.

    Args:
        wxr: Extraction context.
        t_node: The ``zh-l`` template node.
        raw_tags: Raw tags given to every entry; ``None`` (the default)
            means none.
        sense: Sense text used unless parameter 3 provides one.
        sense_index: Sense index given to every entry.

    Returns:
        The entries, in the order their spans are found.
    """
    # https://fr.wiktionary.org/wiki/Modèle:zh-l
    # Use a fresh list per call instead of one mutable default object
    # shared by every call that omits the argument.
    if raw_tags is None:
        raw_tags = []
    roman = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    new_sense = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
    if new_sense != "":
        sense = new_sense
    l_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        word = clean_node(wxr, None, span_tag)
        if word != "":
            l_data = Linkage(
                word=word,
                sense=sense,
                sense_index=sense_index,
                # Defensive copy: entries never share the caller's list.
                raw_tags=list(raw_tags),
                roman=roman,
            )
            translate_raw_tags(l_data)
            l_list.append(l_data)
    if len(l_list) == 2:
        # Two spans: the first is tagged traditional, the second simplified.
        l_list[0].tags.append("Traditional-Chinese")
        l_list[1].tags.append("Simplified-Chinese")
    return l_list