Coverage for src/wiktextract/extractor/fr/linkage.py: 95%
153 statements
coverage.py v7.11.3, created at 2025-11-14 08:49 +0000
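
# Extraction of linkage sections from French Wiktionary ("fr") pages.
# Section types are mapped to linkage categories and tags through
# LINKAGE_SECTIONS and LINKAGE_TAGS; "anagrammes" sections get special
# handling.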
import re

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from ..share import capture_text_in_parentheses
from .models import Form, Linkage, WordEntry
from .section_types import LINKAGE_SECTIONS, LINKAGE_TAGS
from .tags import translate_raw_tags
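

# Entry point for a linkage section. "anagrammes" sections are handled via
# the "voir anagrammes" template and the resulting anagrams are copied to
# every entry of the same language; all other section types are dispatched
# to extract_linkage_section() with the category and tags looked up in
# LINKAGE_SECTIONS / LINKAGE_TAGS.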
def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    section_type: str,
) -> None:
    if section_type == "anagrammes":
        for node in level_node.find_child(NodeKind.TEMPLATE):
            if node.template_name == "voir anagrammes":
                anagram_list = process_voir_anagrammes_template(wxr, node)
                for data in page_data:
                    if data.lang_code == page_data[-1].lang_code:
                        data.anagrams.extend(anagram_list)
    else:
        extract_linkage_section(
            wxr,
            page_data[-1],
            level_node,
            LINKAGE_SECTIONS[section_type],
            LINKAGE_TAGS.get(section_type, []),
        )
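

# Walk the direct children of the section node: "(" templates, bold/italic
# nodes and ";"/":" description lists update the current sense text and
# sense index, while ordinary list items are forwarded to
# extract_linkage_list_item().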
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
    section_tags: list[str] = [],
):
    sense_text = ""
    sense_index = 0
    for node in level_node.children:
        if isinstance(node, TemplateNode) and node.template_name == "(":
            new_sense_text = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
            if new_sense_text != "":
                sense_text = new_sense_text
            sense_index_text = node.template_parameters.get(2, "0")
            if (
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
        elif (
            isinstance(node, WikiNode)
            and node.kind in NodeKind.BOLD | NodeKind.ITALIC
        ):
            sense_text = clean_node(wxr, None, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # sense could also be in ";" description list
            if node.sarg in [";", ":"]:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    sense_text = clean_node(wxr, None, list_item.children)
                    index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"
                    m = re.search(index_pattern, sense_text)
                    if m is not None:
                        sense_text = re.sub(index_pattern, "", sense_text)
                        sense_index = int(m.group(1))
            else:
                for list_item in node.find_child(NodeKind.LIST_ITEM):
                    extract_linkage_list_item(
                        wxr,
                        word_entry,
                        list_item,
                        linkage_type,
                        section_tags,
                        sense_text,
                        sense_index,
                    )
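

# Parse one list item of a linkage section. Link templates ("l", "lien",
# "zh-lien", "zh-lien-t", "zh-l") and plain wiki links supply the word;
# italic and parenthesised text become sense, sense index or raw tags;
# text after "—" is stored as a translation. Separators such as "," or "/"
# start a new Linkage, so a single list item can yield several words, and
# nested lists are processed recursively.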
def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    section_tags: list[str],
    sense: str,
    sense_index: int,
):
    linkage_data = Linkage(
        word="", tags=section_tags, sense=sense, sense_index=sense_index
    )
    pending_tag = ""
    inside_bracket = False
    for index, child_node in enumerate(list_item.children):
        if isinstance(
            child_node, TemplateNode
        ) and child_node.template_name in [
            "l",
            "lien",
            "zh-lien",
            "zh-lien-t",
        ]:
            process_linkage_template(wxr, child_node, linkage_data)
        elif (
            isinstance(child_node, TemplateNode)
            and child_node.template_name == "zh-l"
        ):
            for l_data in extract_zh_l_template(
                wxr, child_node, section_tags, sense, sense_index
            ):
                add_linkage_data(word_entry, linkage_type, l_data)
        elif (
            isinstance(child_node, TemplateNode)
            and child_node.template_name == "cf"
        ):
            return
        elif (
            isinstance(child_node, WikiNode)
            and child_node.kind == NodeKind.LINK
            and not inside_bracket
        ):
            linkage_data.word = clean_node(wxr, None, child_node)
        elif (
            isinstance(child_node, WikiNode)
            and child_node.kind == NodeKind.ITALIC
        ):
            italic_text = clean_node(wxr, None, child_node).strip("()")
            if italic_text == "":
                continue
            elif len(list(list_item.filter_empty_str_child())) == 1:
                linkage_data.word = italic_text
            elif italic_text.isdecimal():
                linkage_data.sense_index = int(italic_text)
            elif inside_bracket:
                linkage_data.raw_tags.append(italic_text)
            else:
                linkage_data.sense = italic_text
        elif (
            isinstance(child_node, TemplateNode)
            and child_node.template_name == "réf"
        ) or (
            isinstance(child_node, WikiNode)
            and child_node.kind == NodeKind.LIST
        ):
            continue
        else:
            tag_text = (
                child_node
                if isinstance(child_node, str)
                else clean_node(wxr, word_entry, child_node)
            )
            if (
                tag_text.strip() in {",", "/", "(ou"}
                and linkage_data.word != ""
            ):
                # list item has more than one word
                add_linkage_data(word_entry, linkage_type, linkage_data)
                linkage_data = Linkage(
                    word="",
                    tags=section_tags,
                    sense=sense,
                    sense_index=sense_index,
                )
                continue
            if tag_text.strip().startswith(
                "("
            ) and not tag_text.strip().endswith(")"):
                pending_tag = tag_text
                inside_bracket = True
                continue
            elif not tag_text.strip().startswith(
                "("
            ) and tag_text.strip().endswith(")"):
                tag_text = pending_tag + tag_text
                pending_tag = ""
                inside_bracket = False
            elif len(pending_tag) > 0:
                pending_tag += tag_text
                continue

            if tag_text.strip().startswith("—"):
                linkage_data.translation = clean_node(
                    wxr,
                    None,
                    list(list_item.invert_find_child(NodeKind.LIST, True))[
                        index:
                    ],
                ).strip("— \n")
                break
            elif tag_text.strip().startswith(":"):
                sense_text = tag_text.strip().removeprefix(":").strip()
                linkage_data.sense = sense_text
            else:
                tags, _ = capture_text_in_parentheses(tag_text)
                for tag in tags:
                    if tag.isdecimal():
                        linkage_data.sense_index = int(tag)
                    else:
                        linkage_data.raw_tags.append(tag)

    if len(linkage_data.word) > 0:
        add_linkage_data(word_entry, linkage_type, linkage_data)
    for child_list in list_item.find_child(NodeKind.LIST):
        for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
            extract_linkage_list_item(
                wxr,
                word_entry,
                child_list_item,
                linkage_type,
                section_tags,
                sense,
                sense_index,
            )
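

# Store a finished Linkage on the word entry: the pseudo linkage type
# "forms" is converted to a Form object, every other type is appended to
# the WordEntry list attribute of the same name via getattr().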
def add_linkage_data(
    word_entry: WordEntry, l_type: str, l_data: Linkage
) -> None:
    if l_data.word == "":
        return
    translate_raw_tags(l_data)
    if l_type == "forms":
        word_entry.forms.append(
            Form(
                form=l_data.word,
                tags=l_data.tags,
                raw_tags=l_data.raw_tags,
                roman=l_data.roman,
                sense=l_data.sense,
                sense_index=l_data.sense_index,
            )
        )
    else:
        getattr(word_entry, l_type).append(l_data)
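

# Dispatch a link template to its handler: "lien"/"l" versus the "zh-lien"
# family.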
def process_linkage_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    if node.template_name in ["lien", "l"]:
        process_lien_template(wxr, node, linkage_data)
    elif node.template_name.startswith("zh-lien"):
        process_zh_lien_template(wxr, node, linkage_data)
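

# "lien"/"l": the displayed form ("dif" or the first positional parameter)
# may contain ruby annotations, which are split out; "tr" provides a
# romanization and "sens" a gloss stored as the translation.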
def process_lien_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    # link word template: https://fr.wiktionary.org/wiki/Modèle:lien
    ruby, without_ruby = extract_ruby(
        wxr,
        wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(
                node.template_parameters.get(
                    "dif", node.template_parameters.get(1)
                )
            ),
            expand_all=True,
        ),
    )
    linkage_data.word = clean_node(wxr, None, without_ruby)
    linkage_data.ruby = ruby
    linkage_data.roman = clean_node(
        wxr, None, node.template_parameters.get("tr", "")
    )
    linkage_data.translation = clean_node(
        wxr, None, node.template_parameters.get("sens", "")
    )
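

# "zh-lien"/"zh-lien-t": parameter 1 is the word, parameter 2 the pinyin
# romanization and parameter 3 the traditional form, stored as an
# alternative form.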
def process_zh_lien_template(
    wxr: WiktextractContext, node: TemplateNode, linkage_data: Linkage
) -> None:
    # https://fr.wiktionary.org/wiki/Modèle:zh-lien
    linkage_data.word = clean_node(wxr, None, node.template_parameters.get(1))
    linkage_data.roman = clean_node(
        wxr, None, node.template_parameters.get(2, "")
    )  # pinyin
    traditional_form = clean_node(
        wxr, None, node.template_parameters.get(3, "")
    )
    if len(traditional_form) > 0:
        linkage_data.alt = traditional_form
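

# Expand the "voir anagrammes" template and collect the linked words from
# the resulting list items.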
def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    # https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes
    results = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        for link_node in list_item.find_child(NodeKind.LINK):
            word = clean_node(wxr, None, link_node)
            if len(word) > 0:
                results.append(Linkage(word=word))
    return results
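

# "zh-l": parameter 2 is the romanization, parameter 3 an optional sense.
# The expanded template is scanned for <span lang="zh"> elements; when
# exactly two words are found, they are tagged as traditional and
# simplified Chinese respectively.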
def extract_zh_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    raw_tags: list[str] = [],
    sense: str = "",
    sense_index: int = 0,
) -> list[Linkage]:
    # https://fr.wiktionary.org/wiki/Modèle:zh-l
    roman = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    new_sense = clean_node(wxr, None, t_node.template_parameters.get(3, ""))
    if new_sense != "":
        sense = new_sense
    l_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        word = clean_node(wxr, None, span_tag)
        if word != "":
            l_data = Linkage(
                word=word,
                sense=sense,
                sense_index=sense_index,
                raw_tags=raw_tags,
                roman=roman,
            )
            translate_raw_tags(l_data)
            l_list.append(l_data)
    if len(l_list) == 2:
        for index, l_data in enumerate(l_list):
            if index == 0:
                l_data.tags.append("Traditional-Chinese")
            else:
                l_data.tags.append("Simplified-Chinese")
    return l_list