Coverage for src/wiktextract/extractor/fr/linkage.py: 95%
132 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
3from wikitextprocessor import NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from ..share import capture_text_in_parentheses
8from .models import Linkage, WordEntry
9from .section_types import LINKAGE_SECTIONS
10from .tags import translate_raw_tags
def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    section_type: str,
) -> None:
    """Dispatch a linkage section to the extractor matching its type.

    - ``"dérivés autres langues"`` sections are handled by
      ``process_derives_autres_list``.
    - ``"anagrammes"`` sections are scanned for ``voir anagrammes``
      templates; the anagrams each one yields are added to every entry of
      ``page_data`` whose ``lang_code`` equals that of the last entry.
    - Any other section type is looked up in ``LINKAGE_SECTIONS`` and
      handled by ``process_linkage_list``.
    """
    if section_type == "dérivés autres langues":
        process_derives_autres_list(wxr, page_data, level_node)
        return

    if section_type != "anagrammes":
        linkage_type = LINKAGE_SECTIONS[section_type]
        process_linkage_list(wxr, page_data, level_node, linkage_type)
        return

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name != "voir anagrammes":
            continue
        anagrams = process_voir_anagrammes_template(wxr, template)
        for entry in page_data:
            # Only entries in the same language as the latest entry
            # receive the anagram list.
            if entry.lang_code == page_data[-1].lang_code:
                entry.anagrams.extend(anagrams)
def process_derives_autres_list(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
) -> None:
    """Extract words derived into other languages.

    Walks every list item below ``level_node``. Inside an item, an ``L``
    template sets the language code and name that apply to the link nodes
    and link templates that follow it in the same item. Each such link is
    appended to ``page_data[-1].derived`` as a ``Linkage``.
    """
    link_templates = ("l", "lien", "zh-lien", "zh-lien-t")
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        # Language state is reset for every list item.
        lang_code = ""
        lang_name = ""
        for node in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
            if isinstance(node, TemplateNode) and node.template_name == "L":
                # Normalise the first argument to text and fall back to ""
                # so a missing argument never yields a None language code.
                lang_code = clean_node(
                    wxr, None, node.template_parameters.get(1, "")
                )
                lang_name = clean_node(wxr, None, node)
            elif node.kind == NodeKind.LINK:
                word = clean_node(wxr, None, node)
                page_data[-1].derived.append(
                    Linkage(lang_code=lang_code, lang=lang_name, word=word)
                )
            elif (
                isinstance(node, TemplateNode)
                and node.template_name in link_templates
            ):
                linkage_data = Linkage(
                    lang_code=lang_code, lang=lang_name, word=""
                )
                process_linkage_template(wxr, node, linkage_data)
                page_data[-1].derived.append(linkage_data)
def process_linkage_list(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    linkage_type: str,
) -> None:
    """Extract linkage entries from the lists below ``level_node``.

    Every list item and template found recursively under ``level_node`` is
    visited in order:

    - a ``(`` template (list table start) sets the sense text and sense
      index that apply to the entries that follow;
    - a ``;`` or ``:`` list item does the same, with an optional trailing
      ``(N)`` / ``(sens N)`` giving the sense index;
    - any other node is scanned child by child to build one or more
      ``Linkage`` objects, which are appended to the ``linkage_type``
      attribute of ``page_data[-1]``.

    Entries whose word is still empty at the end of an item are dropped.
    """
    linkage_templates = ("l", "lien", "zh-lien", "zh-lien-t")
    index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"
    sense_text = ""
    sense_index = 0
    for template_or_list_node in level_node.find_child_recursively(
        NodeKind.LIST_ITEM | NodeKind.TEMPLATE
    ):
        # list table start template: https://fr.wiktionary.org/wiki/Modèle:(
        if (
            isinstance(template_or_list_node, TemplateNode)
            and template_or_list_node.template_name == "("
        ):
            sense_text = clean_node(
                wxr, None, template_or_list_node.template_parameters.get(1, "")
            )
            sense_index_text = template_or_list_node.template_parameters.get(
                2, "0"
            )
            if (
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
            continue
        # sense could also be in ";" description list
        if (
            template_or_list_node.kind == NodeKind.LIST_ITEM
            and template_or_list_node.sarg in {";", ":"}
        ):
            sense_text = clean_node(wxr, None, template_or_list_node.children)
            m = re.search(index_pattern, sense_text)
            if m is not None:
                sense_text = re.sub(index_pattern, "", sense_text)
                sense_index = int(m.group(1))
            continue

        linkage_data = Linkage(word="")
        if len(sense_text) > 0:
            linkage_data.sense = sense_text
        if sense_index != 0:
            linkage_data.sense_index = sense_index
        # Text of a parenthesised tag that spans several child nodes.
        pending_tag = ""
        inside_bracket = False
        # Children without nested lists; materialised once so the "—"
        # branch below can slice it instead of rebuilding the list.
        child_nodes = list(
            template_or_list_node.invert_find_child(NodeKind.LIST, True)
        )
        for index, child_node in enumerate(child_nodes):
            if (
                isinstance(child_node, TemplateNode)
                and child_node.template_name in linkage_templates
            ):
                process_linkage_template(wxr, child_node, linkage_data)
            elif (
                isinstance(child_node, WikiNode)
                and child_node.kind == NodeKind.LINK
                and not inside_bracket
            ):
                linkage_data.word = clean_node(wxr, None, child_node)
            elif (
                isinstance(child_node, WikiNode)
                and child_node.kind == NodeKind.ITALIC
            ):
                current_sense = clean_node(wxr, None, child_node).strip("()")
                if (
                    len(list(template_or_list_node.filter_empty_str_child()))
                    == 1
                ):
                    # The italic node is the only non-empty child, so it
                    # is taken as the word itself.
                    linkage_data.word = current_sense
                elif current_sense.isdecimal():
                    linkage_data.sense_index = int(current_sense)
                else:
                    linkage_data.sense = current_sense
            elif (
                isinstance(child_node, TemplateNode)
                and child_node.template_name == "réf"
            ):
                continue
            else:
                tag_text = (
                    child_node
                    if isinstance(child_node, str)
                    else clean_node(wxr, page_data[-1], child_node)
                )
                stripped_text = tag_text.strip()
                if (
                    stripped_text in {",", "/", "(ou"}
                    and linkage_data.word != ""
                ):
                    # list item has more than one word: store the current
                    # one the same way as the last word of the item
                    # (raw tags translated) and start a fresh entry
                    pre_data = getattr(page_data[-1], linkage_type)
                    translate_raw_tags(linkage_data)
                    pre_data.append(linkage_data)
                    linkage_data = Linkage(word="")
                    continue
                starts_open = stripped_text.startswith("(")
                ends_closed = stripped_text.endswith(")")
                if starts_open and not ends_closed:
                    # opening half of a tag split over several nodes
                    pending_tag = tag_text
                    inside_bracket = True
                    continue
                elif not starts_open and ends_closed:
                    # closing half: join it with the buffered text
                    tag_text = pending_tag + tag_text
                    pending_tag = ""
                    inside_bracket = False
                elif len(pending_tag) > 0:
                    pending_tag += tag_text
                    continue

                if tag_text.strip().startswith("—"):
                    # everything from the dash onwards is the translation
                    linkage_data.translation = clean_node(
                        wxr, None, child_nodes[index:]
                    ).strip("— ")
                    break
                elif tag_text.strip().startswith(":"):
                    sense_text = tag_text.strip().removeprefix(":").strip()
                    linkage_data.sense = sense_text
                else:
                    tags, _ = capture_text_in_parentheses(tag_text)
                    for tag in tags:
                        if tag.isdecimal():
                            linkage_data.sense_index = int(tag)
                        else:
                            linkage_data.raw_tags.append(tag)

        if len(linkage_data.word) > 0:
            pre_data = getattr(page_data[-1], linkage_type)
            translate_raw_tags(linkage_data)
            pre_data.append(linkage_data)
def process_linkage_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Route a link template to the handler for its template family.

    ``lien`` and ``l`` go to ``process_lien_template``; any template whose
    name starts with ``zh-lien`` goes to ``process_zh_lien_template``.
    Other templates leave ``linkage_data`` untouched.
    """
    name = node.template_name
    if name in ("lien", "l"):
        handler = process_lien_template
    elif name.startswith("zh-lien"):
        handler = process_zh_lien_template
    else:
        return
    handler(wxr, node, linkage_data)
def process_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill ``linkage_data`` from a ``lien`` / ``l`` link template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:lien

    - ``word`` is taken from the ``dif`` argument when present, otherwise
      from the first positional argument.
    - ``roman`` is set from ``tr`` when that argument is given.
    - ``translation`` is set from ``sens`` when that argument is given.
    """
    params = node.template_parameters
    # Default to "" so clean_node is never handed None when the template
    # has neither "dif" nor a first positional argument.
    linkage_data.word = clean_node(
        wxr, None, params.get("dif", params.get(1, ""))
    )
    if "tr" in params:
        linkage_data.roman = clean_node(wxr, None, params.get("tr"))
    if "sens" in params:
        linkage_data.translation = clean_node(wxr, None, params.get("sens"))
def process_zh_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill ``linkage_data`` from a ``zh-lien`` family template.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:zh-lien

    Positional argument 1 becomes ``word``, argument 2 (pinyin) becomes
    ``roman``, and a non-empty argument 3 (traditional form) becomes
    ``alt``.
    """
    params = node.template_parameters
    # All three arguments default to "" so a missing one is treated as
    # empty text rather than passed to clean_node as None.
    linkage_data.word = clean_node(wxr, None, params.get(1, ""))
    linkage_data.roman = clean_node(wxr, None, params.get(2, ""))  # pinyin
    traditional_form = clean_node(wxr, None, params.get(3, ""))
    if len(traditional_form) > 0:
        linkage_data.alt = traditional_form
def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    """Return the anagrams listed by a ``voir anagrammes`` template.

    Template documentation:
    https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes

    The template is turned back into wikitext and re-parsed with full
    expansion. Every link found directly inside a list item of the result
    becomes a ``Linkage``; links whose cleaned text is empty are skipped.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    return [
        Linkage(word=anagram)
        for item in expanded_root.find_child_recursively(NodeKind.LIST_ITEM)
        for link in item.find_child(NodeKind.LINK)
        if len(anagram := clean_node(wxr, None, link)) > 0
    ]