# Coverage for src/wiktextract/extractor/vi/linkage.py: 88% (180 statements)
# Report generated by coverage.py v7.10.6 at 2025-09-12 08:27 +0000
1import re
3from wikitextprocessor import (
4 HTMLNode,
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .models import Form, Linkage, WordEntry
14from .tags import translate_raw_tags
# Maps gloss-list linkage template names (and their aliases) to the
# WordEntry list-field name that should receive the extracted words.
GLOSS_LIST_LINKAGE_TEMPLATES = {
    "antonyms": "antonyms",
    "def-ant": "antonyms",
    "antonym": "antonyms",
    "coordinate terms": "coordinate_terms",
    "def-cot": "coordinate_terms",
    "def-coo": "coordinate_terms",
    "cot": "coordinate_terms",
    "holonyms": "holonyms",
    "holonym": "holonyms",
    "holo": "holonyms",
    "hypernyms": "hypernyms",
    "hyper": "hypernyms",
    "hyponyms": "hyponyms",
    "hypo": "hyponyms",
    "inline alt forms": "alt_forms",
    "alti": "alt_forms",
    "meronyms": "meronyms",
    "mero": "meronyms",
    "synonyms": "synonyms",
    "synonym": "synonyms",
    "def-syn": "synonyms",
    "synsee": "synonyms",
}

# Template names whose expanded text supplies qualifier labels (raw tags).
QUALIFIER_TEMPLATES = ["qualifier", "qual", "q", "qf", "i"]
# Backward-compatible alias: the original (misspelled) name is kept because
# other code in this module/package may still reference it.
QUALIFIER_TEMPALTES = QUALIFIER_TEMPLATES
def _save_linkage_group(
    word_entry: WordEntry, linkage_type: str, l_list: list[Linkage]
) -> None:
    # "alt_forms" data is stored as Form objects on `word_entry.forms`;
    # any other linkage type goes to the matching list attribute
    # (e.g. `word_entry.synonyms`).
    if linkage_type == "alt_forms":
        for l_data in l_list:
            word_entry.forms.append(
                Form(
                    form=l_data.word,
                    sense=l_data.sense,
                    tags=l_data.tags + ["alternative"],
                    raw_tags=l_data.raw_tags,
                    roman=l_data.roman,
                )
            )
    else:
        getattr(word_entry, linkage_type).extend(l_list)


def extract_gloss_list_linkage_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    sense: str,
):
    """Extract linkage words from an inline gloss-list template.

    The template is expanded to HTML; word spans are recognized by their
    ``lang`` attribute matching the template's first argument, while
    romanizations, glosses and qualifiers are recognized by span class.
    A bare "," text node separates groups of words, each group carrying
    its own qualifier raw tags.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # First template argument is the language code of the linked words.
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    l_list = []
    raw_tags = []
    for top_span_tag in expanded_node.find_html("span"):
        for node in top_span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "span":
                span_lang = node.attrs.get("lang", "")
                span_class = node.attrs.get("class", "").split()
                if span_lang == lang_code:
                    l_data = Linkage(
                        word=clean_node(wxr, None, node),
                        sense=sense,
                        raw_tags=raw_tags,
                    )
                    # Chinese words may appear in both scripts, marked by
                    # the "Hant"/"Hans" span classes.
                    if "Hant" in span_class:
                        l_data.tags.append("Traditional-Chinese")
                    elif "Hans" in span_class:
                        l_data.tags.append("Simplified-Chinese")
                    if l_data.word != "":
                        translate_raw_tags(l_data)
                        l_list.append(l_data)
                elif span_lang == f"{lang_code}-Latn" or "tr" in span_class:
                    # Romanization applies to all words gathered so far.
                    roman = clean_node(wxr, None, node)
                    for d in l_list:
                        d.roman = roman
                elif "mention-gloss" in span_class:
                    # An inline gloss overrides the sense for this group.
                    sense = clean_node(wxr, None, node)
                    for d in l_list:
                        d.sense = sense
                elif "qualifier-content" in span_class:
                    raw_tag_str = clean_node(wxr, None, node)
                    for raw_tag in raw_tag_str.split(","):
                        raw_tag = raw_tag.strip()
                        if raw_tag != "":
                            raw_tags.append(raw_tag)
            elif isinstance(node, str) and node.strip() == ",":
                # A bare comma ends the current word group: save it and
                # start a fresh group with fresh qualifier tags.
                _save_linkage_group(word_entry, linkage_type, l_list)
                l_list.clear()
                raw_tags.clear()
    # Save the trailing group (no comma after the last word).
    _save_linkage_group(word_entry, linkage_type, l_list)
def extract_alt_form_section(
    wxr: WiktextractContext,
    base_data: WordEntry,
    page_data: list[WordEntry],
    level_node: LevelNode,
):
    """Collect alternative forms from an "alternative forms" section.

    Forms come from "alter"/"def-alt" templates inside list items;
    qualifier templates in the same item contribute raw tags. The forms
    are attached to the last page entry when it matches the current
    language, otherwise to the shared base entry.
    """
    collected: list[Form] = []
    for wiki_list in level_node.find_child(NodeKind.LIST):
        for item in wiki_list.find_child(NodeKind.LIST_ITEM):
            item_raw_tags: list[str] = []
            for child in item.children:
                if not isinstance(child, TemplateNode):
                    continue
                if child.template_name in ("alter", "def-alt"):
                    collected.extend(
                        extract_alter_template(wxr, child, item_raw_tags)
                    )
                elif child.template_name in QUALIFIER_TEMPALTES:
                    item_raw_tags.extend(
                        extract_qualifier_template(wxr, child)
                    )

    if len(page_data) == 0 or page_data[-1].lang != base_data.lang:
        target = base_data
    else:
        target = page_data[-1]
    target.forms.extend(collected)
def extract_alter_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Form]:
    """Expand an "alter"/"def-alt" template and return its forms.

    Each span whose ``lang`` attribute matches the template's first
    argument yields one Form tagged "alternative", carrying the given
    qualifier raw tags.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    results: list[Form] = []
    for span_tag in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        text = clean_node(wxr, None, span_tag)
        if text == "":
            continue
        new_form = Form(form=text, tags=["alternative"], raw_tags=raw_tags)
        translate_raw_tags(new_form)
        results.append(new_form)
    return results
def extract_qualifier_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[str]:
    """Return the comma-separated qualifier labels from a qualifier template.

    The expanded template text is stripped of surrounding parentheses and
    split on commas; empty pieces are discarded.
    """
    text = clean_node(wxr, None, t_node).strip("()")
    return [piece.strip() for piece in text.split(",") if piece.strip() != ""]
def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
):
    """Extract a linkage section (synonyms, derived terms, idioms, ...).

    Idiom sections are handled specially and stored as "related" data.
    Other sections are read from column templates ("col"/"der"/"rel"
    with optional digit suffix, "columns", "column") or from plain lists.
    Data from a level-3 section is attached to every entry of the same
    language; otherwise it goes to the last entry only.
    """
    collected: list[Linkage] = []
    if linkage_type == "idioms":
        collected.extend(extract_idiom_section(wxr, level_node))
        linkage_type = "related"
    else:
        for child in level_node.children:
            if isinstance(child, TemplateNode) and (
                re.fullmatch(
                    r"(?:col|der|rel)(?:\d+)?", child.template_name
                )
                is not None
                or child.template_name in ["columns", "column"]
            ):
                collected.extend(extract_col_template(wxr, child))
            elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                for item in child.find_child(NodeKind.LIST_ITEM):
                    collected.extend(
                        extract_linkage_list_item(wxr, item)
                    )

    if level_node.kind == NodeKind.LEVEL3:
        # Language-level section: share with all entries of this language.
        for entry in page_data:
            if entry.lang_code == page_data[-1].lang_code:
                getattr(entry, linkage_type).extend(collected)
    elif len(page_data) > 0:
        getattr(page_data[-1], linkage_type).extend(collected)
def extract_col_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Linkage]:
    """Extract linkage words from a column template ("col", "der3", ...).

    The template is expanded to an HTML list; each ``li`` element may
    contain the word, an optional romanization span, an optional second
    script form (Chinese), and an optional translation in curly quotes.
    """
    l_list = []
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # First template argument is the language code of the listed words.
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for li_tag in expanded_template.find_html_recursively("li"):
        first_word = True
        translation = ""
        for node in li_tag.children:
            if isinstance(node, str):
                # Text wrapped in curly quotes is a translation; if several
                # matches occur, the last one wins.
                m = re.search(r"“(.+)”", node)
                if m is not None:
                    translation = m.group(1).strip()
        for span_tag in li_tag.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            span_class = span_tag.attrs.get("class", "")
            if span_lang.endswith("-Latn") and len(l_list) > 0:
                # Romanization span: attach to the most recent word.
                l_list[-1].roman = clean_node(wxr, None, span_tag)
            elif span_lang == lang_code:
                if lang_code == "zh":
                    # Chinese lists a word per script, marked by the
                    # "Hant"/"Hans" span classes; each becomes its own entry.
                    l_data = Linkage(word=clean_node(wxr, None, span_tag))
                    if "Hant" in span_class:
                        l_data.tags.append("Traditional-Chinese")
                    elif "Hans" in span_class:
                        l_data.tags.append("Simplified-Chinese")
                    l_list.append(l_data)
                elif not first_word:
                    # A second word span in the same list item is treated as
                    # an alternative written form of the previous word.
                    l_list[-1].other = clean_node(wxr, None, span_tag)
                else:
                    l_list.append(
                        Linkage(
                            word=clean_node(wxr, None, span_tag),
                            translation=translation,
                        )
                    )
                    first_word = False

    return l_list
def extract_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
    """Extract linkage words from one list item.

    A "sense"/"s" template sets the sense for the following words, which
    come from "l"/"link" templates or plain wiki links.
    """
    results: list[Linkage] = []
    current_sense = ""
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            if child.template_name in ("sense", "s"):
                current_sense = clean_node(wxr, None, child).strip("(): ")
            elif child.template_name in ("l", "link"):
                results.extend(
                    extract_link_template(wxr, child, current_sense)
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            link_text = clean_node(wxr, None, child)
            if link_text != "":
                results.append(
                    Linkage(word=link_text, sense=current_sense)
                )
    return results
def extract_link_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Expand an "l"/"link" template and return its linkage words.

    Spans whose ``lang`` attribute matches the template's first argument
    each yield one Linkage with the given sense.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    return [
        Linkage(word=clean_node(wxr, None, span_tag), sense=sense)
        for span_tag in expanded.find_html("span")
        if span_tag.attrs.get("lang", "") == lang_code
    ]
def extract_idiom_section(
    wxr: WiktextractContext, level_node: LevelNode
) -> list[Linkage]:
    """Extract all idioms from an idiom section's lists."""
    results: list[Linkage] = []
    for wiki_list in level_node.find_child(NodeKind.LIST):
        for item in wiki_list.find_child(NodeKind.LIST_ITEM):
            results.extend(extract_idiom_list_item(wxr, item))

    return results
def extract_idiom_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
    """Extract idioms from one list item.

    Bold nodes are the idiom words (tagged "idiomatic"). Definitions come
    either from a child list (appended to ``senses`` of the latest idiom)
    or from the text following the last bold node (stored as ``sense``).
    """
    results: list[Linkage] = []
    last_bold_index = 0
    trailing_nodes = []
    for position, child in enumerate(list_item.children):
        if isinstance(child, WikiNode) and child.kind == NodeKind.BOLD:
            bold_text = clean_node(wxr, None, child)
            if bold_text != "":
                last_bold_index = position
                results.append(
                    Linkage(word=bold_text, tags=["idiomatic"])
                )
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                sub_sense = clean_node(wxr, None, sub_item.children)
                if sub_sense != "" and len(results) > 0:
                    results[-1].senses.append(sub_sense)
        elif position > last_bold_index:
            # Inline definition text after the (last) bold idiom word.
            trailing_nodes.append(child)

    inline_sense = clean_node(wxr, None, trailing_nodes).strip(": ")
    if inline_sense != "" and len(results) > 0:
        results[-1].sense = inline_sense

    return results