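"""Extract linkage data from the Vietnamese edition of Wiktionary.

Covers linkage templates inside gloss lists, "alternative forms"
sections, and dedicated linkage sections (synonyms, antonyms, derived
terms, idioms, etc.).
"""
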
import re
from itertools import count

from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Form, Linkage, WordEntry
from .tags import translate_raw_tags

GLOSS_LIST_LINKAGE_TEMPLATES = {
    "antonyms": "antonyms",
    "def-ant": "antonyms",
    "antonym": "antonyms",
    "coordinate terms": "coordinate_terms",
    "def-cot": "coordinate_terms",
    "def-coo": "coordinate_terms",
    "cot": "coordinate_terms",
    "holonyms": "holonyms",
    "holonym": "holonyms",
    "holo": "holonyms",
    "hypernyms": "hypernyms",
    "hyper": "hypernyms",
    "hyponyms": "hyponyms",
    "hypo": "hyponyms",
    "inline alt forms": "alt_forms",
    "alti": "alt_forms",
    "meronyms": "meronyms",
    "mero": "meronyms",
    "synonyms": "synonyms",
    "synonym": "synonyms",
    "def-syn": "synonyms",
    "synsee": "synonyms",
}

QUALIFIER_TEMPLATES = ["qualifier", "qual", "q", "qf", "i"]


def extract_gloss_list_linkage_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    sense: str,
):
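    """Extract a linkage template placed inside a gloss list.

    Words come from language-tagged "<span>" nodes in the expanded
    template; romanization, gloss, and qualifier spans update the items
    collected so far, and a bare "," between spans flushes the batch
    into `word_entry`. For "alt_forms", items are saved as `Form`
    objects instead of linkages.
    """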
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    l_list = []
    raw_tags = []
    for top_span_tag in expanded_node.find_html("span"):
        for node in top_span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "span":
                span_lang = node.attrs.get("lang", "")
                span_class = node.attrs.get("class", "").split()
                if span_lang == lang_code:
                    l_data = Linkage(
                        word=clean_node(wxr, None, node),
                        sense=sense,
                        raw_tags=raw_tags,
                    )
                    if "Hant" in span_class:
                        l_data.tags.append("Traditional-Chinese")
                    elif "Hans" in span_class:
                        l_data.tags.append("Simplified-Chinese")
                    if l_data.word != "":
                        translate_raw_tags(l_data)
                        l_list.append(l_data)
                elif span_lang == f"{lang_code}-Latn" or "tr" in span_class:
                    roman = clean_node(wxr, None, node)
                    for d in l_list:
                        d.roman = roman
                elif "mention-gloss" in span_class:
                    sense = clean_node(wxr, None, node)
                    for d in l_list:
                        d.sense = sense
                elif "qualifier-content" in span_class:
                    raw_tag_str = clean_node(wxr, None, node)
                    for raw_tag in raw_tag_str.split(","):
                        raw_tag = raw_tag.strip()
                        if raw_tag != "":
                            raw_tags.append(raw_tag)
            elif isinstance(node, str) and node.strip() == ",":
                if linkage_type == "alt_forms":
                    for l_data in l_list:
                        word_entry.forms.append(
                            Form(
                                form=l_data.word,
                                sense=l_data.sense,
                                tags=l_data.tags + ["alternative"],
                                raw_tags=l_data.raw_tags,
                                roman=l_data.roman,
                            )
                        )
                else:
                    getattr(word_entry, linkage_type).extend(l_list)
                l_list.clear()
                raw_tags.clear()

    if linkage_type == "alt_forms":
        for l_data in l_list:
            word_entry.forms.append(
                Form(
                    form=l_data.word,
                    sense=l_data.sense,
                    tags=l_data.tags + ["alternative"],
                    raw_tags=l_data.raw_tags,
                    roman=l_data.roman,
                )
            )
    else:
        getattr(word_entry, linkage_type).extend(l_list)


def extract_alt_form_section(
    wxr: WiktextractContext,
    base_data: WordEntry,
    page_data: list[WordEntry],
    level_node: LevelNode,
):
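    """Extract an "alternative forms" section.

    Each list item may contain {{alter}}/{{def-alt}} templates and
    qualifier templates; forms go to the last entry in `page_data` when
    it matches the current language, otherwise to `base_data`.
    """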
    forms = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            raw_tags = []
            for node in list_item.children:
                if isinstance(node, TemplateNode) and node.template_name in [
                    "alter",
                    "def-alt",
                ]:
                    forms.extend(extract_alter_template(wxr, node, raw_tags))
                elif (
                    isinstance(node, TemplateNode)
                    and node.template_name in QUALIFIER_TEMPLATES
                ):
                    raw_tags.extend(extract_qualifier_template(wxr, node))

    if len(page_data) == 0 or page_data[-1].lang != base_data.lang:
        base_data.forms.extend(forms)
    else:
        page_data[-1].forms.extend(forms)


def extract_alter_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Form]:
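    """Extract `Form` objects from an expanded {{alter}} template."""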
    forms = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        word = clean_node(wxr, None, span_tag)
        if word != "":
            form = Form(form=word, tags=["alternative"], raw_tags=raw_tags)
            translate_raw_tags(form)
            forms.append(form)
    return forms


def extract_qualifier_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[str]:
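    """Split qualifier template text like "(tag1, tag2)" into raw tags."""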
    raw_tags = []
    for raw_tag in clean_node(wxr, None, t_node).strip("()").split(","):
        raw_tag = raw_tag.strip()
        if raw_tag != "":
            raw_tags.append(raw_tag)
    return raw_tags


def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
):
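    """Extract a linkage section (synonyms, derived terms, etc.).

    Handles column templates ({{col}}, {{der3}}, {{rel2}}, ...),
    {{der-top}} sense headers, {{zh-dial}} tables, and plain wiki lists;
    idiom items are stored in the `related` field tagged "idiomatic". A
    level-3 section is copied to every entry that shares the language
    code of the last entry; otherwise results go to the last entry only.
    """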
    l_list = []
    sense = ""
    for node in level_node.children:
        if isinstance(node, TemplateNode) and (
            re.fullmatch(r"(?:col|der|rel)(?:\d+)?", node.template_name)
            or node.template_name in ["columns", "column"]
        ):
            l_list.extend(extract_col_template(wxr, node))
        elif isinstance(node, TemplateNode) and node.template_name.startswith(
            "der-top"
        ):
            sense = clean_node(wxr, None, node.template_parameters.get(1, ""))
        elif isinstance(node, TemplateNode) and node.template_name in [
            "zh-dial",
            "zho-dial",
        ]:
            l_list.extend(extract_zh_dial_template(wxr, node, sense))
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                l_list.extend(
                    extract_idiom_list_item(wxr, list_item)
                    if linkage_type == "idioms"
                    and list_item.contain_node(NodeKind.BOLD)
                    else extract_linkage_list_item(wxr, list_item, sense)
                )
    if linkage_type == "idioms":
        linkage_type = "related"
    if level_node.kind == NodeKind.LEVEL3:
        for data in page_data:
            if data.lang_code == page_data[-1].lang_code:
                getattr(data, linkage_type).extend(l_list)
                for l_data in l_list:
                    data.categories.extend(l_data.categories)
    elif len(page_data) > 0:
        getattr(page_data[-1], linkage_type).extend(l_list)
        for l_data in l_list:
            page_data[-1].categories.extend(l_data.categories)


def extract_col_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Linkage]:
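    """Extract linkages from an expanded column template.

    Each `<li>` holds a word span, an optional romanization span, and an
    optional translation in curly quotes; for Chinese, traditional and
    simplified spans become separate items.
    """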
    l_list = []
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for li_tag in expanded_template.find_html_recursively("li"):
        first_word = True
        translation = ""
        for node in li_tag.children:
            if isinstance(node, str):
                m = re.search(r"“(.+)”", node)
                if m is not None:
                    translation = m.group(1).strip()
        for span_tag in li_tag.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            span_class = span_tag.attrs.get("class", "")
            if span_lang.endswith("-Latn") and len(l_list) > 0:
                l_list[-1].roman = clean_node(wxr, None, span_tag)
            elif span_lang == lang_code:
                if lang_code == "zh":
                    l_data = Linkage(word=clean_node(wxr, None, span_tag))
                    if "Hant" in span_class:
                        l_data.tags.append("Traditional-Chinese")
                    elif "Hans" in span_class:
                        l_data.tags.append("Simplified-Chinese")
                    l_list.append(l_data)
                elif not first_word:
                    l_list[-1].other = clean_node(wxr, None, span_tag)
                else:
                    l_list.append(
                        Linkage(
                            word=clean_node(wxr, None, span_tag),
                            translation=translation,
                        )
                    )
                first_word = False

    return l_list


def extract_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> list[Linkage]:
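    """Extract linkages from one list item in a linkage section.

    Dispatches on the templates found ({{sense}}, {{l}}, {{qualifier}},
    {{zh-l}}, {{ja-r}}, {{vi-l}}, anagram templates) and on plain wiki
    links; text after a "-" becomes the sense of the last word.
    """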
    l_list = []
    raw_tags = []
    for index, node in enumerate(list_item.children):
        if isinstance(node, TemplateNode):
            if node.template_name in ["sense", "s"]:
                sense = clean_node(wxr, None, node).strip("(): ")
            elif node.template_name in ["l", "link"]:
                l_list.extend(extract_link_template(wxr, node, sense))
            elif node.template_name in ["qualifier", "qual"]:
                raw_tags.append(clean_node(wxr, None, node).strip("()"))
            elif node.template_name in ["zh-l", "zho-l"]:
                l_list.extend(extract_zh_l_template(wxr, node, sense, raw_tags))
                raw_tags.clear()
            elif node.template_name in ["ja-r", "jpn-r"]:
                l_list.append(extract_ja_r_template(wxr, node, sense, raw_tags))
                raw_tags.clear()
            elif node.template_name in ["vi-l", "vie-l"]:
                l_list.append(extract_vi_l_template(wxr, node, sense, raw_tags))
                raw_tags.clear()
            elif node.template_name in ["anagrams", "Anagrams", "đảo chữ"]:
                l_list.extend(
                    extract_anagrams_template(wxr, node, sense, raw_tags)
                )
                raw_tags.clear()
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if word != "":
                l_data = Linkage(word=word, sense=sense, raw_tags=raw_tags)
                translate_raw_tags(l_data)
                l_list.append(l_data)
        elif (
            isinstance(node, str)
            and node.strip().startswith("-")
            and len(l_list) > 0
        ):
            l_list[-1].sense = clean_node(
                wxr, None, list_item.children[index:]
            ).strip("- \n")
            break
    if len(raw_tags) > 0 and len(l_list) > 0:
        l_list[-1].raw_tags.extend(raw_tags)
        translate_raw_tags(l_list[-1])
    return l_list


def extract_link_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str
) -> list[Linkage]:
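    """Extract linkages from an expanded {{l}}/{{link}} template."""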
    l_list = []
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_template.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang == lang_code:
            l_list.append(
                Linkage(word=clean_node(wxr, None, span_tag), sense=sense)
            )

    return l_list


def extract_idiom_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
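    """Extract idioms from a list item whose idiom text is in bold.

    Definitions come from a child list, or from the plain text that
    follows the bold node.
    """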
    l_list = []
    bold_index = 0
    sense_nodes = []
    for index, node in enumerate(list_item.children):
        if isinstance(node, WikiNode) and node.kind == NodeKind.BOLD:
            word = clean_node(wxr, None, node)
            if word != "":
                bold_index = index
                l_list.append(Linkage(word=word, tags=["idiomatic"]))
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for child_list_item in node.find_child(NodeKind.LIST_ITEM):
                sense = clean_node(wxr, None, child_list_item.children)
                if sense != "" and len(l_list) > 0:
                    l_list[-1].senses.append(sense)
        elif index > bold_index:
            sense_nodes.append(node)

    sense = clean_node(wxr, None, sense_nodes).strip(": ")
    if sense != "" and len(l_list) > 0:
        l_list[-1].sense = sense

    return l_list


def extract_zh_l_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: list[str] = [],
) -> list[Linkage]:
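    """Extract an expanded {{zh-l}} template.

    May yield both traditional and simplified Chinese forms that share
    one romanization.
    """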
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    roman = ""
    linkage_list = []
    for i_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="Latn"
    ):
        roman = clean_node(wxr, None, i_tag)
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        linkage_data = Linkage(
            sense=sense,
            raw_tags=raw_tags,
            roman=roman,
            word=clean_node(wxr, None, span_tag),
        )
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "zh-Hant":
            linkage_data.tags.append("Traditional-Chinese")
        elif lang_attr == "zh-Hans":
            linkage_data.tags.append("Simplified-Chinese")
        if len(linkage_data.word) > 0 and linkage_data.word != "/":
            translate_raw_tags(linkage_data)
            linkage_list.append(linkage_data)
    return linkage_list


def extract_ja_r_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: list[str] = [],
) -> Linkage:
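    """Extract an expanded {{ja-r}} template: a word with ruby
    annotations, plus optional romanization and gloss spans.
    """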
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    linkage_data = Linkage(word="", sense=sense, raw_tags=raw_tags)
    for span_node in expanded_node.find_html("span"):
        span_class = span_node.attrs.get("class", "").split()
        if "lang" in span_node.attrs:
            ruby_data, no_ruby_nodes = extract_ruby(wxr, span_node)
            linkage_data.word = clean_node(wxr, None, no_ruby_nodes)
            linkage_data.ruby = ruby_data
        elif "tr" in span_class:
            linkage_data.roman = clean_node(wxr, None, span_node)
        elif "mention-gloss" in span_class:
            linkage_data.sense = clean_node(wxr, None, span_node)

    translate_raw_tags(linkage_data)
    return linkage_data


def extract_vi_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str],
) -> Linkage:
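    """Extract an expanded {{vi-l}} template: the word from the "vi"
    span, `roman` from a "vi-Latn" span, and categories from bare links.
    """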
    l_data = Linkage(word="", sense=sense, raw_tags=raw_tags)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        match span_lang:
            case "vi":
                l_data.word = clean_node(wxr, None, span_tag)
            case "vi-Latn":
                l_data.roman = clean_node(wxr, None, span_tag)
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, l_data, link_node)
    return l_data


def extract_anagrams_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str],
) -> list[Linkage]:
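    """Extract anagram words from the numbered arguments (2, 3, ...)."""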
    l_list = []
    for arg_index in count(2):
        if arg_index not in t_node.template_parameters:
            break
        word = clean_node(wxr, None, t_node.template_parameters[arg_index])
        if word != "":
            l_data = Linkage(word=word, sense=sense, raw_tags=raw_tags)
            translate_raw_tags(l_data)
            l_list.append(l_data)

    return l_list


def extract_zh_dial_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
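    """Extract a {{zh-dial}} dialect synonym table.

    "Ghi chú" ("Notes") rows map footnote symbols to note text; header
    cells give dialect group tags, link cells give region tags, and
    small-font spans attach the mapped notes to the previous word.
    """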
    from .sound import split_zh_pron_raw_tag

    linkage_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        is_note_row = False
        note_tags = {}
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    is_note_row = clean_node(wxr, None, cell_node) == "Ghi chú"
                elif is_note_row:
                    for note_str in clean_node(wxr, None, cell_node).split(";"):
                        if "-" in note_str:
                            note_symbol, note = note_str.split("-", maxsplit=1)
                            note_symbol = note_symbol.strip()
                            note = note.strip()
                            if note_symbol != "" and note != "":
                                note_tags[note_symbol] = note
        lang_tags = []
        region_tags = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                lang_tags = split_zh_pron_raw_tag(
                    clean_node(wxr, None, header_node)
                )
            if lang_tags == ["Ghi chú"]:  # skip last note row
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                for link_node in cell_node.find_child(NodeKind.LINK):
                    region_tags = split_zh_pron_raw_tag(
                        clean_node(wxr, None, link_node)
                    )
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        l_data = Linkage(word=span_text, sense=sense)
                        if len(lang_tags) > 0:
                            l_data.raw_tags.extend(lang_tags)
                        if len(region_tags) > 0:
                            l_data.raw_tags.extend(region_tags)
                        translate_raw_tags(l_data)
                        linkage_list.append(l_data)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and len(linkage_list) > 0
                    ):
                        for note_symbol in span_text.split(","):
                            note_symbol = note_symbol.strip()
                            raw_tag = note_symbol
                            if note_symbol in note_tags:
                                raw_tag = note_tags[note_symbol]
                            if raw_tag != "":
                                linkage_list[-1].raw_tags.append(raw_tag)
                                translate_raw_tags(linkage_list[-1])

    return linkage_list