Coverage for src/wiktextract/extractor/vi/etymology.py: 33%
50 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1from wikitextprocessor import (
2 HTMLNode,
3 LevelNode,
4 NodeKind,
5 TemplateNode,
6 WikiNode,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from .models import Form, WordEntry
12from .tags import translate_raw_tags
def extract_etymology_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Gather etymology texts from a section and store them on `base_data`.

    The direct children of `level_node` are handled in order:

    * a template whose name ends with "-kanjitab" or equals "ja-kt" is
      handed to `extract_ja_kanjitab_template`;
    * a list node contributes one etymology text per non-empty list item;
    * a nested `LevelNode` stops the scan;
    * any other node is kept and cleaned together with the other leftover
      nodes once the scan is finished.

    Nothing is returned; results are appended to
    `base_data.etymology_texts`.
    """
    loose_nodes = []
    for child in level_node.children:
        is_kanjitab = isinstance(child, TemplateNode) and (
            child.template_name.endswith("-kanjitab")
            or child.template_name == "ja-kt"
        )
        if is_kanjitab:
            extract_ja_kanjitab_template(wxr, child, base_data)
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Each list item becomes its own etymology text.
            for item in child.find_child(NodeKind.LIST_ITEM):
                item_text = clean_node(wxr, base_data, item.children)
                if item_text == "":
                    continue
                base_data.etymology_texts.append(item_text)
            continue

        if isinstance(child, LevelNode):
            # A nested section ends the part of the section scanned here.
            break

        loose_nodes.append(child)

    if not loose_nodes:
        return

    # Leftover nodes are cleaned in one call, so they yield at most one
    # text, appended after any list item texts.
    joined_text = clean_node(wxr, base_data, loose_nodes)
    if joined_text != "":
        base_data.etymology_texts.append(joined_text)
def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordEntry
):
    """Extract alternative kanji forms from an expanded kanjitab template.

    The template is converted back to wikitext and re-parsed with
    `expand_all=True`. Only tables that have a header cell whose cleaned
    text is exactly "Cách viết khác" are used. Inside such a table every
    non-empty `<span>` in a table cell becomes a `Form` tagged
    "alternative" and "kanji", and a following non-empty `<small>` element
    (with surrounding parentheses stripped) is attached to the most recent
    form as a raw tag. The collected forms are appended to
    `base_data.forms`.

    Nothing is returned.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(template_wikitext, expand_all=True)

    for table in root.find_child(NodeKind.TABLE):
        # Clean every header cell of the table (no early exit) and then
        # check whether any of them marks the alternative-forms table.
        header_texts = [
            clean_node(wxr, None, header_cell)
            for table_row in table.find_child(NodeKind.TABLE_ROW)
            for header_cell in table_row.find_child(
                NodeKind.TABLE_HEADER_CELL
            )
        ]
        if "Cách viết khác" not in header_texts:
            continue

        alt_forms = []
        for table_row in table.find_child(NodeKind.TABLE_ROW):
            for cell in table_row.find_child(NodeKind.TABLE_CELL):
                for element in cell.children:
                    if not isinstance(element, HTMLNode):
                        continue
                    if element.tag == "span":
                        form_text = clean_node(wxr, None, element)
                        if form_text != "":
                            new_form = Form(
                                form=form_text, tags=["alternative", "kanji"]
                            )
                            alt_forms.append(new_form)
                    elif element.tag == "small":
                        qualifier = clean_node(wxr, None, element)
                        qualifier = qualifier.strip("()")
                        # A qualifier with no preceding form is dropped.
                        if qualifier != "" and len(alt_forms) > 0:
                            latest_form = alt_forms[-1]
                            latest_form.raw_tags.append(qualifier)
                            translate_raw_tags(latest_form)
        base_data.forms.extend(alt_forms)

    # Top-level links of the expansion are cleaned against base_data and the
    # result is discarded. NOTE(review): presumably this records category
    # links on base_data - confirm against clean_node.
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)