Coverage for src/wiktextract/extractor/zh/etymology.py: 50%
69 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1from wikitextprocessor import (
2 HTMLNode,
3 LevelNode,
4 NodeKind,
5 TemplateNode,
6 WikiNode,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from .models import Example, Form, WordEntry
12from .tags import translate_raw_tags
def extract_etymology_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
):
    """Fill ``base_data`` with etymology texts and examples from a section.

    The direct children of ``level_node`` are visited in order:

    * ``zh-x`` / ``zh-q`` templates are passed to ``extract_template_zh_x``
      and the results are stored in ``base_data.etymology_examples``.
    * A fixed set of templates (compared case-insensitively) is skipped.
    * For a list node, any ``zh-x`` / ``zh-q`` template found anywhere below
      it is treated as above; when the list holds none, the cleaned text of
      each list item is appended to ``base_data.etymology_texts``.
    * ``ja-see`` / ``ja-see-kango`` / ``zh-see`` templates append a deep copy
      of ``base_data`` to ``page_data`` and hand that copy to
      ``process_soft_redirect_template``.
    * ``*-kanjitab`` / ``ja-kt`` templates are handled by
      ``extract_ja_kanjitab_template``.
    * The first ``LevelNode`` child ends the scan.

    Every other child is collected, and the collected nodes are cleaned
    together into one string that is appended to
    ``base_data.etymology_texts`` when it is not empty.
    """
    # Kept as a function-scope import, as in the original code
    # (presumably to avoid an import cycle -- confirm before moving it).
    from .example import extract_template_zh_x

    example_templates = ("zh-x", "zh-q")
    ignored_templates = (
        "rfe",  # missing etymology
        "zh-forms",
        "zh-wp",
        "wp",
        "wikipedia",
    )
    redirect_templates = ("ja-see", "ja-see-kango", "zh-see")

    def add_examples(example_node: TemplateNode) -> None:
        # Store the extracted examples, then run clean_node on the template
        # with base_data as its data argument.
        base_data.etymology_examples.extend(
            extract_template_zh_x(wxr, example_node, Example(text=""))
        )
        clean_node(wxr, base_data, example_node)

    pending_nodes = []
    for child in level_node.children:
        is_template = isinstance(child, TemplateNode)
        t_name = child.template_name if is_template else ""

        if is_template and t_name in example_templates:
            add_examples(child)
            continue

        if is_template and t_name.lower() in ignored_templates:
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            found_example = False
            for nested in child.find_child_recursively(NodeKind.TEMPLATE):
                if nested.template_name not in example_templates:
                    continue
                found_example = True
                add_examples(nested)
            if found_example:
                continue
            # No example template in the list: each item is an etymology text.
            for item in child.find_child(NodeKind.LIST_ITEM):
                item_text = clean_node(wxr, None, item.children)
                if item_text:
                    base_data.etymology_texts.append(item_text)
            continue

        if is_template and t_name in redirect_templates:
            from .page import process_soft_redirect_template

            redirect_entry = base_data.model_copy(deep=True)
            page_data.append(redirect_entry)
            process_soft_redirect_template(wxr, child, redirect_entry)
            continue

        if is_template and (
            t_name.endswith("-kanjitab") or t_name == "ja-kt"
        ):
            extract_ja_kanjitab_template(wxr, child, base_data)
            continue

        if isinstance(child, LevelNode):
            break

        pending_nodes.append(child)

    if pending_nodes:
        section_text = clean_node(wxr, base_data, pending_nodes)
        if section_text:
            base_data.etymology_texts.append(section_text)
def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordEntry
):
    """Add alternative kanji forms from an expanded kanjitab template.

    ``t_node`` is converted back to wikitext and re-parsed with
    ``expand_all=True``. Only tables that have a header cell whose cleaned
    text equals "其他表記" are used. In such a table, every non-empty
    ``<span>`` directly inside a table cell becomes a ``Form`` tagged
    ``["alternative", "kanji"]``, and a following ``<small>`` element (with
    surrounding parentheses stripped) is added to the latest form's
    ``raw_tags`` before ``translate_raw_tags`` is called on that form. The
    forms of each matching table are appended to ``base_data.forms``.

    Finally every top-level link node of the expanded template is passed to
    ``clean_node`` together with ``base_data``.
    """
    # https://zh.wiktionary.org/wiki/Template:ja-kanjitab
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    for table in expanded.find_child(NodeKind.TABLE):
        header_texts = [
            clean_node(wxr, None, header_cell)
            for header_row in table.find_child(NodeKind.TABLE_ROW)
            for header_cell in header_row.find_child(
                NodeKind.TABLE_HEADER_CELL
            )
        ]
        if "其他表記" not in header_texts:
            continue

        # Lazily walk the HTML elements sitting directly inside table cells,
        # in document order.
        html_elements = (
            element
            for body_row in table.find_child(NodeKind.TABLE_ROW)
            for cell in body_row.find_child(NodeKind.TABLE_CELL)
            for element in cell.children
            if isinstance(element, HTMLNode)
        )

        alt_forms = []
        for element in html_elements:
            if element.tag == "span":
                kanji = clean_node(wxr, None, element)
                if kanji != "":
                    alt_forms.append(
                        Form(form=kanji, tags=["alternative", "kanji"])
                    )
            elif element.tag == "small":
                label = clean_node(wxr, None, element).strip("()")
                # A label is only meaningful once a form precedes it.
                if label != "" and len(alt_forms) > 0:
                    latest_form = alt_forms[-1]
                    latest_form.raw_tags.append(label)
                    translate_raw_tags(latest_form)
        base_data.forms.extend(alt_forms)

    # NOTE(review): presumably this lets clean_node record category links on
    # base_data -- confirm against clean_node's handling of link nodes.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)