Coverage for src / wiktextract / extractor / ko / etymology.py: 43%
55 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1from wikitextprocessor import HTMLNode, LevelNode, NodeKind, TemplateNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from .models import Form, WordEntry
6from .tags import translate_raw_tags
def extract_etymology_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Populate ``word_entry.etymology_texts`` from an etymology section.

    If the entry already holds etymology texts, both those texts and the
    entry's categories are cleared before extraction starts.

    When the section contains at least one list, every list item whose
    cleaned text is non-empty becomes one etymology text.  Otherwise the
    direct children up to (not including) the first ``LevelNode`` are
    cleaned together as a single text; templates named ``ja-kt`` or ending
    in ``-kanjitab`` are excluded from that text and passed to
    ``extract_ja_kanjitab_template`` instead.
    """
    if word_entry.etymology_texts:
        word_entry.etymology_texts.clear()
        word_entry.categories.clear()

    found_list = False
    for list_node in level_node.find_child(NodeKind.LIST):
        found_list = True
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            item_text = clean_node(wxr, word_entry, item.children)
            if item_text:
                word_entry.etymology_texts.append(item_text)
    if found_list:
        return

    # No list in the section: gather the leading nodes as one paragraph.
    plain_nodes = []
    for child in level_node.children:
        is_kanjitab = isinstance(child, TemplateNode) and (
            child.template_name.endswith("-kanjitab")
            or child.template_name == "ja-kt"
        )
        if is_kanjitab:
            extract_ja_kanjitab_template(wxr, child, word_entry)
            continue
        if isinstance(child, LevelNode):
            # A nested section starts here; it is not part of this text.
            break
        plain_nodes.append(child)

    section_text = clean_node(wxr, word_entry, plain_nodes)
    if section_text:
        word_entry.etymology_texts.append(section_text)
def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordEntry
):
    """Collect alternative kanji forms from an expanded kanjitab template.

    The template is converted back to wikitext and re-parsed with
    ``expand_all=True``.  Only tables that have a header cell whose cleaned
    text equals "다른 표기" are processed.  In those tables each non-empty
    ``<span>`` found directly inside a table cell yields a ``Form`` tagged
    ``alternative`` and ``kanji``; a following non-empty ``<small>`` element
    (with surrounding parentheses stripped) is added as a raw tag to the
    most recent form, which is then run through ``translate_raw_tags``.

    The forms are appended to ``base_data.forms``.  Finally every top-level
    link node of the expansion is cleaned with ``base_data`` as the target
    (presumably so category links are recorded on the entry; confirm
    against ``clean_node``).
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    for table in expanded.find_child(NodeKind.TABLE):
        # Clean every header cell, then test for the alt-form marker.
        header_texts = [
            clean_node(wxr, None, th_cell)
            for tr in table.find_child(NodeKind.TABLE_ROW)
            for th_cell in tr.find_child(NodeKind.TABLE_HEADER_CELL)
        ]
        if "다른 표기" not in header_texts:
            continue

        alt_forms = []
        for tr in table.find_child(NodeKind.TABLE_ROW):
            for td_cell in tr.find_child(NodeKind.TABLE_CELL):
                for child in td_cell.children:
                    if not isinstance(child, HTMLNode):
                        continue
                    if child.tag == "span":
                        word = clean_node(wxr, None, child)
                        if word:
                            alt_forms.append(
                                Form(form=word, tags=["alternative", "kanji"])
                            )
                    elif child.tag == "small":
                        raw_tag = clean_node(wxr, None, child).strip("()")
                        # A tag only makes sense if a form precedes it.
                        if raw_tag and alt_forms:
                            latest = alt_forms[-1]
                            latest.raw_tags.append(raw_tag)
                            translate_raw_tags(latest)
        base_data.forms.extend(alt_forms)

    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)