Coverage for src / wiktextract / extractor / th / etymology.py: 37%
49 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1from wikitextprocessor import (
2 HTMLNode,
3 LevelNode,
4 NodeKind,
5 TemplateNode,
6 WikiNode,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from .models import Form, WordEntry
12from .tags import translate_raw_tags
def extract_etymology_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Fill ``base_data.etymology_texts`` from an etymology section.

    The direct children of ``level_node`` are handled by type:

    * templates named ``ja-kt`` or ending in ``-kanjitab`` are passed to
      ``extract_ja_kanjitab_template``;
    * list nodes contribute one etymology text per non-empty list item;
    * nested ``LevelNode`` children and the ``ja-see`` / ``ja-see-kango``
      templates are ignored;
    * every other node is collected and, after the loop, cleaned as one
      combined text that is appended if it is not empty.
    """
    skipped_templates = ("ja-see", "ja-see-kango")
    loose_nodes = []

    for child in level_node.children:
        is_template = isinstance(child, TemplateNode)

        if is_template and (
            child.template_name.endswith("-kanjitab")
            or child.template_name == "ja-kt"
        ):
            extract_ja_kanjitab_template(wxr, child, base_data)
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Each list item becomes its own etymology text.
            for item in child.find_child(NodeKind.LIST_ITEM):
                item_text = clean_node(wxr, base_data, item.children)
                if item_text != "":
                    base_data.etymology_texts.append(item_text)
            continue

        if isinstance(child, LevelNode):
            continue
        if is_template and child.template_name in skipped_templates:
            continue

        loose_nodes.append(child)

    if not loose_nodes:
        return

    # Everything that was not a list, a subsection or a skipped template
    # is rendered together as a single etymology text.
    combined_text = clean_node(wxr, base_data, loose_nodes)
    if combined_text != "":
        base_data.etymology_texts.append(combined_text)
def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordEntry
):
    """Extract alternative kanji spellings from a kanjitab template.

    The template is converted back to wikitext and re-parsed with
    ``expand_all=True``. Only tables that have a header cell whose text
    starts with "การสะกดแบบอื่น" are used. In such a table each non-empty
    ``<span>`` found directly inside a table cell becomes a ``Form``
    tagged ``["alternative", "kanji"]``, and each non-empty ``<small>``
    (with surrounding parentheses stripped) is attached as a raw tag to
    the most recently created form. The collected forms are appended to
    ``base_data.forms``.
    """
    # https://th.wiktionary.org/wiki/Template:ja-kanjitab
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    for table in expanded.find_child(NodeKind.TABLE):
        # Clean every header cell first, then decide whether this is the
        # alternative-spelling table.
        header_texts = [
            clean_node(wxr, None, header_cell)
            for row in table.find_child(NodeKind.TABLE_ROW)
            for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL)
        ]
        if not any(
            text.startswith("การสะกดแบบอื่น") for text in header_texts
        ):
            continue

        alt_forms = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            for cell in row.find_child(NodeKind.TABLE_CELL):
                for child in cell.children:
                    if not isinstance(child, HTMLNode):
                        continue
                    if child.tag == "span":
                        word = clean_node(wxr, None, child)
                        if word != "":
                            alt_forms.append(
                                Form(form=word, tags=["alternative", "kanji"])
                            )
                    elif child.tag == "small":
                        raw_tag = clean_node(wxr, None, child).strip("()")
                        # A <small> note qualifies the form before it, so
                        # it is dropped when no form has been seen yet.
                        if raw_tag != "" and alt_forms:
                            latest_form = alt_forms[-1]
                            latest_form.raw_tags.append(raw_tag)
                            translate_raw_tags(latest_form)
        base_data.forms.extend(alt_forms)

    # The cleaned text is discarded here; presumably this runs only for
    # clean_node's effect on base_data (e.g. categories) -- TODO confirm.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)