Coverage for src/wiktextract/extractor/th/descendant.py: 97%
63 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1from mediawiki_langcodes import code_to_name
2from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
4from ...page import clean_node
5from ...wxr_context import WiktextractContext
6from ..ruby import extract_ruby
7from .models import Descendant, WordEntry
def extract_descendant_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
):
    """Extract descendant data from a descendants section.

    Two passes are made over the direct children of ``level_node``:

    1. every ``CJKV`` / ``Sinoxenic-word`` template child is handed to
       ``extract_cjkv_template``;
    2. every list item of every list child is handed to
       ``extract_desc_list_item`` with an empty parent list, i.e. as a
       top-level descendant.

    Results are stored by the callees; nothing is returned.
    """
    cjkv_template_names = ("CJKV", "Sinoxenic-word")
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name not in cjkv_template_names:
            continue
        extract_cjkv_template(wxr, word_entry, template)

    for desc_list in level_node.find_child(NodeKind.LIST):
        for item in desc_list.find_child(NodeKind.LIST_ITEM):
            # Top-level items have no parent descendants.
            extract_desc_list_item(wxr, word_entry, [], item)
def extract_desc_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    list_item: WikiNode,
):
    """Process one descendant list item and, recursively, its sub-lists.

    Descendant templates (``desc``, ``descendant``, ``desctree``,
    ``descendants tree``) found directly in ``list_item`` are expanded via
    ``extract_desc_template``, which receives ``parent_data``.  The
    descendants collected so far in this item become the parent data for
    the items of any nested list encountered afterwards.
    """
    desc_template_names = (
        "desc",
        "descendant",
        "desctree",
        "descendants tree",
    )
    item_descendants = []
    for child in list_item.children:
        is_desc_template = (
            isinstance(child, TemplateNode)
            and child.template_name in desc_template_names
        )
        if is_desc_template:
            item_descendants.extend(
                extract_desc_template(wxr, word_entry, parent_data, child)
            )
            continue
        if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
            continue
        # Nested list: its items descend from what this item produced so far.
        for nested_item in child.find_child(NodeKind.LIST_ITEM):
            extract_desc_list_item(
                wxr, word_entry, item_descendants, nested_item
            )
def extract_desc_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    t_node: TemplateNode,
) -> list[Descendant]:
    """Expand a descendant template and build ``Descendant`` objects from it.

    The template is re-parsed with full expansion and its ``<span>`` tags are
    scanned in order:

    * a span whose ``lang`` attribute equals the template's first argument
      starts a new ``Descendant``;
    * a span whose ``lang`` ends with ``-Latn`` or whose class list contains
      ``tr`` sets ``roman`` on the most recent descendant;
    * a span whose class list contains ``mention-gloss`` sets ``sense`` on
      the most recent descendant.

    The romanization and gloss rules are tried first, but only apply once
    at least one descendant has been created.

    The new descendants are appended to every entry of ``parent_data`` or,
    when that list is empty, to ``word_entry.descendants``.

    Returns:
        The descendants created from this template, in document order.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    # Fall back to "unknown" when the code has no name.
    lang_name = code_to_name(lang_code, "th") or "unknown"

    found = []
    for span in expanded.find_html("span"):
        lang_attr = span.attrs.get("lang", "")
        css_classes = span.attrs.get("class", "").split()
        is_romanization = lang_attr.endswith("-Latn") or "tr" in css_classes
        if found and is_romanization:
            found[-1].roman = clean_node(wxr, None, span)
        elif found and "mention-gloss" in css_classes:
            found[-1].sense = clean_node(wxr, None, span)
        elif lang_attr == lang_code:
            new_desc = Descendant(
                lang_code=lang_code,
                lang=lang_name,
                word=clean_node(wxr, None, span),
            )
            found.append(new_desc)

    if not parent_data:
        word_entry.descendants.extend(found)
    for parent in parent_data:
        parent.descendants.extend(found)

    # Return value is discarded; presumably called so clean_node can record
    # data from the expansion onto word_entry -- confirm against clean_node.
    clean_node(wxr, word_entry, expanded)
    return found
def extract_cjkv_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract descendants from an expanded ``CJKV``-style template.

    The template is fully expanded and each list item found anywhere in the
    expansion yields at most one ``Descendant``:

    * the first text child ending in ``:`` supplies the language name;
    * a ``desc-arr`` span contributes its non-empty ``title`` as a raw tag;
    * a ``tr`` span supplies the romanization;
    * any other span with a ``lang`` attribute supplies the language code,
      the ruby data and the word (taken from the nodes left after ruby
      extraction).

    Items that end up with an empty word are dropped; the rest are appended
    to ``word_entry.descendants``.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        entry = Descendant(word="", lang="unknown", lang_code="unknown")
        for child in item.children:
            if isinstance(child, str):
                # Only the first "Language:" label is used; later text
                # ending in a colon must not overwrite it.
                if child.strip().endswith(":") and entry.lang == "unknown":
                    entry.lang = child.strip(": ")
                continue
            if not (isinstance(child, HTMLNode) and child.tag == "span"):
                continue

            css_class = child.attrs.get("class", "")
            if css_class == "desc-arr":
                arrow_title = child.attrs.get("title", "")
                if arrow_title != "":
                    entry.raw_tags.append(arrow_title)
            elif css_class == "tr":
                entry.roman = clean_node(wxr, None, child)
            elif "lang" in child.attrs:
                entry.lang_code = child.attrs["lang"]
                ruby, remaining_nodes = extract_ruby(wxr, child)
                entry.ruby = ruby
                entry.word = clean_node(wxr, None, remaining_nodes)

        if entry.word != "":
            word_entry.descendants.append(entry)