Coverage for src/wiktextract/extractor/th/descendant.py: 97%
63 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1from mediawiki_langcodes import code_to_name
2from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
4from ...page import clean_node
5from ...wxr_context import WiktextractContext
6from ..ruby import extract_ruby
7from .models import Descendant, WordEntry
def extract_descendant_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
):
    """Extract descendant data from a descendants section.

    Two kinds of direct children of ``level_node`` are processed:

    * ``CJKV`` / ``Sinoxenic-word`` templates, which are handed to
      ``extract_cjkv_template``;
    * wikitext lists, whose items are handed to ``extract_desc_list_item``
      with an empty parent list (i.e. they are top-level descendants).

    Results are stored on ``word_entry`` by the called helpers; nothing is
    returned.
    """
    cjkv_template_names = ("CJKV", "Sinoxenic-word")
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name not in cjkv_template_names:
            continue
        extract_cjkv_template(wxr, word_entry, template)

    for desc_list in level_node.find_child(NodeKind.LIST):
        for item in desc_list.find_child(NodeKind.LIST_ITEM):
            # Top-level items have no parent descendants.
            extract_desc_list_item(wxr, word_entry, [], item)
def extract_desc_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    list_item: WikiNode,
):
    """Extract descendants from one list item and recurse into sub-lists.

    Descendant templates (``desc``, ``descendant``, ``desctree``,
    ``descendants tree``) found directly in ``list_item`` are processed by
    ``extract_desc_template`` with ``parent_data`` as their parents.  The
    descendants collected so far in this item are then used as the parents
    for the items of any nested list that follows.
    """
    desc_template_names = (
        "desc",
        "descendant",
        "desctree",
        "descendants tree",
    )
    collected = []
    for child in list_item.children:
        is_desc_template = (
            isinstance(child, TemplateNode)
            and child.template_name in desc_template_names
        )
        if is_desc_template:
            new_items = extract_desc_template(
                wxr, word_entry, parent_data, child
            )
            collected.extend(new_items)
            continue

        if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
            continue
        # Nested list: its items descend from what was collected above.
        for sub_item in child.find_child(NodeKind.LIST_ITEM):
            extract_desc_list_item(wxr, word_entry, collected, sub_item)
def extract_desc_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    t_node: TemplateNode,
) -> list[Descendant]:
    """Expand a descendant template and build ``Descendant`` objects from it.

    The language code is read from the template's first positional
    parameter.  Each direct ``<span>`` child of the expanded template is
    then classified:

    * ``lang`` attribute equal to the language code: starts a new
      descendant whose word is the span text;
    * ``lang`` attribute ending in ``-Latn``: romanization of the most
      recently created descendant;
    * ``class`` attribute equal to ``mention-gloss``: sense of the most
      recently created descendant.

    The new descendants are appended to every entry of ``parent_data``, or
    to ``word_entry.descendants`` when ``parent_data`` is empty.

    Returns the list of descendants created from this template.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    lang_name = code_to_name(lang_code, "th") or "unknown"

    results = []
    for span in expanded.find_html("span"):
        span_lang = span.attrs.get("lang", "")
        if span_lang == lang_code:
            new_desc = Descendant(
                lang_code=lang_code,
                lang=lang_name,
                word=clean_node(wxr, None, span),
            )
            results.append(new_desc)
            continue

        # The remaining span kinds only annotate an existing descendant.
        if not results:
            continue
        if span_lang.endswith("-Latn"):
            results[-1].roman = clean_node(wxr, None, span)
        elif span.attrs.get("class", "") == "mention-gloss":
            results[-1].sense = clean_node(wxr, None, span)

    # Attach to all parents, or to the word entry itself at the top level.
    owners = parent_data or [word_entry]
    for owner in owners:
        owner.descendants.extend(results)

    # Run clean_node with word_entry as the target on the whole expansion.
    clean_node(wxr, word_entry, expanded)
    return results
def extract_cjkv_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract descendants from an expanded ``CJKV``-style template.

    The template is expanded and every list item found anywhere in the
    expansion yields at most one ``Descendant``:

    * the first text node ending in ``:`` gives the language name;
    * a ``<span class="desc-arr">`` contributes its non-empty ``title``
      attribute as a raw tag;
    * a ``<span class="tr">`` gives the romanization;
    * any other ``<span>`` with a ``lang`` attribute gives the language
      code, the ruby data and the word (text with ruby removed).

    A descendant is appended to ``word_entry.descendants`` only when a
    non-empty word was found.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        entry = Descendant(word="", lang="unknown", lang_code="unknown")
        for child in item.children:
            if isinstance(child, str):
                # Only the first "Name:" label sets the language name.
                if child.strip().endswith(":") and entry.lang == "unknown":
                    entry.lang = child.strip(": ")
                continue

            if not isinstance(child, HTMLNode) or child.tag != "span":
                continue

            css_class = child.attrs.get("class", "")
            if css_class == "desc-arr":
                arrow_title = child.attrs.get("title", "")
                if arrow_title != "":
                    entry.raw_tags.append(arrow_title)
            elif css_class == "tr":
                entry.roman = clean_node(wxr, None, child)
            elif "lang" in child.attrs:
                entry.lang_code = child.attrs["lang"]
                entry.ruby, plain_nodes = extract_ruby(wxr, child)
                entry.word = clean_node(wxr, None, plain_nodes)

        if entry.word != "":
            word_entry.descendants.append(entry)