Coverage for src/wiktextract/extractor/th/descendant.py: 100%
37 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from mediawiki_langcodes import code_to_name
2from wikitextprocessor import NodeKind, TemplateNode, WikiNode
4from ...page import clean_node
5from ...wxr_context import WiktextractContext
6from .models import Descendant, WordEntry
def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process every list item found in the lists directly under a section.

    Each list item of each LIST child of ``level_node`` is handed to
    ``extract_desc_list_item`` with an empty parent list, so descendants
    found at this level are attached to ``word_entry`` itself.
    """
    top_level_items = (
        item
        for desc_list in level_node.find_child(NodeKind.LIST)
        for item in desc_list.find_child(NodeKind.LIST_ITEM)
    )
    for item in top_level_items:
        # A fresh empty list per item: top-level items have no parents.
        extract_desc_list_item(wxr, word_entry, [], item)
def extract_desc_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    list_item: WikiNode,
) -> None:
    """Extract descendants from one list item and recurse into sub-lists.

    Descendant templates among the item's children are passed to
    ``extract_desc_template`` together with ``parent_data``. The
    descendants collected so far from this item then serve as the parents
    for the items of any nested list that follows.
    """
    desc_template_names = (
        "desc",
        "descendant",
        "desctree",
        "descendants tree",
    )
    found_here = []
    for child in list_item.children:
        is_desc_template = (
            isinstance(child, TemplateNode)
            and child.template_name in desc_template_names
        )
        if is_desc_template:
            new_items = extract_desc_template(
                wxr, word_entry, parent_data, child
            )
            found_here.extend(new_items)
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Nested list: its items descend from what this item produced.
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                extract_desc_list_item(wxr, word_entry, found_here, sub_item)
def extract_desc_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    t_node: TemplateNode,
) -> list[Descendant]:
    """Extract descendant entries from an expanded descendant template.

    The template is converted back to wikitext and re-parsed with
    ``expand_all=True``; the ``<span>`` tags of the result are then scanned:

    * a span whose ``lang`` attribute equals the template's first parameter
      starts a new ``Descendant`` with the span text as its word;
    * a span whose ``lang`` ends with ``-Latn`` sets ``roman`` on the most
      recently created descendant;
    * a span having the ``mention-gloss`` class sets ``sense`` on the most
      recently created descendant.

    The new descendants are appended to every entry of ``parent_data`` when
    it is non-empty, otherwise to ``word_entry.descendants``.

    Returns:
        The descendants created from this template (possibly empty).
    """
    desc_data = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    lang_name = code_to_name(lang_code, "th") or "unknown"
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        # The HTML class attribute is a space-separated token list, so
        # compare tokens instead of the whole attribute value.
        span_classes = span_tag.attrs.get("class", "").split()
        # Require a non-empty language code: without it, every span lacking
        # a "lang" attribute would compare equal ("" == "") and be recorded
        # as a descendant word.
        if lang_code != "" and span_lang == lang_code:
            desc_data.append(
                Descendant(
                    lang_code=lang_code,
                    lang=lang_name,
                    word=clean_node(wxr, None, span_tag),
                )
            )
        elif span_lang.endswith("-Latn") and desc_data:
            desc_data[-1].roman = clean_node(wxr, None, span_tag)
        elif "mention-gloss" in span_classes and desc_data:
            desc_data[-1].sense = clean_node(wxr, None, span_tag)

    if parent_data:
        for p_data in parent_data:
            p_data.descendants.extend(desc_data)
    else:
        word_entry.descendants.extend(desc_data)
    # NOTE(review): presumably run for its side effects on word_entry
    # (e.g. collecting categories from the expansion) - confirm against
    # clean_node's implementation.
    clean_node(wxr, word_entry, expanded_node)
    return desc_data