Coverage for src/wiktextract/extractor/th/descendant.py: 100%

37 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from mediawiki_langcodes import code_to_name 

2from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

3 

4from ...page import clean_node 

5from ...wxr_context import WiktextractContext 

6from .models import Descendant, WordEntry 

7 

8 

def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process every list item found in the lists directly under a section.

    Each LIST child of ``level_node`` is scanned for LIST_ITEM children, and
    every such item is passed to ``extract_desc_list_item`` together with a
    fresh, empty parent list (i.e. the item has no parent descendants).
    """
    # Flatten "lists -> items" lazily so traversal order matches a nested loop.
    top_level_items = (
        item
        for child_list in level_node.find_child(NodeKind.LIST)
        for item in child_list.find_child(NodeKind.LIST_ITEM)
    )
    for item in top_level_items:
        extract_desc_list_item(wxr, word_entry, [], item)

17 

18 

def extract_desc_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    list_item: WikiNode,
) -> None:
    """Extract descendants from one list item and recurse into nested lists.

    The children of ``list_item`` are visited in order:

    * a template named ``desc``, ``descendant``, ``desctree`` or
      ``descendants tree`` is passed to ``extract_desc_template`` along with
      ``parent_data``; the descendants it returns are remembered;
    * a nested LIST node has each of its LIST_ITEM children processed
      recursively, with the descendants remembered so far from this item
      passed as their ``parent_data``.

    Any other child is ignored. Nothing is returned.
    """
    desc_template_names = (
        "desc",
        "descendant",
        "desctree",
        "descendants tree",
    )
    found = []
    for child in list_item.children:
        is_desc_template = (
            isinstance(child, TemplateNode)
            and child.template_name in desc_template_names
        )
        if is_desc_template:
            found += extract_desc_template(wxr, word_entry, parent_data, child)
            continue

        # Everything that is not a nested list is skipped.
        if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
            continue

        # Items of a nested list hang below what this item has produced so far.
        for nested_item in child.find_child(NodeKind.LIST_ITEM):
            extract_desc_list_item(wxr, word_entry, found, nested_item)

41 

42 

def extract_desc_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    t_node: TemplateNode,
) -> list[Descendant]:
    """Build ``Descendant`` objects from one expanded descendant template.

    The template is converted back to wikitext and re-parsed with
    ``expand_all=True``; the ``<span>`` elements of the result are then
    inspected in order:

    * a span whose ``lang`` attribute equals the language code taken from the
      template's first parameter starts a new ``Descendant`` whose ``word`` is
      the cleaned span text (note that the comparison also matches when both
      values are empty strings);
    * otherwise, a span whose ``lang`` ends with ``-Latn`` sets ``roman`` on
      the most recent descendant, if there is one;
    * otherwise, a span whose ``class`` is exactly ``mention-gloss`` sets
      ``sense`` on the most recent descendant, if there is one.

    The new descendants are appended to the ``descendants`` of every entry in
    ``parent_data``, or to ``word_entry.descendants`` when ``parent_data`` is
    empty. The language name is looked up with ``code_to_name(..., "th")`` and
    falls back to ``"unknown"``.

    Returns:
        The descendants created from this template (possibly empty).
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    name = code_to_name(code, "th") or "unknown"

    collected = []
    for span in expanded.find_html("span"):
        lang_attr = span.attrs.get("lang", "")
        class_attr = span.attrs.get("class", "")

        if lang_attr == code:
            new_desc = Descendant(
                lang_code=code,
                lang=name,
                word=clean_node(wxr, None, span),
            )
            collected.append(new_desc)
            continue

        # Romanization and gloss only make sense once a word has been seen.
        if not collected:
            continue

        if lang_attr.endswith("-Latn"):
            collected[-1].roman = clean_node(wxr, None, span)
        elif class_attr == "mention-gloss":
            collected[-1].sense = clean_node(wxr, None, span)

    # Attach to every parent descendant, or to the entry itself at top level.
    receivers = parent_data if parent_data else [word_entry]
    for receiver in receivers:
        receiver.descendants.extend(collected)

    # Called with word_entry (not None) and its return value discarded;
    # presumably this records page-level data such as categories on the
    # entry -- confirm against clean_node.
    clean_node(wxr, word_entry, expanded)
    return collected