Coverage for src/wiktextract/extractor/th/descendant.py: 97%

63 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1from mediawiki_langcodes import code_to_name 

2from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

3 

4from ...page import clean_node 

5from ...wxr_context import WiktextractContext 

6from ..ruby import extract_ruby 

7from .models import Descendant, WordEntry 

8 

9 

def extract_descendant_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
):
    """Extract descendants from one section node into ``word_entry``.

    Direct template children named ``CJKV`` or ``Sinoxenic-word`` are
    passed to ``extract_cjkv_template``.  Every list item of every direct
    list child is then passed to ``extract_desc_list_item`` with an empty
    parent list, i.e. as a top-level descendant.
    """
    cjkv_template_names = ("CJKV", "Sinoxenic-word")
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name not in cjkv_template_names:
            continue
        extract_cjkv_template(wxr, word_entry, template)

    for desc_list in level_node.find_child(NodeKind.LIST):
        for item in desc_list.find_child(NodeKind.LIST_ITEM):
            extract_desc_list_item(wxr, word_entry, [], item)

20 

21 

def extract_desc_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    list_item: WikiNode,
):
    """Process one list item of a descendants list, recursing into sublists.

    ``parent_data`` holds the descendants found in the enclosing list item
    (empty at the top level); it is forwarded to ``extract_desc_template``.
    The descendants collected from this item's own templates become the
    ``parent_data`` of any nested list items.
    """
    desc_template_names = (
        "desc",
        "descendant",
        "desctree",
        "descendants tree",
    )
    # Shared, mutable list: entries found so far in this item are the
    # parents of every nested list item that follows them.
    found_here = []
    for child in list_item.children:
        is_desc_template = (
            isinstance(child, TemplateNode)
            and child.template_name in desc_template_names
        )
        if is_desc_template:
            new_entries = extract_desc_template(
                wxr, word_entry, parent_data, child
            )
            found_here.extend(new_entries)
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for nested_item in child.find_child(NodeKind.LIST_ITEM):
                extract_desc_list_item(
                    wxr, word_entry, found_here, nested_item
                )

44 

45 

def extract_desc_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    t_node: TemplateNode,
) -> list[Descendant]:
    """Expand a descendant template and build ``Descendant`` entries.

    The template's first positional argument is taken as the language code.
    In the expanded output, each ``span`` whose ``lang`` attribute equals
    that code starts a new entry.  A later ``span`` whose ``lang`` ends with
    ``-Latn`` sets ``roman`` on the most recent entry, and a ``span`` with
    class ``mention-gloss`` sets its ``sense``.

    The new entries are appended to the ``descendants`` of every item in
    ``parent_data``, or to ``word_entry.descendants`` when ``parent_data``
    is empty.

    Returns:
        The list of entries created from this template.
    """
    entries = []
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
    first_arg = t_node.template_parameters.get(1, "")
    lang_code = clean_node(wxr, None, first_arg)
    lang_name = code_to_name(lang_code, "th") or "unknown"

    for span in expanded.find_html("span"):
        lang_attr = span.attrs.get("lang", "")
        class_attr = span.attrs.get("class", "")
        if lang_attr == lang_code:
            new_entry = Descendant(
                lang_code=lang_code,
                lang=lang_name,
                word=clean_node(wxr, None, span),
            )
            entries.append(new_entry)
        elif lang_attr.endswith("-Latn") and entries:
            # Romanization belongs to the word that precedes it.
            entries[-1].roman = clean_node(wxr, None, span)
        elif class_attr == "mention-gloss" and entries:
            entries[-1].sense = clean_node(wxr, None, span)

    if parent_data:
        for parent in parent_data:
            parent.descendants.extend(entries)
    else:
        word_entry.descendants.extend(entries)

    # Return value is discarded; called only for its effect on word_entry
    # (presumably category collection -- confirm against clean_node).
    clean_node(wxr, word_entry, expanded)
    return entries

81 

82 

def extract_cjkv_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Expand a CJKV-style template and add one descendant per list item.

    For each list item in the expanded output (found recursively):

    * the first string child ending with ``:`` supplies the language name;
    * a ``span`` with class ``desc-arr`` contributes its non-empty ``title``
      attribute as a raw tag;
    * a ``span`` with class ``tr`` supplies the romanization;
    * any other ``span`` carrying a ``lang`` attribute supplies the language
      code, the ruby data and the word.

    Items that end up with an empty word are dropped.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        entry = Descendant(word="", lang="unknown", lang_code="unknown")
        for child in item.children:
            if isinstance(child, str):
                # Only the first "Name:" label is used as the language.
                if child.strip().endswith(":") and entry.lang == "unknown":
                    entry.lang = child.strip(": ")
                continue
            if not (isinstance(child, HTMLNode) and child.tag == "span"):
                continue

            css_class = child.attrs.get("class", "")
            if css_class == "desc-arr":
                title = child.attrs.get("title", "")
                if title != "":
                    entry.raw_tags.append(title)
            elif css_class == "tr":
                entry.roman = clean_node(wxr, None, child)
            elif "lang" in child.attrs:
                entry.lang_code = child.attrs["lang"]
                entry.ruby, remaining_nodes = extract_ruby(wxr, child)
                entry.word = clean_node(wxr, None, remaining_nodes)

        if entry.word != "":
            word_entry.descendants.append(entry)