Coverage for src/wiktextract/extractor/th/descendant.py: 97%

63 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1from mediawiki_langcodes import code_to_name 

2from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode 

3 

4from ...page import clean_node 

5from ...wxr_context import WiktextractContext 

6from ..ruby import extract_ruby 

7from .models import Descendant, WordEntry 

8 

9 

def extract_descendant_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
):
    """Walk a descendants section and hand its children to the extractors.

    Direct template children named ``CJKV`` or ``Sinoxenic-word`` are passed
    to ``extract_cjkv_template``.  Every list item of every direct list child
    is passed to ``extract_desc_list_item`` with an empty parent list, i.e.
    as a top-level item.
    """
    cjkv_template_names = ("CJKV", "Sinoxenic-word")
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name not in cjkv_template_names:
            continue
        extract_cjkv_template(wxr, word_entry, template)

    for top_list in level_node.find_child(NodeKind.LIST):
        for item in top_list.find_child(NodeKind.LIST_ITEM):
            # A fresh empty list marks the item as having no parent entries.
            extract_desc_list_item(wxr, word_entry, [], item)

20 

21 

def extract_desc_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    list_item: WikiNode,
):
    """Process one list item of a descendants list, recursing into sublists.

    Children of ``list_item`` are visited in order.  Descendant templates
    (``desc``, ``descendant``, ``desctree``, ``descendants tree``) are handed
    to ``extract_desc_template`` together with ``parent_data``; the entries
    they return are accumulated.  Each item of a nested list is processed
    recursively, with the entries accumulated so far as its parent data.
    """
    desc_template_names = (
        "desc",
        "descendant",
        "desctree",
        "descendants tree",
    )
    found: list[Descendant] = []
    for child in list_item.children:
        if (
            isinstance(child, TemplateNode)
            and child.template_name in desc_template_names
        ):
            found.extend(
                extract_desc_template(wxr, word_entry, parent_data, child)
            )
            continue
        if not isinstance(child, WikiNode) or child.kind != NodeKind.LIST:
            continue
        # Nested list: entries collected from this item become the parents.
        for sub_item in child.find_child(NodeKind.LIST_ITEM):
            extract_desc_list_item(wxr, word_entry, found, sub_item)

44 

45 

def extract_desc_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    parent_data: list[Descendant],
    t_node: TemplateNode,
) -> list[Descendant]:
    """Build ``Descendant`` entries from one expanded descendant template.

    The template is expanded and its ``<span>`` tags are scanned in order:

    * a span whose ``lang`` attribute equals the template's first positional
      parameter starts a new entry with the span text as ``word``;
    * a span whose ``lang`` ends with ``-Latn`` or whose class list contains
      ``tr`` sets ``roman`` on the most recent entry;
    * a span whose class list contains ``mention-gloss`` sets ``sense`` on
      the most recent entry.

    The last two are only applied once at least one entry exists.  The
    language name comes from ``code_to_name(lang_code, "th")``, falling back
    to ``"unknown"``.

    The new entries are appended to the ``descendants`` of every item in
    ``parent_data``, or to ``word_entry.descendants`` when ``parent_data`` is
    empty, and are also returned.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    lang_name = code_to_name(lang_code, "th") or "unknown"

    results: list[Descendant] = []
    for span in expanded.find_html("span"):
        lang_attr = span.attrs.get("lang", "")
        classes = span.attrs.get("class", "").split()
        is_roman = lang_attr.endswith("-Latn") or "tr" in classes
        if results and is_roman:
            results[-1].roman = clean_node(wxr, None, span)
        elif results and "mention-gloss" in classes:
            results[-1].sense = clean_node(wxr, None, span)
        elif lang_attr == lang_code:
            new_entry = Descendant(
                lang_code=lang_code,
                lang=lang_name,
                word=clean_node(wxr, None, span),
            )
            results.append(new_entry)

    # Attach to every parent entry, or to the word entry if there is none.
    targets = parent_data if parent_data else [word_entry]
    for target in targets:
        target.descendants.extend(results)

    # NOTE(review): return value is discarded; presumably this call records
    # page-level data (e.g. categories) on word_entry - confirm in clean_node.
    clean_node(wxr, word_entry, expanded)
    return results

83 

84 

def extract_cjkv_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract one ``Descendant`` per list item of an expanded template.

    The template is expanded and every list item found recursively is
    scanned child by child:

    * the first string child ending with ``:`` (after stripping) supplies
      the language name;
    * a ``<span class="desc-arr">`` contributes its non-empty ``title``
      attribute as a raw tag;
    * a ``<span class="tr">`` supplies the romanization;
    * any other ``<span>`` carrying a ``lang`` attribute supplies the
      language code, the ruby data and the word text.

    An entry is appended to ``word_entry.descendants`` only when its word
    is non-empty.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        entry = Descendant(word="", lang="unknown", lang_code="unknown")
        for child in item.children:
            if isinstance(child, str):
                # Only the first "<language>:" label is used.
                if child.strip().endswith(":") and entry.lang == "unknown":
                    entry.lang = child.strip(": ")
                continue
            if not isinstance(child, HTMLNode) or child.tag != "span":
                continue

            css_class = child.attrs.get("class", "")
            if css_class == "desc-arr":
                title = child.attrs.get("title", "")
                if title != "":
                    entry.raw_tags.append(title)
            elif css_class == "tr":
                entry.roman = clean_node(wxr, None, child)
            elif "lang" in child.attrs:
                entry.lang_code = child.attrs["lang"]
                ruby, remaining_nodes = extract_ruby(wxr, child)
                entry.ruby = ruby
                entry.word = clean_node(wxr, None, remaining_nodes)

        if entry.word != "":
            word_entry.descendants.append(entry)