Coverage for src/wiktextract/extractor/vi/descendant.py: 7%

95 statements  

coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

from mediawiki_langcodes import name_to_code
from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Descendant, WordEntry
from .tags import translate_raw_tags


def extract_descendant_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    desc_list = []
    for node in level_node.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_node in level_node.find_child(NodeKind.LIST):
                for list_item in list_node.find_child(NodeKind.LIST_ITEM):
                    desc_list.extend(
                        extract_desc_list_item(wxr, list_item, [], [])[0]
                    )
        elif (
            isinstance(node, TemplateNode)
            and node.template_name.lower() == "cjkv"
        ):
            desc_list.extend(extract_cjkv_template(wxr, node))
    word_entry.descendants.extend(desc_list)


def extract_cjkv_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Descendant]:
    # expand the "cjkv" template, then reuse the list item handler on the
    # expanded wikitext
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    desc_list = []
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        desc_list.extend(extract_desc_list_item(wxr, list_item, [], [])[0])
    return desc_list


def extract_desc_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    parent_data: list[Descendant],
    raw_tags: list[str],
    lang_code: str = "unknown",
    lang_name: str = "unknown",
) -> tuple[list[Descendant], str, str]:
    # process list item node and <li> tag
    data_list = []
    before_word_raw_tags = []
    after_word = False
    for child in list_item.children:
        if isinstance(child, str) and child.strip().endswith(":"):
            # plain "Language name:" text before the words
            lang_name = child.strip(": ") or "unknown"
            lang_code = name_to_code(lang_name, "vi") or "unknown"
        elif isinstance(child, str) and child.strip() == ",":
            after_word = False
        elif isinstance(child, HTMLNode) and child.tag == "span":
            after_word = extract_desc_span_tag(
                wxr,
                child,
                data_list,
                lang_code,
                lang_name,
                raw_tags,
                before_word_raw_tags,
                after_word,
            )
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "i"
            and len(data_list) > 0
        ):
            # romanization wrapped in an "<i>" tag
            for span_tag in child.find_html(
                "span", attr_name="class", attr_value="Latn"
            ):
                roman = clean_node(wxr, None, span_tag)
                data_list[-1].roman = roman
                if (
                    len(data_list) > 1
                    and "Traditional-Chinese" in data_list[-2].tags
                ):
                    data_list[-2].roman = roman
        elif isinstance(child, TemplateNode) and child.template_name in [
            "desctree",
            "descendants tree",
            "desc",
            "descendant",
            "ja-r",
            "jpn-r",
            "zh-l",
            "zho-l",
            "zh-m",
            "zho-m",
        ]:
            if child.template_name.startswith("desc"):
                lang_code = child.template_parameters.get(1, "") or "unknown"
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            new_data, new_l_code, new_l_name = extract_desc_list_item(
                wxr,
                expanded_template,
                [],  # avoid adding twice
                raw_tags,
                lang_code,
                lang_name,
            )
            data_list.extend(new_data)
            # save lang data from desc template
            lang_code = new_l_code
            lang_name = new_l_name

    for ul_tag in list_item.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            extract_desc_list_item(wxr, li_tag, data_list, [])
    for next_list in list_item.find_child(NodeKind.LIST):
        for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
            extract_desc_list_item(wxr, next_list_item, data_list, [])

    for p_data in parent_data:
        p_data.descendants.extend(data_list)
    return data_list, lang_code, lang_name


def extract_desc_span_tag(
    wxr: WiktextractContext,
    span_tag: HTMLNode,
    desc_lists: list[Descendant],
    lang_code: str,
    lang_name: str,
    raw_tags: list[str],
    before_word_raw_tags: list[str],
    after_word: bool,
) -> bool:
    # dispatch on the span's "class", "lang" and "title" attributes and
    # return the updated `after_word` flag
    class_names = span_tag.attrs.get("class", "").split()
    span_lang = span_tag.attrs.get("lang", "")
    span_title = span_tag.attrs.get("title", "")
    if ("tr" in class_names or span_lang.endswith("-Latn")) and len(
        desc_lists
    ) > 0:
        # romanization span
        roman = clean_node(wxr, None, span_tag)
        desc_lists[-1].roman = roman
        if len(desc_lists) > 1 and "Traditional-Chinese" in desc_lists[-2].tags:
            desc_lists[-2].roman = roman
    elif (
        "qualifier-content" in class_names
        or "gender" in class_names
        or "label-content" in class_names
    ) and len(desc_lists) > 0:
        # qualifier, gender or label text becomes raw tags
        for raw_tag in clean_node(wxr, None, span_tag).split(","):
            raw_tag = raw_tag.strip()
            if raw_tag == "":
                continue
            if after_word:
                desc_lists[-1].raw_tags.append(raw_tag)
                translate_raw_tags(desc_lists[-1])
            else:
                before_word_raw_tags.append(raw_tag)
    elif span_lang != "":
        # a span with a "lang" attribute holds the descendant word itself
        ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
        desc_data = Descendant(
            lang=lang_name,
            lang_code=lang_code,
            word=clean_node(wxr, None, nodes_without_ruby),
            ruby=ruby_data,
            raw_tags=before_word_raw_tags + raw_tags,
        )
        before_word_raw_tags.clear()
        if desc_data.lang_code == "unknown":
            desc_data.lang_code = span_lang
        if "Hant" in class_names:
            desc_data.tags.append("Traditional-Chinese")
        elif "Hans" in class_names:
            desc_data.tags.append("Simplified-Chinese")
        if desc_data.word not in ["", "/"]:
            translate_raw_tags(desc_data)
            desc_lists.append(desc_data)
            after_word = True
    elif span_title != "" and clean_node(wxr, None, span_tag) in [
        "→",
        "⇒",
        ">",
        "?",
    ]:
        # arrow or "?" span: keep its title text as a raw tag
        raw_tags.append(span_title)
    elif "mention-gloss" in class_names and len(desc_lists) > 0:
        desc_lists[-1].sense = clean_node(wxr, None, span_tag)

    return after_word
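
The report shows only 7% of the module's 95 statements executed. A minimal test sketch in the style of the other wiktextract extractor tests could drive extract_descendant_section directly; the section heading, the page wikitext, and the WordEntry constructor fields (word, lang_code, lang, pos) used below are illustrative assumptions, not details taken from this report.

# Hypothetical test sketch (not part of the measured module). It assumes the
# usual wiktextract fixture of Wtp + WiktionaryConfig wrapped in a
# WiktextractContext; the heading and list content are invented.
from unittest import TestCase

from wikitextprocessor import Wtp

from wiktextract.config import WiktionaryConfig
from wiktextract.extractor.vi.descendant import extract_descendant_section
from wiktextract.extractor.vi.models import WordEntry
from wiktextract.wxr_context import WiktextractContext


class TestViDescendant(TestCase):
    def setUp(self) -> None:
        self.wxr = WiktextractContext(
            Wtp(lang_code="vi"),
            WiktionaryConfig(dump_file_lang_code="vi"),
        )

    def test_descendant_section_runs(self) -> None:
        self.wxr.wtp.start_page("trà")
        # A realistic test would also register the expanded output of the
        # {{desc}} or {{cjkv}} templates with self.wxr.wtp.add_page().
        root = self.wxr.wtp.parse("===Hậu duệ===\n* Tiếng Anh: tea\n")
        word_entry = WordEntry(
            word="trà", lang_code="vi", lang="Tiếng Việt", pos="noun"
        )
        extract_descendant_section(self.wxr, word_entry, root.children[0])
        self.assertIsInstance(word_entry.descendants, list)

Driving the level node directly keeps the test focused on this module; registering and expanding the relevant templates would be needed to reach the span-handling branches that the 7% figure leaves untouched.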