Coverage for src/wiktextract/extractor/zh/descendant.py: 91%

101 statements  

coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

  1  import re
  2
  3  from mediawiki_langcodes import name_to_code
  4  from wikitextprocessor import (
  5      HTMLNode,
  6      LevelNode,
  7      NodeKind,
  8      TemplateNode,
  9      WikiNode,
 10  )
 11
 12  from ...page import clean_node
 13  from ...wxr_context import WiktextractContext
 14  from ..ruby import extract_ruby
 15  from .models import Descendant, WordEntry
 16  from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
 17
 18
 19  def extract_descendant_section(
 20      wxr: WiktextractContext, level_node: LevelNode, page_data: list[WordEntry]
 21  ) -> None:
 22      desc_list = []
 23      for node in level_node.children:
 24          if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
 25              for list_node in level_node.find_child(NodeKind.LIST):
 26                  for list_item in list_node.find_child(NodeKind.LIST_ITEM):
 27                      desc_list.extend(
 28                          process_desc_list_item(wxr, list_item, [], [])[0]
 29                      )
 30          elif (
 31              isinstance(node, TemplateNode)
 32              and node.template_name.lower() == "cjkv"
 33          ):
 34              desc_list.extend(process_cjkv_template(wxr, node))
 35
 36      page_data[-1].descendants.extend(desc_list)
 37      for data in page_data[:-1]:
 38          if (  [38 ↛ 37: line 38 didn't jump to line 37 because the condition on line 38 was always true]
 39              data.lang_code == page_data[-1].lang_code
 40              and data.sounds == page_data[-1].sounds
 41              and data.etymology_text == page_data[-1].etymology_text
 42              and data.pos_level == page_data[-1].pos_level == level_node.kind
 43          ):
 44              data.descendants.extend(desc_list)
 45
 46
 47  def process_cjkv_template(
 48      wxr: WiktextractContext, t_node: TemplateNode
 49  ) -> list[Descendant]:
 50      expanded_node = wxr.wtp.parse(
 51          wxr.wtp.node_to_wikitext(t_node), expand_all=True
 52      )
 53      desc_list = []
 54      for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
 55          desc_list.extend(process_desc_list_item(wxr, list_item, [], [])[0])
 56      return desc_list
 57
 58
 59  def process_desc_list_item(
 60      wxr: WiktextractContext,
 61      list_item: WikiNode,
 62      parent_data: list[Descendant],
 63      raw_tags: list[str],
 64      lang_code: str = "unknown",
 65      lang_name: str = "unknown",
 66  ) -> tuple[list[Descendant], str, str]:
 67      # process list item node and <li> tag
 68      data_list = []
 69      before_word_raw_tags = []
 70      after_word = False
 71      for child in list_item.children:
 72          if isinstance(child, str) and child.strip().endswith(":"):
 73              lang_name = child.strip(": ") or "unknown"
 74              lang_code = name_to_code(lang_name, "zh") or "unknown"
 75          elif isinstance(child, str) and child.strip() == ",":  [75 ↛ 76: line 75 didn't jump to line 76 because the condition on line 75 was never true]
 76              after_word = False
 77          elif isinstance(child, HTMLNode) and child.tag == "span":
 78              extract_desc_span_tag(
 79                  wxr,
 80                  child,
 81                  data_list,
 82                  lang_code,
 83                  lang_name,
 84                  raw_tags,
 85                  before_word_raw_tags,
 86                  after_word,
 87              )
 88          elif (
 89              isinstance(child, HTMLNode)
 90              and child.tag == "i"
 91              and len(data_list) > 0
 92          ):
 93              for span_tag in child.find_html(
 94                  "span", attr_name="class", attr_value="Latn"
 95              ):
 96                  roman = clean_node(wxr, None, span_tag)
 97                  data_list[-1].roman = roman
 98                  if (
 99                      len(data_list) > 1
100                      and "Traditional-Chinese" in data_list[-2].tags
101                  ):
102                      data_list[-2].roman = roman
103          elif isinstance(child, TemplateNode) and child.template_name in [
104              "desctree",
105              "descendants tree",
106              "desc",
107              "descendant",
108              "ja-r",
109              "zh-l",
110              "zh-m",
111          ]:
112              if child.template_name.startswith("desc"):
113                  lang_code = child.template_parameters.get(1, "") or "unknown"
114              expanded_template = wxr.wtp.parse(
115                  wxr.wtp.node_to_wikitext(child), expand_all=True
116              )
117              new_data, new_l_code, new_l_name = process_desc_list_item(
118                  wxr,
119                  expanded_template,
120                  [],  # avoid add twice
121                  raw_tags,
122                  lang_code,
123                  lang_name,
124              )
125              data_list.extend(new_data)
126              # save lang data from desc template
127              lang_code = new_l_code
128              lang_name = new_l_name
129
130      for ul_tag in list_item.find_html("ul"):
131          for li_tag in ul_tag.find_html("li"):
132              process_desc_list_item(wxr, li_tag, data_list, [])
133      for next_list in list_item.find_child(NodeKind.LIST):
134          for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
135              process_desc_list_item(wxr, next_list_item, data_list, [])
136
137      for p_data in parent_data:
138          p_data.descendants.extend(data_list)
139      return data_list, lang_code, lang_name
140
141
142  def extract_desc_span_tag(
143      wxr: WiktextractContext,
144      span_tag: HTMLNode,
145      desc_lists: list[Descendant],
146      lang_code: str,
147      lang_name: str,
148      raw_tags: list[str],
149      before_word_raw_tags: list[str],
150      after_word: bool,
151  ) -> bool:
152      class_names = span_tag.attrs.get("class", "").split()
153      span_lang = span_tag.attrs.get("lang", "")
154      span_title = span_tag.attrs.get("title", "")
155      if ("tr" in class_names or span_lang.endswith("-Latn")) and len(
156          desc_lists
157      ) > 0:
158          roman = clean_node(wxr, None, span_tag)
159          desc_lists[-1].roman = roman
160          if len(desc_lists) > 1 and "Traditional-Chinese" in desc_lists[-2].tags:
161              desc_lists[-2].roman = roman
162      elif (
163          "qualifier-content" in class_names
164          or "gender" in class_names
165          or "label-content" in class_names
166      ) and len(desc_lists) > 0:
167          for raw_tag in re.split(r"，|,", clean_node(wxr, None, span_tag)):
168              raw_tag = raw_tag.strip()
169              if raw_tag == "":  [169 ↛ 170: line 169 didn't jump to line 170 because the condition on line 169 was never true]
170                  continue
171              if after_word:  [171 ↛ 172: line 171 didn't jump to line 172 because the condition on line 171 was never true]
172                  if raw_tag in TEMPLATE_TAG_ARGS:
173                      desc_lists[-1].tags.append(TEMPLATE_TAG_ARGS[raw_tag])
174                  else:
175                      desc_lists[-1].raw_tags.append(raw_tag)
176                      translate_raw_tags(desc_lists[-1])
177              else:
178                  before_word_raw_tags.append(raw_tag)
179      elif span_lang != "":
180          ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
181          desc_data = Descendant(
182              lang=lang_name,
183              lang_code=lang_code,
184              word=clean_node(wxr, None, nodes_without_ruby),
185              ruby=ruby_data,
186              raw_tags=before_word_raw_tags + raw_tags,
187          )
188          before_word_raw_tags.clear()
189          if desc_data.lang_code == "unknown":  [189 ↛ 190: line 189 didn't jump to line 190 because the condition on line 189 was never true]
190              desc_data.lang_code = span_lang
191          if "Hant" in class_names:
192              desc_data.tags.append("Traditional-Chinese")
193          elif "Hans" in class_names:
194              desc_data.tags.append("Simplified-Chinese")
195          if desc_data.word not in ["", "/"]:
196              translate_raw_tags(desc_data)
197              desc_lists.append(desc_data)
198              after_word = True
199      elif span_title != "" and clean_node(wxr, None, span_tag) in [
200          "→",
201          "⇒",
202          ">",
203          "?",
204      ]:
205          raw_tags.append(span_title)
206      elif "mention-gloss" in class_names and len(desc_lists) > 0:  [206 ↛ 207: line 206 didn't jump to line 207 because the condition on line 206 was never true]
207          desc_lists[-1].sense = clean_node(wxr, None, span_tag)
208
209      return after_word