Coverage for src/wiktextract/extractor/en/descendant.py: 80%

124 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1from copy import deepcopy 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...datautils import data_append, data_extend 

13from ...page import clean_node 

14from ...tags import valid_tags 

15from ...wxr_context import WiktextractContext 

16from ..ruby import extract_ruby 

17from .type_utils import DescendantData, WordData 

18 

19 

def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordData,
    level_node: LevelNode,
    is_derived: bool,
):
    """Gather descendant entries under ``level_node`` into ``word_entry``.

    Two sources are combined, in this order:

    * templates yielded by ``level_node.find_child`` whose name is "cjkv"
      (compared case-insensitively), handled by ``extract_cjkv_template``;
    * every list node found recursively under ``level_node``, whose items
      are handled by ``extract_desc_list_item``.

    If ``is_derived`` is true, each collected top-level entry that does not
    already list "derived" in its "tags" gets it added through
    ``data_append``. The collected entries are passed to ``data_extend``
    under the "descendants" key of ``word_entry`` only when at least one
    was found.
    """
    descendants = []

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if not isinstance(template, TemplateNode):
            continue
        if template.template_name.lower() != "cjkv":
            continue
        descendants.extend(extract_cjkv_template(wxr, template))

    # Lists are searched recursively to get around an unnecessarily
    # pre-expanded "top" template. ``visited`` is shared with
    # extract_desc_list_item, which records the nested lists it descends
    # into, so those are skipped when the recursive search reaches them.
    visited = set()
    for wiki_list in level_node.find_child_recursively(NodeKind.LIST):
        if wiki_list in visited:
            continue
        visited.add(wiki_list)
        for item in wiki_list.find_child(NodeKind.LIST_ITEM):
            item_result = extract_desc_list_item(wxr, item, [], visited, [])
            descendants.extend(item_result[0])

    if is_derived:
        for entry in descendants:
            if "derived" in entry.get("tags", []):
                continue
            data_append(entry, "tags", "derived")

    if descendants:
        data_extend(word_entry, "descendants", descendants)

51 

52 

def extract_cjkv_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[DescendantData]:
    """Expand a template node and extract descendants from its lists.

    The node is converted back to wikitext and re-parsed with
    ``expand_all=True``. Every list node found recursively in the parsed
    result is then walked item by item with ``extract_desc_list_item``.

    Returns the entries produced for those list items, in encounter
    order; the list is empty when the expansion contains no list.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    results = []
    # Shared with extract_desc_list_item, which records the nested lists it
    # has already descended into so they are not processed a second time.
    visited = set()
    for wiki_list in root.find_child_recursively(NodeKind.LIST):
        if wiki_list in visited:
            continue
        visited.add(wiki_list)
        for item in wiki_list.find_child(NodeKind.LIST_ITEM):
            item_result = extract_desc_list_item(wxr, item, [], visited, [])
            results.extend(item_result[0])
    return results

70 

71 

def extract_desc_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    parent_data: list[DescendantData],
    seen_lists: set[WikiNode],
    raw_tags: list[str],
    lang_code: str = "unknown",
    lang_name: str = "unknown",
) -> tuple[list[DescendantData], str, str]:
    """Extract descendant entries from one list item and its nested lists.

    ``list_item`` is scanned child by child. Within this module it is
    called with a wikitext list item, an HTML ``<li>`` node, or the root
    node of an expanded template.

    Args:
        wxr: Extraction context; used for parsing, cleaning nodes and
            reading the current page title.
        list_item: Node whose children are processed.
        parent_data: Entries that receive the entries found here, via
            ``data_extend`` under the "descendants" key.
        seen_lists: List nodes already processed. Nested lists handled
            here are added to it in place.
        raw_tags: Raw tags applying to words in this item. It is handed to
            ``extract_desc_span_tag``, which may append to it.
        lang_code: Language code to use until the item states another.
        lang_name: Language name to use until the item states another.

    Returns:
        A tuple ``(entries, lang_code, lang_name)``: the entries found
        directly in this item and the language code and name in effect
        after processing it.
    """
    data_list = []
    before_word_raw_tags = []
    after_word = False
    for child in list_item.children:
        if isinstance(child, str) and child.strip().endswith(":"):
            # Text ending with a colon is taken as the language name.
            lang_name = child.strip(": \n") or "unknown"
            lang_code = name_to_code(lang_name, "en") or "unknown"
        elif isinstance(child, str) and child.strip() == ",":
            # A comma starts the next word; qualifiers seen from here on
            # are held back for that word.
            after_word = False
        elif isinstance(child, HTMLNode) and child.tag == "span":
            after_word = extract_desc_span_tag(
                wxr,
                child,
                data_list,
                lang_code,
                lang_name,
                raw_tags,
                before_word_raw_tags,
                after_word,
            )
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "i"
            and len(data_list) > 0
        ):
            for span_tag in child.find_html(
                "span", attr_name="class", attr_value="Latn"
            ):
                roman = clean_node(wxr, None, span_tag)
                if roman != "":
                    data_list[-1]["roman"] = roman
                    # The previous entry also gets the romanization when
                    # it is tagged as the Traditional Chinese form.
                    if len(
                        data_list
                    ) > 1 and "Traditional-Chinese" in data_list[-2].get(
                        "tags", []
                    ):
                        data_list[-2]["roman"] = roman
        elif isinstance(child, TemplateNode) and child.template_name in [
            "desctree",
            "descendants tree",
            "desc",
            "descendant",
            "ja-r",
            "zh-l",
            "zh-m",
            "link",  # used in Reconstruction pages
            "l",
        ]:
            if child.template_name.startswith("desc"):
                lang_code = child.template_parameters.get(1, "") or "unknown"
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            new_data, new_l_code, new_l_name = extract_desc_list_item(
                wxr,
                expanded_template,
                [],  # avoid add twice
                seen_lists,
                raw_tags,
                lang_code,
                lang_name,
            )
            data_list.extend(new_data)
            # save lang data from desc template
            lang_code = new_l_code
            lang_name = new_l_name

    # On Reconstruction pages an item that names a language but yields no
    # word still produces a language-only entry.
    if (
        wxr.wtp.title.startswith("Reconstruction:")
        and len(data_list) == 0
        and (lang_code != "unknown" or lang_name != "unknown")
    ):
        data = DescendantData(lang_code=lang_code, lang=lang_name)
        if len(raw_tags) > 0:
            # Store a copy: raw_tags may be the caller's own list (the
            # template branch above passes it through unchanged) and can
            # still be appended to after this entry has been built.
            data["raw_tags"] = list(raw_tags)
        data_list.append(data)

    # Nested items, written either as HTML <ul>/<li> or as wikitext lists,
    # become descendants of the entries found in this item.
    for ul_tag in list_item.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            extract_desc_list_item(wxr, li_tag, data_list, seen_lists, [])
    for next_list in list_item.find_child(NodeKind.LIST):
        if next_list in seen_lists:
            continue
        seen_lists.add(next_list)
        for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
            extract_desc_list_item(
                wxr, next_list_item, data_list, seen_lists, []
            )

    for p_data in parent_data:
        data_extend(p_data, "descendants", data_list)
    return data_list, lang_code, lang_name

174 

175 

def extract_desc_span_tag(
    wxr: WiktextractContext,
    span_tag: HTMLNode,
    desc_lists: list[DescendantData],
    lang_code: str,
    lang_name: str,
    raw_tags: list[str],
    before_word_raw_tags: list[str],
    after_word: bool,
) -> bool:
    """Interpret one ``<span>`` of a descendant list item.

    The span's "class", "lang" and "title" attributes decide how it is
    treated; the first matching case wins:

    * class "tr" or a "lang" ending in "-Latn", with an existing entry:
      romanization of the last entry;
    * class "qualifier-content", "gender" or "label-content", with an
      existing entry: comma-separated tags, attached to the last entry
      when ``after_word`` is true, otherwise held in
      ``before_word_raw_tags`` for the next word;
    * non-empty "lang": a new word entry, appended to ``desc_lists``
      unless its text is "" or "/";
    * non-empty "title" on a span whose text is one of "→", "⇒", ">",
      "?": the title is appended to ``raw_tags``;
    * class "mention-gloss", with an existing entry: the "sense" of the
      last entry.

    ``desc_lists``, ``raw_tags`` and ``before_word_raw_tags`` are modified
    in place.

    Returns:
        ``True`` once a span with a non-empty "lang" has been handled as a
        word; otherwise ``after_word`` unchanged.
    """
    class_names = span_tag.attrs.get("class", "").split()
    span_lang = span_tag.attrs.get("lang", "")
    span_title = span_tag.attrs.get("title", "")
    if ("tr" in class_names or span_lang.endswith("-Latn")) and len(
        desc_lists
    ) > 0:
        roman = clean_node(wxr, None, span_tag)
        if roman != "":
            # Reuse the text cleaned above rather than cleaning the same
            # node a second time.
            desc_lists[-1]["roman"] = roman
            # The previous entry also gets the romanization when it is
            # tagged as the Traditional Chinese form.
            if len(desc_lists) > 1 and "Traditional-Chinese" in desc_lists[
                -2
            ].get("tags", []):
                desc_lists[-2]["roman"] = roman
    elif (
        "qualifier-content" in class_names
        or "gender" in class_names
        or "label-content" in class_names
    ) and len(desc_lists) > 0:
        for raw_tag in clean_node(wxr, None, span_tag).split(","):
            raw_tag = raw_tag.strip()
            if raw_tag != "":
                if after_word:
                    data_append(
                        desc_lists[-1],
                        "tags" if raw_tag in valid_tags else "raw_tags",
                        raw_tag,
                    )
                else:
                    before_word_raw_tags.append(raw_tag)
    elif span_lang != "":
        ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
        desc_data = DescendantData(
            lang=lang_name,
            lang_code=lang_code,
            word=clean_node(wxr, None, nodes_without_ruby),
        )
        for raw_tag_list in [before_word_raw_tags, raw_tags]:
            for raw_tag in raw_tag_list:
                data_append(
                    desc_data,
                    "tags" if raw_tag in valid_tags else "raw_tags",
                    raw_tag,
                )
        # Held-back qualifiers belong to this word only.
        before_word_raw_tags.clear()
        if len(ruby_data) > 0:
            desc_data["ruby"] = ruby_data
        if desc_data["lang_code"] == "unknown":
            # Fall back to the span's own "lang" attribute.
            desc_data["lang_code"] = span_lang
        if "Hant" in class_names:
            data_append(desc_data, "tags", "Traditional-Chinese")
        elif "Hans" in class_names:
            data_append(desc_data, "tags", "Simplified-Chinese")
        if desc_data["word"] not in ["", "/"]:
            desc_lists.append(deepcopy(desc_data))
        after_word = True
    elif span_title != "" and clean_node(wxr, None, span_tag) in [
        "→",
        "⇒",
        ">",
        "?",
    ]:
        raw_tags.append(span_title)
    elif "mention-gloss" in class_names and len(desc_lists) > 0:
        sense = clean_node(wxr, None, span_tag)
        if sense != "":
            desc_lists[-1]["sense"] = sense

    return after_word