Coverage for src/wiktextract/extractor/en/descendant.py: 83%

154 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-06-23 09:14 +0000

1from copy import deepcopy 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...datautils import data_append, data_extend 

13from ...page import clean_node 

14from ...tags import valid_tags 

15from ...wxr_context import WiktextractContext 

16from ..ruby import extract_ruby 

17from .type_utils import DescendantData, WordData 

18 

19 

def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordData,
    level_node: LevelNode,
    is_derived: bool,
):
    """Gather the descendants of one section and store them on `word_entry`.

    Two sources are scanned: "CJKV" templates that are direct children of
    `level_node` (template name compared case-insensitively), and every list
    found anywhere below `level_node`. When `is_derived` is true, each
    top-level result that lacks the "derived" tag gets it. `word_entry` is
    only touched when at least one descendant was found.
    """
    descendants = []
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if not isinstance(template, TemplateNode):
            continue
        if template.template_name.lower() == "cjkv":
            descendants.extend(extract_cjkv_template(wxr, template))

    # Lists are searched recursively to get around an unnecessarily
    # pre-expanded "top" template. `visited_lists` is shared with
    # extract_desc_list_item(), which records the nested lists it handles,
    # so those are skipped when the recursive search reaches them.
    visited_lists = set()
    for wiki_list in level_node.find_child_recursively(NodeKind.LIST):
        if wiki_list in visited_lists:
            continue
        visited_lists.add(wiki_list)
        for item in wiki_list.find_child(NodeKind.LIST_ITEM):
            item_data, _, _ = extract_desc_list_item(
                wxr, item, [], visited_lists, []
            )
            descendants.extend(item_data)

    if is_derived:
        for desc in descendants:
            if "derived" in desc.get("tags", []):
                continue
            data_append(desc, "tags", "derived")
    if len(descendants) != 0:
        data_extend(word_entry, "descendants", descendants)

51 

52 

def extract_cjkv_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[DescendantData]:
    """Expand a "CJKV" template and extract descendants from its lists.

    The template is converted back to wikitext, parsed again with full
    expansion, and every list item found in the result is passed to
    extract_desc_list_item(). Returns the collected top-level entries, which
    is an empty list when the expansion contains no list.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    results = []
    # Shared with extract_desc_list_item() so nested lists it already
    # walked are not processed a second time by the recursive search.
    visited_lists = set()
    for wiki_list in root.find_child_recursively(NodeKind.LIST):
        if wiki_list in visited_lists:
            continue
        visited_lists.add(wiki_list)
        for item in wiki_list.find_child(NodeKind.LIST_ITEM):
            item_data, _, _ = extract_desc_list_item(
                wxr, item, [], visited_lists, []
            )
            results.extend(item_data)
    return results

70 

71 

def extract_desc_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    parent_descendant_datas: list[DescendantData],
    seen_lists: set[WikiNode],
    raw_tags: list[str],
    lang_code: str = "unknown",
    lang_name: str = "unknown",
) -> tuple[list[DescendantData], str, str]:
    """Extract descendant entries from one list item, `<li>` tag or
    expanded template root.

    Args:
        wxr: Extraction context, used for template expansion and text
            cleaning.
        list_item: Node whose children are scanned.
        parent_descendant_datas: Entries of the enclosing item; the entries
            found here are added to the "descendants" of each of them.
        seen_lists: Nested wiki lists handled here are added to this set so
            callers that search recursively can skip them.
        raw_tags: Tags applied to every word found in this item. The list is
            also appended to when an arrow-like `<span>` with a title is met.
        lang_code: Language code inherited from the caller.
        lang_name: Language name inherited from the caller.

    Returns:
        A tuple of the entries found directly in this item, and the language
        code and language name in effect after scanning it.
    """
    # process list item node and <li> tag
    data_list = []
    # qualifiers seen after a "," separator, waiting for the next word
    before_word_raw_tags = []
    after_word = False
    for child in list_item.children:
        if isinstance(child, str):
            child = child.strip()
            if child == ",":
                # separator between words: following qualifiers belong to
                # the next word, not the previous one
                after_word = False
            elif child.endswith(":"):
                lang_name = child.strip(": \n") or "unknown"
                lang_code = (
                    choose_more_specific_langcode(
                        name_to_code(lang_name, "en"), lang_code
                    )
                    or "unknown"
                )
            # NOTE(review): unlike the other lookups in this function, this
            # call passes no "en" language argument - confirm it is intended.
            elif lcode := name_to_code(child):
                lang_name = child
                # Compare against the inherited code. Assigning `lcode` to
                # `lang_code` first would make the helper compare the code
                # with itself and always discard a more specific inherited
                # code such as "fa-cls".
                lang_code = (
                    choose_more_specific_langcode(lcode, lang_code) or "unknown"
                )
            elif lname := does_text_look_like_language_name(child):
                lang_name = lname
                lang_code = (
                    choose_more_specific_langcode(
                        name_to_code(lang_name, "en"), lang_code
                    )
                    or "unknown"
                )
        elif isinstance(child, HTMLNode) and child.tag == "span":
            after_word = extract_desc_span_tag(
                wxr,
                child,
                data_list,
                lang_code,
                lang_name,
                raw_tags,
                before_word_raw_tags,
                after_word,
            )
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "i"
            and len(data_list) > 0
        ):
            for span_tag in child.find_html(
                "span", attr_name="class", attr_value="Latn"
            ):
                roman = clean_node(wxr, None, span_tag)
                if roman != "":
                    data_list[-1]["roman"] = roman
                    # a traditional form directly before the last entry
                    # shares the same romanization
                    if len(
                        data_list
                    ) > 1 and "Traditional-Chinese" in data_list[-2].get(
                        "tags", []
                    ):
                        data_list[-2]["roman"] = roman
        elif isinstance(child, TemplateNode) and child.template_name in [
            "desctree",
            "descendants tree",
            "desc",
            "descendant",
            "ja-r",
            "zh-l",
            "zh-m",
            "link",  # used in Reconstruction pages
            "l",
        ]:
            if child.template_name.startswith("desc"):
                lang_code = child.template_parameters.get(1, "") or "unknown"
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            new_data, new_l_code, new_l_name = extract_desc_list_item(
                wxr,
                expanded_template,
                [],  # avoid add twice
                seen_lists,
                raw_tags,
                lang_code,
                lang_name,
            )
            data_list.extend(new_data)
            # save lang data from desc template
            lang_code = new_l_code
            lang_name = new_l_name

    # A language was recognised but no word: keep a placeholder entry so
    # nested descendants still have a parent to attach to.
    if len(data_list) == 0 and (
        lang_code != "unknown" or lang_name != "unknown"
    ):
        data = DescendantData(lang_code=lang_code, lang=lang_name)
        if len(raw_tags) > 0:
            # Store a copy: `raw_tags` belongs to the caller and may be
            # appended to later, which must not change this entry.
            data["raw_tags"] = list(raw_tags)
        data_list.append(data)

    # Nested items, as HTML <ul>/<li> tags or as wiki lists, become
    # descendants of the entries found in this item.
    for ul_tag in list_item.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            extract_desc_list_item(wxr, li_tag, data_list, seen_lists, [])
    for next_list in list_item.find_child(NodeKind.LIST):
        if next_list in seen_lists:
            continue
        seen_lists.add(next_list)
        for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
            extract_desc_list_item(
                wxr, next_list_item, data_list, seen_lists, []
            )

    for p_data in parent_descendant_datas:
        data_extend(p_data, "descendants", data_list)
    return data_list, lang_code, lang_name

193 

194 

def extract_desc_span_tag(
    wxr: WiktextractContext,
    span_tag: HTMLNode,
    desc_lists: list[DescendantData],
    lang_code: str,
    lang_name: str,
    raw_tags: list[str],
    before_word_raw_tags: list[str],
    after_word: bool,
) -> bool:
    """Interpret one `<span>` of a descendant list item.

    The span is classified by its "class", "lang" and "title" attributes,
    tested in this order:

    * romanization ("tr" class or a "lang" ending in "-Latn"): stored as
      "roman" on the last entry;
    * qualifier, gender or label content: comma-separated tags, added to the
      last entry when `after_word` is true, otherwise queued in
      `before_word_raw_tags` for the next word;
    * any other span with a "lang" attribute: a new word entry is appended
      to `desc_lists`;
    * a titled span whose text is one of "→", "⇒", ">", "?": the title is
      appended to `raw_tags`;
    * "mention-gloss": stored as "sense" on the last entry.

    Args:
        wxr: Extraction context used for text cleaning.
        span_tag: The `<span>` node to interpret.
        desc_lists: Entries collected so far for the current item; modified
            in place.
        lang_code: Language code for new entries. When it is "unknown" the
            span's own "lang" attribute is used instead.
        lang_name: Language name for new entries.
        raw_tags: Tags applied to every new word; may be appended to.
        before_word_raw_tags: Queued tags for the next word; consumed and
            cleared when a word span is processed.
        after_word: Whether a word span has already been seen since the last
            separator.

    Returns:
        The updated `after_word` flag: True once a word span was processed,
        otherwise the value passed in.
    """
    class_names = span_tag.attrs.get("class", "").split()
    span_lang = span_tag.attrs.get("lang", "")
    span_title = span_tag.attrs.get("title", "")
    if ("tr" in class_names or span_lang.endswith("-Latn")) and len(
        desc_lists
    ) > 0:
        roman = clean_node(wxr, None, span_tag)
        if roman != "":
            # reuse the cleaned text instead of cleaning the node again
            desc_lists[-1]["roman"] = roman
            # a traditional form directly before the last entry shares the
            # same romanization
            if len(desc_lists) > 1 and "Traditional-Chinese" in desc_lists[
                -2
            ].get("tags", []):
                desc_lists[-2]["roman"] = roman
    elif (
        "qualifier-content" in class_names
        or "gender" in class_names
        or "label-content" in class_names
    ) and len(desc_lists) > 0:
        for raw_tag in clean_node(wxr, None, span_tag).split(","):
            raw_tag = raw_tag.strip()
            if raw_tag != "":
                if after_word:
                    data_append(
                        desc_lists[-1],
                        "tags" if raw_tag in valid_tags else "raw_tags",
                        raw_tag,
                    )
                else:
                    before_word_raw_tags.append(raw_tag)
    elif span_lang != "":
        ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
        desc_data = DescendantData(
            lang=lang_name,
            lang_code=lang_code,
            word=clean_node(wxr, None, nodes_without_ruby),
        )
        for raw_tag_list in [before_word_raw_tags, raw_tags]:
            for raw_tag in raw_tag_list:
                data_append(
                    desc_data,
                    "tags" if raw_tag in valid_tags else "raw_tags",
                    raw_tag,
                )
        # queued qualifiers apply to this word only
        before_word_raw_tags.clear()
        if len(ruby_data) > 0:
            desc_data["ruby"] = ruby_data
        if desc_data["lang_code"] == "unknown":
            desc_data["lang_code"] = span_lang
        if "Hant" in class_names:
            data_append(desc_data, "tags", "Traditional-Chinese")
        elif "Hans" in class_names:
            data_append(desc_data, "tags", "Simplified-Chinese")
        if desc_data["word"] not in ["", "/"]:
            desc_lists.append(deepcopy(desc_data))
        # set even when the word was rejected above
        after_word = True
    elif span_title != "" and clean_node(wxr, None, span_tag) in [
        "→",
        "⇒",
        ">",
        "?",
    ]:
        raw_tags.append(span_title)
    elif "mention-gloss" in class_names and len(desc_lists) > 0:
        sense = clean_node(wxr, None, span_tag)
        if sense != "":
            desc_lists[-1]["sense"] = sense

    return after_word

273 

274 

def does_text_look_like_language_name(text: str) -> str | None:
    """Guess whether `text` is the name of a language.

    The stripped text is returned when any of its words (hyphens count as
    spaces) is accepted by name_to_code() as an English language name, when
    it has two or more words that all start with an uppercase letter, or
    when it has fewer than two words and ends with "ic", "ish" or "an".
    Otherwise, and for blank input, None is returned.
    """
    candidate = text.strip()
    if candidate == "":
        return None
    words = candidate.replace("-", " ").split()
    for word in words:
        if name_to_code(word.strip(), "en"):
            return candidate
    if len(words) < 2:
        # single word: fall back to typical language-name suffixes
        looks_like_name = candidate.endswith(("ic", "ish", "an"))
    else:
        # several words: every one must be capitalized
        looks_like_name = all(w != "" and w[0].isupper() for w in words)
    return candidate if looks_like_name else None

289 

290 

291def choose_more_specific_langcode(new: str | None, old: str) -> str | None: 

292 if old == "unknown": 

293 return new 

294 if new is None or new == "": 

295 return old 

296 if old.startswith(new + "-"): 296 ↛ 298line 296 didn't jump to line 298 because the condition on line 296 was never true

297 # "fa-cls" or "fa" -> "fa-cls" 

298 return old 

299 return new