Coverage for src/wiktextract/categories.py: 8%

68 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-11 10:26 +0000

1# Extracting the category tree from Wiktionary 

2# 

3# Copyright (c) 2021 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5from typing import TypedDict 

6 

7from wikitextprocessor.core import NamespaceDataEntry 

8 

9from .page import clean_node 

10from .wxr_context import WiktextractContext 

11 

# Source of the Lua module that extract_categories() registers as
# "<Module namespace>:wiktextract cat tree" and invokes through `main`.
# It flattens the wiki's category-tree data modules into text records,
# one per line:
#
#     <label>@@<description>[@@<parent name>@@<sort key>]...
#
# Newlines inside a description are replaced by a literal backslash-n so
# that each record stays on a single line.  The string is raw, so the
# backslash escapes below are interpreted by Lua, not by Python.
LUA_CODE = r"""
local export = {}

-- Keep the loaded data modules local so nothing leaks into the global
-- environment of the Lua sandbox.
local topic_data = require("Module:category tree/topic cat/data")
local poscat_data = require("Module:category tree/poscatboiler/data")
local top_data = require("Module:category tree/data")

-- Appends one record per entry of data.LABELS to the `parts` array.
local function extract_tree(data, parts)
  for k, v in pairs(data.LABELS) do
    local desc = v.description or ""
    if type(desc) == "function" then
      -- Module:category tree/poscatboiler/data/non-lemma forms
      -- Turns out category tree can return a function where we
      -- expect a string, and the function is called with a `data`-
      -- table containing some kind of context data when appropriate,
      -- in a similar way to how all the {{{langname}}} calls are
      -- filled in when appropriate. However, we are just getting
      -- the "templates" here, so we don't have a context to call
      -- the function with: so instead just give it an empty table
      -- and hope the function has a sensible condition structure
      -- that first checks whether it should output a default:
      -- the gerund template in the above url does this.
      print("Function returned in description of category tree template `"..
            k.."`: "..tostring(v.description))
      -- The function may return nothing for an empty context; fall back
      -- to an empty description so the concatenations below cannot fail.
      desc = desc({}) or ""
    end
    print( k..": "..desc )
    desc = string.gsub(desc, "\n", "\\n")
    table.insert(parts, k .. "@@" .. desc)
    if type(v.parents) == "table" then
      for kk, vv in pairs(v.parents) do
        local name
        local sort = ""
        if type(vv) == "table" then
          name = vv.name
          sort = vv.sort or ""
        else
          name = vv
        end
        if name then
          table.insert(parts, "@@" .. name .. "@@" .. sort)
        end
      end
    end
    table.insert(parts, "\n")
  end
end

function export.main()
  local parts = {}
  extract_tree(topic_data, parts)
  extract_tree(poscat_data, parts)
  -- Every key of the top-level data module is recorded as a direct
  -- child of "Fundamental" with an empty description and sort key.
  for k, v in pairs(top_data) do
    table.insert(parts, k .. "@@@@Fundamental@@\n")
  end
  local ret = table.concat(parts, "")
  return ret
end

return export
"""

73 

class CategoryEntry(TypedDict, total=False):
    """One node of the extracted category tree; every key is optional."""

    # Category name in its original case, "Category:" prefix removed.
    name: str
    # Description text as emitted by the Lua extraction module.
    desc: str
    # The description after being passed through clean_node().
    clean_desc: str
    # Names of the categories that list this one as a parent.
    children: list[str]
    # Non-blank sort keys supplied by children that reference this node.
    sort: list[str]

85 

class CategoryReturn(TypedDict, total=False):
    """Result of extract_categories(); every key is optional."""

    # Names of the top-level categories, starting with "Fundamental".
    roots: list[str]
    # All category nodes, keyed by lowercased category name.
    nodes: dict[str, CategoryEntry]

94 

95 

def _parse_category_tree(
    wxr: WiktextractContext, rawdata: str
) -> dict[str, CategoryEntry]:
    """Parse the text records produced by LUA_CODE into category nodes.

    Each non-empty line has the form
    ``name@@desc[@@parent@@sort]...``.  Nodes are keyed by lowercased
    name; a parent that has no record of its own still gets a node.
    """
    nodes: dict[str, CategoryEntry] = {}
    for line in rawdata.split("\n"):
        if not line:
            continue
        parts = line.split("@@")
        if len(parts) < 2:
            continue
        name = parts[0].removeprefix("Category:")
        desc = parts[1]
        clean_desc = clean_node(wxr, None, desc)
        entry = nodes.setdefault(name.lower(), {"name": name})
        # The first non-empty description seen for a name wins.
        if desc and not entry.get("desc"):
            entry["desc"] = desc
        if clean_desc and not entry.get("clean_desc"):
            entry["clean_desc"] = clean_desc
        # Remaining fields come in (parent name, sort key) pairs.  Pad the
        # sort keys with one "" so that a record ending in a parent name
        # without a sort key is still accepted instead of raising
        # IndexError.
        parent_names = parts[2::2]
        parent_sorts = parts[3::2] + [""]
        for raw_parent, parent_sort in zip(parent_names, parent_sorts):
            parent_name = raw_parent.removeprefix("Category:")
            parent = nodes.setdefault(
                parent_name.lower(), {"name": parent_name}
            )
            parent.setdefault("children", []).append(name)
            if parent_sort.strip():
                parent.setdefault("sort", []).append(parent_sort)
    return nodes


def _find_extra_roots(nodes: dict[str, CategoryEntry]) -> list[str]:
    """Return names of nodes that are neither reachable from "fundamental"
    nor listed as a child of any node, sorted by lowercased name."""
    reachable: set[str] = set()
    # Iterative walk: no recursion-depth limit, and the `reachable` check
    # guards against cycles.  A missing "fundamental" node is skipped
    # rather than raising KeyError.
    pending = ["fundamental"]
    while pending:
        key = pending.pop()
        if key in reachable or key not in nodes:
            continue
        reachable.add(key)
        pending.extend(
            child.lower() for child in nodes[key].get("children", ())
        )

    is_child: set[str] = set()
    for entry in nodes.values():
        is_child.update(child.lower() for child in entry.get("children", ()))

    # Keys of `nodes` are already lowercased.
    orphans = set(nodes) - reachable - is_child
    return [nodes[key]["name"] for key in sorted(orphans)]


def extract_categories(wxr: WiktextractContext) -> CategoryReturn:
    """Extracts the category tree from Wiktionary.

    Registers LUA_CODE as a Scribunto module page, invokes its ``main``
    function through template expansion, and parses the resulting text.

    Returns a dict with:
      - "roots": "Fundamental" followed by the names of nodes that are
        neither reachable from it nor a child of any node;
      - "nodes": all category entries keyed by lowercased name, with each
        "children" list sorted.
    """
    module_ns: NamespaceDataEntry = wxr.wtp.NAMESPACE_DATA["Module"]
    wxr.wtp.add_page(
        f"{module_ns['name']}:wiktextract cat tree",
        module_ns["id"],
        LUA_CODE,
        model="Scribunto",
    )
    wxr.wtp.start_page("Wiktextract category tree extraction")
    rawdata = wxr.wtp.expand("{{#invoke:wiktextract cat tree|main}}")

    nodes = _parse_category_tree(wxr, rawdata)
    extra_roots = _find_extra_roots(nodes)

    # Sort lists of children
    for entry in nodes.values():
        if "children" in entry:
            entry["children"] = sorted(entry["children"])

    ret: CategoryReturn = {
        "roots": ["Fundamental", *extra_roots],
        "nodes": nodes,
    }
    return ret