Coverage for src/wiktextract/categories.py: 8%

69 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1# Extracting the category tree from Wiktionary 

2# 

3# Copyright (c) 2021 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5from typing import ( 

6 Optional, 

7 TypedDict, 

8) 

9 

10from wikitextprocessor.core import NamespaceDataEntry 

11 

12from .page import clean_node 

13from .wxr_context import WiktextractContext 

14 

# Source of a Scribunto (Lua) module that dumps the Wiktionary category tree
# as text.  `export.main()` returns one record per line in the form
#
#     name@@description[@@parent@@sort]...
#
# Newlines inside a description are emitted as a literal backslash followed by
# "n" so that every record stays on a single line.  Every key of the top-level
# data module is emitted with an empty description and "Fundamental" as its
# only parent.
LUA_CODE = r"""
local export = {}

local topic_data = require("Module:category tree/topic cat/data")
local poscat_data = require("Module:category tree/poscatboiler/data")
local top_data = require("Module:category tree/data")

local function extract_tree(data, parts)
  for k, v in pairs(data.LABELS) do
    local desc = v.description or ""
    if type(desc) == "function" then
      -- Module:category tree/poscatboiler/data/non-lemma forms
      -- Turns out category tree can return a function where we
      -- expect a string, and the function is called with a `data`-
      -- table containing some kind of context data when appropriate,
      -- in a similar way to how all the {{{langname}}} calls are
      -- filled in when appropriate. However, we are just getting
      -- the "templates" here, so we don't have a context to call
      -- the function with: so instead just give it an empty table
      -- and hope the function has a sensible condition structure
      -- that first checks whether it should output a default:
      -- the gerund template in the above url does this.
      print("Function returned in description of category tree template `"..
            k.."`: "..tostring(v.description))
      -- A nil result would break the concatenations below.
      desc = desc({}) or ""
    end
    print( k..": "..desc )
    desc = string.gsub(desc, "\n", "\\n")
    table.insert(parts, k .. "@@" .. desc)
    -- Labels without a parents table are emitted without parents.
    for _, vv in pairs(v.parents or {}) do
      local name
      local sort = ""
      if type(vv) == "table" then
        name = vv.name
        sort = vv.sort or ""
      else
        name = vv
      end
      if name then
        table.insert(parts, "@@" .. name .. "@@" .. sort)
      end
    end
    table.insert(parts, "\n")
  end
end

function export.main()
  local parts = {}
  extract_tree(topic_data, parts)
  extract_tree(poscat_data, parts)
  for k, _ in pairs(top_data) do
    table.insert(parts, k .. "@@@@Fundamental@@\n")
  end
  local ret = table.concat(parts, "")
  return ret
end

return export
"""

74 

class CategoryEntry(TypedDict, total=False):
    """One node of the category tree; every key is optional."""

    # Category name as first seen, with any "Category:" prefix removed.
    name: str
    # Raw description text for the category.
    desc: str
    # Description after it has been passed through clean_node().
    clean_desc: str
    # Names of the categories that list this one as a parent.
    children: list[str]
    # Non-blank sort keys supplied by children for this parent.
    sort: list[str]

86 

class CategoryReturn(TypedDict, total=False):
    """Result of extract_categories(); every key is optional."""

    # Names of the root categories of the tree.
    roots: list[str]
    # Category entries keyed by lower-cased category name.
    nodes: dict[str, CategoryEntry]

95 

def extract_categories(wxr: WiktextractContext) -> CategoryReturn:
    """Extract the category tree from Wiktionary.

    Registers ``LUA_CODE`` as a Scribunto module page in the "Module"
    namespace, expands ``{{#invoke:wiktextract cat tree|main}}`` and parses
    the resulting text.  Each record is one line of the form
    ``name@@desc[@@parent@@sort]...``; lines that contain no ``@@``
    separator (blank lines, or anything that is not a record) are skipped.

    Returns a dict with two keys:

    - ``"nodes"``: maps each lower-cased category name (with any
      ``Category:`` prefix removed) to its ``CategoryEntry``.  ``children``
      lists are sorted.
    - ``"roots"``: ``"Fundamental"`` followed by the names of all nodes that
      are neither reachable from "Fundamental" nor a child of any node,
      ordered by their lower-cased name.

    Raises ``AssertionError`` if the context has no "Module" namespace.
    """
    module_ns: Optional[NamespaceDataEntry] = wxr.wtp.NAMESPACE_DATA.get(
        "Module", None
    )
    assert module_ns is not None
    module_ns_local_name = module_ns.get("name")
    module_ns_id = module_ns.get("id")
    wxr.wtp.add_page(
        f"{module_ns_local_name}:wiktextract cat tree",
        module_ns_id,
        LUA_CODE,
        model="Scribunto",
    )
    wxr.wtp.start_page("Wiktextract category tree extraction")
    rawdata = wxr.wtp.expand("{{#invoke:wiktextract cat tree|main}}")

    ht: dict[str, CategoryEntry] = {}
    for line in rawdata.split("\n"):
        # Every record has at least the name@@desc pair; a line without the
        # separator cannot be parsed and used to raise IndexError.
        if "@@" not in line:
            continue
        parts = line.split("@@")
        name = parts[0].removeprefix("Category:")
        desc = parts[1]
        name_lc = name.lower()
        clean_desc = clean_node(wxr, None, desc)
        dt = ht.setdefault(name_lc, {"name": name})
        # The same category may appear in several records; keep the first
        # non-empty description.
        if desc and not dt.get("desc"):
            dt["desc"] = desc
        if clean_desc and not dt.get("clean_desc"):
            dt["clean_desc"] = clean_desc

        # The remaining fields are (parent, sort) pairs.  Tolerate a missing
        # final sort key instead of raising IndexError.
        parent_fields = parts[2:]
        if len(parent_fields) % 2:
            parent_fields.append("")
        for parent_name, parent_sort in zip(
            parent_fields[::2], parent_fields[1::2]
        ):
            parent_name = parent_name.removeprefix("Category:")
            p = ht.setdefault(parent_name.lower(), {"name": parent_name})
            p.setdefault("children", []).append(name)
            if parent_sort.strip():
                p.setdefault("sort", []).append(parent_sort)

    # Collect everything reachable from "Fundamental".  An explicit stack
    # avoids Python's recursion limit, and the lookup tolerates a tree that
    # has no "Fundamental" node (e.g. an empty expansion).
    seen: set[str] = set()
    stack = ["fundamental"]
    while stack:
        key = stack.pop()
        if key in seen:
            continue
        node = ht.get(key)
        if node is None:
            continue
        seen.add(key)
        stack.extend(child.lower() for child in node.get("children", ()))

    is_child = {
        child.lower() for v in ht.values() for child in v.get("children", ())
    }

    # Keys of ht are already lower-cased.
    notseen = [ht[key]["name"] for key in sorted(ht.keys() - seen - is_child)]

    # Sort lists of children
    for v in ht.values():
        if "children" in v:
            v["children"].sort()

    ret: CategoryReturn = {"roots": ["Fundamental", *notseen], "nodes": ht}
    return ret