Coverage for src/wiktextract/categories.py: 8%

# Extracting the category tree from Wiktionary

5from typing import TypedDict

7from wikitextprocessor.core import NamespaceDataEntry

9from .page import clean_node

10from .wxr_context import WiktextractContext

# Lua (Scribunto) module source that dumps Wiktionary's category tree data as
# plain text.  `export.main()` returns one newline-terminated record per
# category label:
#
#     label@@description[@@parent@@sort]...
#
# Newlines inside a description are replaced by the two characters `\n` so
# that every record stays on a single line.  Every key of
# "Module:category tree/data" is additionally emitted with an empty
# description and the single parent "Fundamental".
LUA_CODE = r"""
local export = {}

local topic_data = require("Module:category tree/topic cat/data")
local poscat_data = require("Module:category tree/poscatboiler/data")
local top_data = require("Module:category tree/data")

local function extract_tree(data, parts)
  for k, v in pairs(data.LABELS) do
    local desc = v.description or ""
    if type(desc) == "function" then
      -- Module:category tree/poscatboiler/data/non-lemma forms
      -- Turns out category tree can return a function where we
      -- expect a string, and the function is called with a `data`-
      -- table containing some kind of context data when appropriate,
      -- in a similar way to how all the {{{langname}}} calls are
      -- filled in when appropriate. However, we are just getting
      -- the "templates" here, so we don't have a context to call
      -- the function with: so instead just give it an empty table
      -- and hope the function has a sensible condition structure
      -- that first checks whether it should output a default:
      -- the gerund template in the above module does this.
      print("Function returned in description of category tree template `"..
            k.."`: "..tostring(v.description))
      desc = desc({})
    end
    if type(desc) ~= "string" then
      -- A description callback may return nil (or some other non-string)
      -- when called without context; coerce it so that the string
      -- concatenations below cannot abort the whole export.
      desc = desc == nil and "" or tostring(desc)
    end
    print( k..": "..desc )
    desc = string.gsub(desc, "\n", "\\n")
    table.insert(parts, k .. "@@" .. desc)
    if type(v.parents) == "table" then
      for kk, vv in pairs(v.parents) do
        local name
        local sort = ""
        if type(vv) == "table" then
          name = vv.name
          sort = vv.sort or ""
        else
          name = vv
        end
        if name then
          table.insert(parts, "@@" .. name .. "@@" .. sort)
        end
      end
    end
    table.insert(parts, "\n")
  end
end

function export.main()
  local parts = {}
  extract_tree(topic_data, parts)
  extract_tree(poscat_data, parts)
  for k, v in pairs(top_data) do
    table.insert(parts, k .. "@@@@Fundamental@@\n")
  end
  local ret = table.concat(parts, "")
  return ret
end

return export
"""

class CategoryEntry(TypedDict, total=False):
    """A single node of the extracted category tree (all keys optional)."""

    # Category name with any leading "Category:" prefix removed.
    name: str
    # Raw description text as emitted by the Lua category-tree dump.
    desc: str
    # Description after it has been passed through ``clean_node``.
    clean_desc: str
    # Names of the categories that list this category as a parent.
    children: list[str]
    # Non-blank sort keys given by children when naming this category
    # as their parent.
    sort: list[str]

class CategoryReturn(TypedDict, total=False):
    """Result of ``extract_categories`` (all keys optional)."""

    # Names of the root categories of the tree.
    roots: list[str]
    # Category entries keyed by lowercased category name.
    nodes: dict[str, CategoryEntry]

def extract_categories(wxr: WiktextractContext) -> CategoryReturn:
    """Extract the category tree from Wiktionary.

    Registers ``LUA_CODE`` as the Scribunto page "wiktextract cat tree" in
    the Module namespace, invokes its ``main`` function and parses the
    returned text.  Every non-empty line of that text is a record of the
    form ``name@@description[@@parent@@sort]...``; lines with fewer than
    two ``@@``-separated fields are ignored, as is a dangling parent field
    that has no matching sort field.

    Returns a dictionary with two keys:

    - ``"roots"``: ``"Fundamental"`` followed by the names of all categories
      that are neither reachable from "Fundamental" nor listed as a child of
      any category, ordered by lowercased name.  "Fundamental" is listed
      even when the expansion produced no such node.
    - ``"nodes"``: the category entries keyed by lowercased name; each
      entry's ``"children"`` list is sorted.
    """
    module_ns: NamespaceDataEntry = wxr.wtp.NAMESPACE_DATA["Module"]
    wxr.wtp.add_page(
        f"{module_ns['name']}:wiktextract cat tree",
        module_ns["id"],
        LUA_CODE,
        model="Scribunto",
    )
    wxr.wtp.start_page("Wiktextract category tree extraction")
    rawdata = wxr.wtp.expand("{{#invoke:wiktextract cat tree|main}}")

    ht: dict[str, CategoryEntry] = {}
    for line in rawdata.split("\n"):
        if not line:
            continue
        parts = line.split("@@")
        if len(parts) < 2:
            continue
        name = parts[0].removeprefix("Category:")
        desc = parts[1]
        clean_desc = clean_node(wxr, None, desc)
        dt = ht.setdefault(name.lower(), {"name": name})
        # The first non-empty description seen for a category wins.
        if desc and not dt.get("desc"):
            dt["desc"] = desc
        if clean_desc and not dt.get("clean_desc"):
            dt["clean_desc"] = clean_desc
        # The remaining fields are (parent, sort) pairs.  zip() drops a
        # dangling parent field without a sort field instead of raising
        # IndexError on a malformed record.
        for raw_parent, parent_sort in zip(parts[2::2], parts[3::2]):
            parent_name = raw_parent.removeprefix("Category:")
            p = ht.setdefault(parent_name.lower(), {"name": parent_name})
            p.setdefault("children", []).append(name)
            if parent_sort.strip():
                p.setdefault("sort", []).append(parent_sort)

    # Collect every category reachable from the "Fundamental" root.  An
    # explicit stack avoids recursion-depth limits, and ht.get() tolerates a
    # missing root (e.g. when the Lua expansion produced no usable output).
    seen: set[str] = set()
    stack = ["fundamental"]
    while stack:
        key = stack.pop()
        if key in seen:
            continue
        seen.add(key)
        entry = ht.get(key)
        if entry is None:
            continue
        stack.extend(child.lower() for child in entry.get("children", ()))

    is_child: set[str] = set()
    for entry in ht.values():
        is_child.update(child.lower() for child in entry.get("children", ()))

    # Keys of ht are already lowercased.  Anything that is neither reachable
    # from the root nor somebody's child becomes an additional root.
    notseen = [ht[key]["name"] for key in sorted(ht.keys() - seen - is_child)]

    # Sort lists of children
    for entry in ht.values():
        if "children" in entry:
            entry["children"].sort()

    roots = ["Fundamental"]
    roots.extend(notseen)
    ret: CategoryReturn = {"roots": roots, "nodes": ht}
    return ret