Coverage for src/wiktextract/categories.py: 8%
68 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-11 10:26 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-11 10:26 +0000
1# Extracting the category tree from Wiktionary
2#
3# Copyright (c) 2021 Tatu Ylonen. See file LICENSE and https://ylonen.org
5from typing import TypedDict
7from wikitextprocessor.core import NamespaceDataEntry
9from .page import clean_node
10from .wxr_context import WiktextractContext
# Scribunto (Lua) module source that dumps Wiktionary's category-tree label
# data as text.  `export.main()` returns one record per line in the form
#
#     <label>@@<description>[@@<parent name>@@<parent sort key>]...
#
# Newlines inside a description are emitted as the two characters `\n` so
# that every record stays on a single line.  Because this is a raw Python
# string, the backslash escapes below reach the Lua parser verbatim.
LUA_CODE = r"""
local export = {}

local topic_data = require("Module:category tree/topic cat/data")
local poscat_data = require("Module:category tree/poscatboiler/data")
local top_data = require("Module:category tree/data")

local function extract_tree(data, parts)
  for k, v in pairs(data.LABELS) do
    local desc = v.description or ""
    if type(desc) == "function" then
      -- Module:category tree/poscatboiler/data/non-lemma forms
      -- Turns out category tree can return a function where we
      -- expect a string, and the function is called with a `data`-
      -- table containing some kind of context data when appropriate,
      -- in a similar way to how all the {{{langname}}} calls are
      -- filled in when appropriate. However, we are just getting
      -- the "templates" here, so we don't have a context to call
      -- the function with: so instead just give it an empty table
      -- and hope the function has a sensible condition structure
      -- that first checks whether it should output a default:
      -- the gerund template in the above url does this.
      print("Function returned in description of category tree template `"..
            k.."`: "..tostring(v.description))
      -- The empty context is a guess, so guard the call: one description
      -- function that cannot cope with it must not abort the whole dump.
      local ok, result = pcall(desc, {})
      desc = ok and result or ""
    end
    if type(desc) ~= "string" then
      -- Anything other than a string cannot be concatenated/escaped below.
      desc = ""
    end
    print( k..": "..desc )
    desc = string.gsub(desc, "\n", "\\n")
    table.insert(parts, k .. "@@" .. desc)
    if type(v.parents) == "table" then
      for kk, vv in pairs(v.parents) do
        local name
        local sort = ""
        if type(vv) == "table" then
          name = vv.name
          sort = vv.sort or ""
        else
          name = vv
        end
        if name then
          table.insert(parts, "@@" .. name .. "@@" .. sort)
        end
      end
    end
    table.insert(parts, "\n")
  end
end

function export.main()
  local parts = {}
  extract_tree(topic_data, parts)
  extract_tree(poscat_data, parts)
  for k, v in pairs(top_data) do
    table.insert(parts, k .. "@@@@Fundamental@@\n")
  end
  local ret = table.concat(parts, "")
  return ret
end

return export
"""
class CategoryEntry(TypedDict, total=False):
    """One node of the extracted category tree; every key is optional."""

    # Category name with any leading "Category:" prefix removed.
    name: str
    # Description text exactly as returned by the Lua extraction module.
    desc: str
    # The description after being passed through clean_node().
    clean_desc: str
    # Names of the categories that list this one as a parent.
    children: list[str]
    # Non-blank sort keys recorded on this node by its child links.
    sort: list[str]
class CategoryReturn(TypedDict, total=False):
    """Result of extract_categories(); every key is optional."""

    # Display names of the top-level categories of the tree.
    roots: list[str]
    # Category entries keyed by lowercased category name.
    nodes: dict[str, CategoryEntry]
def extract_categories(wxr: WiktextractContext) -> CategoryReturn:
    """Extract the category tree from Wiktionary.

    Adds ``LUA_CODE`` as a Scribunto module page named
    "wiktextract cat tree", invokes its ``main`` function, and parses the
    returned text.  Each useful output line has the form
    ``name@@desc[@@parent@@sort]...``; lines with fewer than two fields
    are ignored.

    Args:
        wxr: Context whose ``wtp`` processor is used to add the helper
            module, start a page, and expand the ``#invoke`` call.

    Returns:
        A dict with two keys:

        * ``"nodes"`` maps each lowercased category name to its entry.
          Child lists are sorted.
        * ``"roots"`` lists ``"Fundamental"`` first (when such a node
          exists), followed by the names of all other nodes that are not
          a child of any node, ordered by lowercased name.
    """
    module_ns: NamespaceDataEntry = wxr.wtp.NAMESPACE_DATA["Module"]
    wxr.wtp.add_page(
        f"{module_ns['name']}:wiktextract cat tree",
        module_ns["id"],
        LUA_CODE,
        model="Scribunto",
    )
    wxr.wtp.start_page("Wiktextract category tree extraction")
    rawdata = wxr.wtp.expand("{{#invoke:wiktextract cat tree|main}}")

    ht: dict[str, CategoryEntry] = {}
    # Lowercased names of every category that has at least one parent.
    is_child: set[str] = set()
    for line in rawdata.split("\n"):
        fields = line.split("@@")
        if len(fields) < 2:
            # Blank line, or text that is not a record at all.
            continue
        name = fields[0].removeprefix("Category:")
        desc = fields[1]
        name_lc = name.lower()
        entry = ht.setdefault(name_lc, {"name": name})
        # The first non-empty description seen for a category wins.
        if desc and not entry.get("desc"):
            entry["desc"] = desc
        if not entry.get("clean_desc"):
            # Only clean when the result can still be stored.
            clean_desc = clean_node(wxr, None, desc)
            if clean_desc:
                entry["clean_desc"] = clean_desc
        # The remaining fields are (parent, sort) pairs.  zip() silently
        # drops a dangling unpaired field instead of raising IndexError.
        for raw_parent, parent_sort in zip(fields[2::2], fields[3::2]):
            parent_name = raw_parent.removeprefix("Category:")
            parent: CategoryEntry = ht.setdefault(
                parent_name.lower(), {"name": parent_name}
            )
            parent.setdefault("children", []).append(name)
            is_child.add(name_lc)
            if parent_sort.strip():
                parent.setdefault("sort", []).append(parent_sort)

    # Sort lists of children
    for entry in ht.values():
        if "children" in entry:
            entry["children"].sort()

    # Extra roots are the nodes that nobody lists as a child.  Every node
    # reachable from "Fundamental" (other than itself) is by construction
    # somebody's child, so no separate reachability walk is required.
    orphan_keys = sorted(ht.keys() - is_child - {"fundamental"})
    # An expansion that yielded no "Fundamental" node (e.g. the Lua call
    # failed) gives a tree without that root rather than a KeyError.
    roots = ["Fundamental"] if "fundamental" in ht else []
    roots.extend(ht[key]["name"] for key in orphan_keys)
    ret: CategoryReturn = {"roots": roots, "nodes": ht}
    return ret