Coverage for src/wiktextract/categories.py: 8%
69 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1# Extracting the category tree from Wiktionary
2#
3# Copyright (c) 2021 Tatu Ylonen. See file LICENSE and https://ylonen.org
5from typing import (
6 Optional,
7 TypedDict,
8)
10from wikitextprocessor.core import NamespaceDataEntry
12from .page import clean_node
13from .wxr_context import WiktextractContext
# Source of a Lua (Scribunto) module that dumps the category tree data as
# text.  extract_categories() adds it as a module page and invokes `main`.
#
# Output format, one record per line:
#     <label>@@<description>[@@<parent name>@@<parent sort key>]...
# Newlines inside a description are written as a literal backslash + "n" so
# that each record stays on one line.  Keys of the top-level data module are
# emitted as children of "Fundamental" with an empty description.
LUA_CODE = r"""
local export = {}

local topic_data = require("Module:category tree/topic cat/data")
local poscat_data = require("Module:category tree/poscatboiler/data")
local top_data = require("Module:category tree/data")

local function extract_tree(data, parts)
  for k, v in pairs(data.LABELS) do
    local desc = v.description or ""
    if type(desc) == "function" then
      -- Module:category tree/poscatboiler/data/non-lemma forms
      -- Turns out category tree can return a function where we
      -- expect a string, and the function is called with a `data`-
      -- table containing some kind of context data when appropriate,
      -- in a similar way to how all the {{{langname}}} calls are
      -- filled in when appropriate. However, we are just getting
      -- the "templates" here, so we don't have a context to call
      -- the function with: so instead just give it an empty table
      -- and hope the function has a sensible condition structure
      -- that first checks whether it should output a default:
      -- the gerund template in the above url does this.
      print("Function returned in description of category tree template `"..
            k.."`: "..tostring(v.description))
      -- Fall back to an empty description if the function yields nothing,
      -- so the string concatenations below cannot fail on nil.
      desc = desc({}) or ""
    end
    print( k..": "..desc )
    desc = string.gsub(desc, "\n", "\\n")
    table.insert(parts, k .. "@@" .. desc)
    -- A label without a `parents` table simply has no parent fields.
    for kk, vv in pairs(v.parents or {}) do
      local name
      local sort = ""
      if type(vv) == "table" then
        name = vv.name
        sort = vv.sort or ""
      else
        name = vv
      end
      if name then
        table.insert(parts, "@@" .. name .. "@@" .. sort)
      end
    end
    table.insert(parts, "\n")
  end
end

function export.main()
  local parts = {}
  extract_tree(topic_data, parts)
  extract_tree(poscat_data, parts)
  for k, v in pairs(top_data) do
    table.insert(parts, k .. "@@@@Fundamental@@\n")
  end
  local ret = table.concat(parts, "")
  return ret
end

return export
"""
class CategoryEntry(TypedDict, total=False):
    """A single node of the category tree; every key is optional."""

    # Category name with a leading "Category:" prefix removed.
    name: str
    # Description text as returned by the Lua module.
    desc: str
    # Description after it has been passed through clean_node().
    clean_desc: str
    # Names of the categories that name this category as a parent.
    children: list[str]
    # Non-blank sort keys given by children when naming this parent.
    sort: list[str]
class CategoryReturn(TypedDict, total=False):
    """Result of extract_categories(); every key is optional."""

    # Names of the top-level categories of the tree.
    roots: list[str]
    # Category entries keyed by lowercased category name.
    nodes: dict[str, CategoryEntry]
def extract_categories(wxr: WiktextractContext) -> CategoryReturn:
    """Extract the category tree from Wiktionary.

    The Lua source in ``LUA_CODE`` is added as a Scribunto module page, its
    ``main`` function is invoked, and the returned text is parsed.  Every
    non-empty line is expected to look like
    ``name@@description[@@parent@@sort]...``.

    Args:
        wxr: Context whose ``wtp`` attribute provides ``NAMESPACE_DATA``,
            ``add_page``, ``start_page`` and ``expand``.

    Returns:
        A dict with two keys.  ``"nodes"`` maps each lowercased category
        name to its entry.  ``"roots"`` lists ``"Fundamental"`` followed by
        the names (ordered by lowercased name) of the categories that are
        neither reachable from it nor listed as a child of any category.

    Raises:
        AssertionError: If the "Module" namespace is not configured.
    """
    module_ns: Optional[NamespaceDataEntry] = wxr.wtp.NAMESPACE_DATA.get(
        "Module", None
    )
    assert module_ns is not None
    module_ns_local_name = module_ns.get("name")
    module_ns_id = module_ns.get("id")
    wxr.wtp.add_page(
        f"{module_ns_local_name}:wiktextract cat tree",
        module_ns_id,
        LUA_CODE,
        model="Scribunto",
    )
    wxr.wtp.start_page("Wiktextract category tree extraction")
    rawdata = wxr.wtp.expand("{{#invoke:wiktextract cat tree|main}}")

    nodes: dict[str, CategoryEntry] = {}
    for line in rawdata.split("\n"):
        if not line:
            continue
        parts = line.split("@@")
        if len(parts) < 2:
            # Every record written by the Lua module contains at least one
            # "@@" separator; anything else (for example an error message
            # returned by the expansion) is not a category record.
            continue
        name = parts[0].removeprefix("Category:")
        desc = parts[1]
        clean_desc = clean_node(wxr, None, desc)
        entry = nodes.setdefault(name.lower(), {"name": name})
        # The first non-empty description seen for a category wins.
        if desc and not entry.get("desc"):
            entry["desc"] = desc
        if clean_desc and not entry.get("clean_desc"):
            entry["clean_desc"] = clean_desc

        # Remaining fields are (parent name, sort key) pairs.  Pad an
        # unpaired trailing parent with an empty sort key instead of
        # failing with an IndexError.
        parent_fields = parts[2:]
        if len(parent_fields) % 2:
            parent_fields.append("")
        for raw_parent, parent_sort in zip(
            parent_fields[::2], parent_fields[1::2]
        ):
            parent_name = raw_parent.removeprefix("Category:")
            parent = nodes.setdefault(
                parent_name.lower(), {"name": parent_name}
            )
            parent.setdefault("children", []).append(name)
            if parent_sort and parent_sort.strip():
                parent.setdefault("sort", []).append(parent_sort)

    # Collect everything reachable from "Fundamental".  An explicit stack
    # is used so that a long parent chain cannot exhaust the recursion
    # limit, and a missing "fundamental" node yields an empty set rather
    # than a KeyError.
    seen: set[str] = set()
    pending: list[str] = ["fundamental"] if "fundamental" in nodes else []
    while pending:
        key = pending.pop()
        if key in seen:
            continue
        seen.add(key)
        # Every child name was registered in `nodes` when its own line was
        # parsed, so the lookup on the next iteration cannot fail.
        pending.extend(
            child.lower() for child in nodes[key].get("children", ())
        )

    is_child: set[str] = set()
    for entry in nodes.values():
        is_child.update(child.lower() for child in entry.get("children", ()))

    # Categories that hang off nothing at all become additional roots.
    orphan_keys = sorted(set(nodes) - seen - is_child)
    notseen = [nodes[key]["name"] for key in orphan_keys]

    # Sort lists of children
    for entry in nodes.values():
        if "children" in entry:
            entry["children"] = sorted(entry["children"])

    roots = ["Fundamental"]
    roots.extend(notseen)
    ret: CategoryReturn = {"roots": roots, "nodes": nodes}
    return ret