-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathK_means.sql
More file actions
82 lines (72 loc) · 2.96 KB
/
Copy pathK_means.sql
File metadata and controls
82 lines (72 loc) · 2.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
-- ⚠ This variant implements K-Means using recursive UNION set semantics,
-- Set of points P that we will cluster
DROP TABLE IF EXISTS duration_class;
CREATE TABLE duration_class (
id int GENERATED ALWAYS AS IDENTITY,
trip_id integer, -- unique point ID/label
loc point -- location of point in 2D space
);
-- Instantiate P
INSERT INTO duration_class(trip_id, loc)
SELECT e.trip_id, point((EXTRACT(EPOCH FROM e.duration)/60), 0)
FROM edit_t_19_q1 AS e
WITH RECURSIVE
-- k_means(‹i›, ‹p›, ‹c›, ‹mean›):
-- in iteration ‹i›, point ID ‹p› has been assigned to cluster ID ‹c›,
-- the centroid of ‹c› is point ‹mean›
-- (i.e., there exists an FD ‹c› → ‹mean›).
k_means (iter, id, cluster, mean)AS (
SELECT 0 AS iter, d.id, ROW_NUMBER() OVER () AS cluster, d.loc AS mean
--
-- From P, choose a sample of points (these will become the
-- initial cluster centers), assign unique cluster IDs
FROM duration_class AS d
WHERE d.id IN (15, 226, 275) -- choose points as initial cluster centers
UNION
-- 2. Update
SELECT (assign.iter + 1) AS iter, assign.id, assign.cluster,
🠷
--AVG(assign.loc) OVER cluster AS mean
point(AVG(assign.loc[0]) OVER cluster,
AVG(assign.loc[1]) OVER cluster) AS mean
FROM (SELECT DISTINCT ON (d.id) k.iter, d.id, k.cluster, d.loc
FROM duration_class AS d, k_means AS k
ORDER BY d.id, d.loc <-> k.mean
) AS assign(iter, id, cluster, loc)
--The iter has been defined manually, and the check_iter
--query will check each time.
--If the check_iter returns no row, it means it is a correct iter,
--and we will find the min iter by try and error.
WHERE assign.iter < 11
WINDOW cluster AS (PARTITION BY assign.cluster)
),
--Check whether the selected iter led to the same result in the two last iterations.
check_iter AS(
SELECT k.id, k.cluster, k.mean
FROM k_means AS k
WHERE k.iter :: int = (SELECT MAX(k1.iter)
FROM k_means AS k1)
EXCEPT
SELECT k.id, k.cluster, k.mean
FROM k_means AS k
WHERE k.iter :: int = (SELECT MAX(k1.iter)-1
FROM k_means AS k1)
),
-- the output of the last iteration has been considered as the result of the query
k_means_result AS(
SELECT k.id, k.cluster, k.mean
FROM k_means AS k
WHERE k.iter :: int = (SELECT MAX(k1.iter)
FROM k_means AS k1)
),
--the total within-cluster sum of square (WCSS) in order to use the Elbow method
--to identify the beast k.
WCSS AS(
SELECT SUM ((k.mean[0] - d.loc[0])^ 2 + (k.mean[1] - d.loc[1])^2) AS WCSS
FROM k_means_result AS k, duration_class AS d
WHERE k.id = d.id
)
TABLE check_iter;
TABLE WCSS;
SELECT k.*
FROM k_means_result AS k;