SQL-toolkit/K_means.sql at main · sastmo/SQL-toolkit · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82

-- ⚠ This variant implements K-Means using recursive UNION set semantics,

-- Set of points P that we will cluster
DROP TABLE IF EXISTS duration_class;
CREATE TABLE duration_class (
  id            int GENERATED ALWAYS AS IDENTITY,
  trip_id       integer,                 -- unique point ID/label
  loc           point                     -- location of point in 2D space
);


-- Instantiate P
INSERT INTO duration_class(trip_id, loc)
SELECT e.trip_id, point((EXTRACT(EPOCH FROM e.duration)/60), 0)
FROM   edit_t_19_q1 AS e


WITH RECURSIVE
-- k_means(‹i›, ‹p›, ‹c›, ‹mean›):
--   in iteration ‹i›, point ID ‹p› has been assigned to cluster ID ‹c›,
--   the centroid of ‹c› is point ‹mean›
--   (i.e., there exists an FD ‹c› → ‹mean›).
k_means (iter, id, cluster, mean)AS (
  SELECT 0  AS iter, d.id, ROW_NUMBER() OVER () AS cluster, d.loc AS mean
  --
  -- From P, choose a sample of points (these will become the
  -- initial cluster centers), assign unique cluster IDs
  FROM   duration_class AS d
  WHERE  d.id IN (15, 226, 275) -- choose points as initial cluster centers
    UNION
      -- 2. Update
  SELECT (assign.iter + 1) AS iter, assign.id, assign.cluster,
         🠷
    --AVG(assign.loc) OVER cluster AS mean
         point(AVG(assign.loc[0]) OVER cluster,
         AVG(assign.loc[1]) OVER cluster) AS mean
  FROM   (SELECT DISTINCT ON (d.id) k.iter, d.id, k.cluster, d.loc
          FROM    duration_class AS d, k_means AS k
          ORDER BY d.id, d.loc <-> k.mean
               ) AS assign(iter, id, cluster, loc)
   --The iter has been defined manually, and the check_iter
   --query will check each time.
   --If the check_iter returns no row, it means it is a correct iter,
   --and we will find the min iter by try and error.
  WHERE assign.iter < 11
  WINDOW cluster AS (PARTITION BY assign.cluster)
 ),
 --Check whether the selected iter led to the same result in the two last iterations.
 check_iter AS(
  SELECT k.id, k.cluster, k.mean
  FROM   k_means AS k
  WHERE  k.iter :: int =  (SELECT MAX(k1.iter)
                           FROM   k_means AS k1)
           EXCEPT
  SELECT k.id, k.cluster, k.mean
  FROM   k_means AS k
  WHERE  k.iter :: int = (SELECT MAX(k1.iter)-1
                           FROM   k_means AS k1)
 ),
 -- the output of the last iteration has been considered as the result of the query
 k_means_result AS(
  SELECT k.id, k.cluster, k.mean
  FROM   k_means AS k
  WHERE  k.iter :: int =  (SELECT MAX(k1.iter)
                           FROM   k_means AS k1)
 ),
 --the total within-cluster sum of square (WCSS) in order to use the Elbow method
 --to identify the beast k.
WCSS AS(
  SELECT SUM ((k.mean[0] - d.loc[0])^ 2 + (k.mean[1] - d.loc[1])^2) AS WCSS
  FROM   k_means_result AS k, duration_class AS d
  WHERE  k.id = d.id
)
TABLE check_iter;

TABLE WCSS;

SELECT k.*
FROM   k_means_result AS k;