-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathClustering_CURE.py
More file actions
142 lines (124 loc) · 4.82 KB
/
Clustering_CURE.py
File metadata and controls
142 lines (124 loc) · 4.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import sys
import math
import matplotlib.pyplot as plt
def getDistanceFromRepresentatives(point, representativePoints_shifted):
    """Return the distance from ``point`` to its nearest representative.

    Every entry of ``representativePoints_shifted`` is measured against
    ``point`` with ``getDistance``; the smallest value wins. An empty
    collection yields ``float("inf")``.
    """
    nearest = float("inf")
    for representative in representativePoints_shifted:
        # min(a, b) only returns b when b < a, i.e. a strict improvement.
        nearest = min(nearest, getDistance(point, representative))
    return nearest
def computeCentroid(initialCluster):
    """Return the mean of the cluster's points as an ``(x, y)`` tuple.

    Only the first two coordinates of each point are read. An empty
    cluster raises ``ZeroDivisionError``.
    """
    memberCount = len(initialCluster)
    totalX = 0
    totalY = 0
    for member in initialCluster:
        totalX, totalY = totalX + member[0], totalY + member[1]
    return (totalX / memberCount, totalY / memberCount)
def findRepresentativePoints(initialCluster, numRepresentatives=None):
    """Pick well-scattered representative points from a cluster.

    The first point of the cluster seeds the selection. Each further round
    adds the not-yet-chosen point whose distance to its nearest already
    chosen representative is largest (ties go to the earliest point).

    Args:
        initialCluster: sequence of points (indexable, at least two
            coordinates each).
        numRepresentatives: how many representatives to aim for. Defaults to
            the module-level ``n``.

    Returns:
        A list of representatives, each a fresh ``list`` copy of the chosen
        point. It holds fewer than the requested number when the cluster
        does not contain that many distinct points, and is empty for an
        empty cluster.
    """
    if numRepresentatives is None:
        numRepresentatives = n
    if not initialCluster:
        return []

    representativePoints = [list(initialCluster[0])]
    # Representatives are stored as lists while cluster points are usually
    # tuples, and a tuple never compares equal to a list. Track the chosen
    # points as tuples so the "already chosen" test really works.
    chosen = {tuple(initialCluster[0])}

    for _ in range(numRepresentatives - 1):
        candidate = None
        maximumDistance = float("-inf")
        for point in initialCluster:
            if tuple(point) in chosen:
                continue
            minimumDistance = getDistanceFromRepresentatives(point, representativePoints)
            if minimumDistance > maximumDistance:
                candidate = point
                maximumDistance = minimumDistance
        if candidate is None:
            # Every distinct point is already a representative; adding more
            # would only create duplicates.
            break
        representativePoints.append(list(candidate))
        chosen.add(tuple(candidate))
    return representativePoints
def getDistance(point1, point2):
    """Return the Euclidean distance between two points.

    Only the first two coordinates of each point take part.
    """
    deltaX = point1[0] - point2[0]
    deltaY = point1[1] - point2[1]
    return math.sqrt(deltaX ** 2 + deltaY ** 2)
def clusterDistance(cluster1, cluster2):
    """Return the smallest distance between any pair of points drawn one
    from ``cluster1`` and one from ``cluster2``.

    Returns ``float("inf")`` when either cluster is empty.
    """
    minimumDistance = float("inf")
    for point1 in cluster1:
        for point2 in cluster2:
            # Measure each pair once and reuse the value for both the
            # comparison and the update.
            distance = getDistance(point1, point2)
            if distance < minimumDistance:
                minimumDistance = distance
    return minimumDistance
def formClusters_heirarchical(sampleData, numClusters=None):
    """Agglomeratively merge the sample points into clusters.

    Every point starts as its own cluster. The two clusters that are closest
    according to ``clusterDistance`` are merged repeatedly (ties go to the
    first pair found) until the requested number of clusters remains.

    Args:
        sampleData: iterable of points.
        numClusters: number of clusters to stop at. Defaults to the
            module-level ``k``. Values below 1 are treated as 1, because two
            clusters are needed for a merge.

    Returns:
        A list of clusters, each a list of the original point objects. When
        ``sampleData`` has fewer points than ``numClusters``, one
        single-point cluster per input point is returned.
    """
    if numClusters is None:
        numClusters = k
    clusters = [[point] for point in sampleData]
    targetCount = max(numClusters, 1)

    while len(clusters) > targetCount:
        closestDistance = float("inf")
        closestPair = None
        for i in range(len(clusters) - 1):
            for j in range(i + 1, len(clusters)):
                # clusterDistance is expensive; evaluate it once per pair.
                distance = clusterDistance(clusters[i], clusters[j])
                if distance < closestDistance:
                    closestDistance = distance
                    closestPair = (i, j)
        if closestPair is None:
            # No pair has a finite distance, so there is nothing sensible
            # left to merge.
            break
        first, second = closestPair
        clusters[first].extend(clusters[second])
        # second > first, so deleting it leaves index ``first`` valid.
        del clusters[second]
    return clusters
def _readPoints(path):
    """Read a comma-separated file into a list of ``(x, y)`` float tuples.

    Only the first two fields of each line are used. Blank lines (for
    example a trailing empty line) are skipped instead of failing in
    ``float()``.
    """
    points = []
    # The with-block closes the file even if a line fails to parse.
    with open(path) as dataFile:
        for line in dataFile:
            if not line.strip():
                continue
            fields = line.split(",")
            points.append((float(fields[0]), float(fields[1])))
    return points


# Command line: <sample file> <complete file> <k> <n> <p> <output file>
if len(sys.argv) < 7:
    sys.exit(
        "usage: " + sys.argv[0]
        + " <sampleDataFile> <completeDataFile> <k> <n> <p> <outputFile>"
    )

k = int(sys.argv[3])    # number of clusters to form
n = int(sys.argv[4])    # representative points per cluster
p = float(sys.argv[5])  # factor by which representatives move toward the centroid
outputFileName = sys.argv[6]

completeData = _readPoints(sys.argv[2])
# The sample is processed in (x, y) order.
sampleData = sorted(_readPoints(sys.argv[1]))
# Cluster the sample; everything below is derived from these clusters.
initialClusters = formClusters_heirarchical(sampleData)

# (point, cluster index) pairs, kept for plotting the initial clusters
initialClusterAssignments = []
# Per cluster: the representative points as selected, and the same points
# after being moved toward the cluster centroid by the factor p
representativePointsList = []
representativePoints_shifted = []

for clusterId, initialCluster in enumerate(initialClusters):
    initialClusterAssignments.extend(
        (member, clusterId) for member in initialCluster
    )

    selected = findRepresentativePoints(initialCluster)
    representativePointsList.append(selected)

    centroid = computeCentroid(initialCluster)
    representativePoints_shifted.append([
        (
            rep[0] + (centroid[0] - rep[0]) * p,
            rep[1] + (centroid[1] - rep[1]) * p,
        )
        for rep in selected
    ])
# Assign every point of the complete data set to the cluster that owns the
# closest shifted representative point, then write "x,y,clusterId" lines.
if completeData and not representativePoints_shifted:
    raise ValueError(
        "cannot assign points: no clusters were formed from the sample data"
    )

outputPointList = []
for point in completeData:
    # Iterate over the clusters that actually exist rather than range(k),
    # so a sample smaller than k cannot index past the end of the list.
    distances = [
        getDistanceFromRepresentatives(point, shiftedPoints)
        for shiftedPoints in representativePoints_shifted
    ]
    # min() returns the first minimal element, so ties go to the lowest
    # cluster index and every point always receives a valid cluster id.
    clusterId = min(range(len(distances)), key=distances.__getitem__)
    outputPointList.append((point, clusterId))

# The with-block closes the output file even if a write fails.
with open(outputFileName, 'w') as outputFile:
    outputFile.writelines(
        ",".join((str(point[0]), str(point[1]), str(assignedCluster))) + "\n"
        for point, assignedCluster in outputPointList
    )
def _scatterAssignments(assignments, colorMap, title):
    """Draw ``(point, clusterId)`` pairs on the current axes, coloured by cluster."""
    if assignments:
        # One scatter call with a per-point colour list draws the same
        # markers in the same order as one call per point, but creates a
        # single artist instead of one per point.
        plt.scatter(
            [point[0] for point, _ in assignments],
            [point[1] for point, _ in assignments],
            color=[colorMap(assignedCluster) for _, assignedCluster in assignments],
        )
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.title(title)


# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap
# accepts the same (name, lut) arguments and exists in older releases too.
colors = plt.get_cmap("tab10", k)
plt.figure(figsize=(12, 6))

# Plot initial clusters
plt.subplot(1, 2, 1)
_scatterAssignments(initialClusterAssignments, colors, 'Initial Clusters')

# Plot final clusters
plt.subplot(1, 2, 2)
_scatterAssignments(outputPointList, colors, 'Final CURE Clusters')

plt.show()