-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathQT_2.0.py
More file actions
219 lines (180 loc) · 7.91 KB
/
QT_2.0.py
File metadata and controls
219 lines (180 loc) · 7.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
import math
import sys
import time
from collections import defaultdict
def readpoints(infile):
point_list = []
n = 0
with open(infile) as file:
for line in file:
n += 1
line = line.strip().split()
# Files with index column starting with "point"
if line[0].lower().startswith("point"):
point_list.append((line[0].replace("p", "P", 1), *[float(value) for value in line[1:]]))
# For files with no index column
else:
point_list.append((f"Point{1+n}", *[float(value) for value in line]))
return point_list
def euclidean_dist(pointA, pointB):
"""Takes two points as input and calculates the euclidean distance between them"""
coords1 = pointA[1:]
coords2 = pointB[1:]
# Verify both points have the same number of dimensions
if len(coords1) != len(coords2):
raise ValueError("Points must have the same number of dimensions")
# Calculate square sum of differences (using faster approach)
square_sum = sum((coords2[i] - coords1[i]) ** 2 for i in range(len(coords1)))
# Get the euclidean distance
return math.sqrt(square_sum)
def calculate_distance_matrix(point_data):
"""Pre-calculate all pairwise distances between points"""
distance_matrix = {}
for i in range(len(point_data)):
for j in range(i + 1, len(point_data)):
dist = euclidean_dist(point_data[i], point_data[j])
distance_matrix[(i, j)] = dist
return distance_matrix
def get_distance(i, j, distance_matrix):
"""Get distance between two points from the distance matrix"""
if i == j:
return 0
key = (i, j) if i < j else (j, i)
return distance_matrix[key]
def candidate_cluster_optimized(center_idx, point_data, distance_matrix, threshold, used_set):
"""
Optimized version of candidate cluster formation.
Uses a greedy approach with pre-sorting of points by distance to center.
"""
# Start with the center point
cluster = [center_idx]
# Pre-calculate distances to center for all unused points
distances_to_center = []
for i in range(len(point_data)):
if i != center_idx and i not in used_set:
dist = get_distance(center_idx, i, distance_matrix)
distances_to_center.append((i, dist))
# Sort points by distance to center (closest first)
distances_to_center.sort(key=lambda x: x[1])
# For each candidate point, track the maximum distance to any point in cluster
max_distances = defaultdict(float)
# Try adding points in order of increasing distance to center
for idx, _ in distances_to_center:
# Calculate max distance from this point to any point in cluster
max_dist_to_cluster = 0
for point in cluster:
dist = get_distance(idx, point, distance_matrix)
max_dist_to_cluster = max(max_dist_to_cluster, dist)
# If this point would keep the cluster within threshold
if max_dist_to_cluster <= threshold:
# Add it to cluster
cluster.append(idx)
# Update max distances for remaining candidates
for other_idx, _ in distances_to_center:
if other_idx not in cluster and other_idx not in used_set:
dist = get_distance(idx, other_idx, distance_matrix)
max_distances[other_idx] = max(max_distances[other_idx], dist)
return cluster
def best_cluster_optimized(point_data, distance_matrix, threshold, used_set):
"""
Optimized version of best cluster finding.
Uses cached calculations where possible.
"""
best_cluster = []
best_size = 0
best_diameter = float("inf")
# Track points we've already considered as centers to avoid redundant work
tried_centers = set()
# Start with unused points
for i in range(len(point_data)):
if i in used_set or i in tried_centers:
continue
# Form candidate cluster
cluster = candidate_cluster_optimized(i, point_data, distance_matrix, threshold, used_set)
tried_centers.add(i)
# Only consider clusters with at least 2 points
if len(cluster) < 2:
continue
# Calculate diameter only once
diameter = 0
for a in range(len(cluster)):
for b in range(a + 1, len(cluster)):
dist = get_distance(cluster[a], cluster[b], distance_matrix)
diameter = max(diameter, dist)
# Update best cluster if this one is better
if len(cluster) > best_size or (len(cluster) == best_size and diameter < best_diameter):
best_cluster = cluster
best_size = len(cluster)
best_diameter = diameter
# Optimization: If we find a cluster with many points, mark all its points as tried centers
# since any cluster centered at those points would be similar or worse
if len(cluster) > 3:
tried_centers.update(cluster)
return best_cluster
def qt_clustering_optimized(point_data, distance_matrix, threshold):
"""
Optimized QT clustering algorithm.
"""
used_points = set()
clusters = []
# Process until all points are used
while len(used_points) < len(point_data):
# Find best cluster from remaining points
start_time = time.time()
best = best_cluster_optimized(point_data, distance_matrix, threshold, used_points)
end_time = time.time()
# Debug timing
if len(clusters) < 5: # Only print timing for first few clusters
print(f"Finding cluster {len(clusters)+1} took {end_time - start_time:.4f} seconds")
# If no valid multi-point cluster found
if len(best) < 2:
# Add remaining points as singletons
for i in range(len(point_data)):
if i not in used_points:
clusters.append([i])
used_points.add(i)
break
else:
# Add cluster and update used points
clusters.append(best)
used_points.update(best)
print(f"Found cluster with {len(best)} points. Total points used: {len(used_points)}/{len(point_data)}")
return clusters
# Main execution
if __name__ == "__main__":
# Parse command line arguments
if len(sys.argv) == 2:
infile = sys.argv[1]
else:
infile = "data/point1000.lst"
print(f"Reading points from: {infile}")
# Start timing
total_start = time.time()
# Load data
point_data = readpoints(infile)
print(f"Loaded {len(point_data)} points")
# Calculate distances between all points
start_time = time.time()
distance_matrix = calculate_distance_matrix(point_data)
end_time = time.time()
print(f"Distance matrix calculation took {end_time - start_time:.4f} seconds")
# Find maximum distance (diameter)
max_distance = max(distance_matrix.values())
# Set quality threshold to 30% of maximum distance
quality_threshold = 0.3 * max_distance
print(f"Maximum distance: {max_distance}")
print(f"Quality Threshold (30% of diameter): {quality_threshold}")
# Run the optimized QT clustering algorithm
start_time = time.time()
clusters = qt_clustering_optimized(point_data, distance_matrix, quality_threshold)
end_time = time.time()
print(f"QT clustering took {end_time - start_time:.4f} seconds")
# Display results
print(f"\nFound {len(clusters)} clusters:")
for i, cluster in enumerate(clusters):
# Convert indices to point names
point_names = [point_data[idx][0] for idx in cluster]
print(f"Cluster {i+1}: {len(cluster)} points - {point_names}")
# Print total time
total_end = time.time()
print(f"\nTotal execution time: {total_end - total_start:.4f} seconds")