Unix_python_project/QT_3.0.py at main · danielgc99/Unix_python_project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
import math
import sys
import time
import numpy as np
from collections import defaultdict

def readpoints(infile):
    """Read points from file and return as list of tuples (name, x, y, ...)"""
    point_list = []
    n = 0
    with open(infile) as file:
        for line in file:
            n += 1
            line = line.strip().split()
            # Files with index column starting with "point"
            if line[0].lower().startswith("point"):
                point_list.append((line[0].replace("p", "P", 1), *[float(value) for value in line[1:]]))
            # For files with no index column
            else:
                point_list.append((f"Point{1+n}", *[float(value) for value in line]))
    return point_list

def calculate_distance_matrix(point_data):
    """Calculate distances between all points"""
    n_points = len(point_data)
    distance_matrix = {}

    # Extract coordinates for faster processing
    coords = np.array([list(point[1:]) for point in point_data])

    for i in range(n_points):
        for j in range(i + 1, n_points):
            # Fast Euclidean distance with NumPy
            dist = np.sqrt(np.sum((coords[i] - coords[j]) ** 2))
            distance_matrix[(i, j)] = dist

    return distance_matrix

def get_distance(i, j, distance_matrix):
    """Get distance between points i and j from distance matrix"""
    if i == j:
        return 0
    key = (i, j) if i < j else (j, i)
    return distance_matrix[key]

def calculate_max_distance(distance_matrix):
    """Find the maximum distance in the distance matrix"""
    return max(distance_matrix.values())

def candidate_cluster_exact(center_idx, point_data, distance_matrix, threshold, used_points):
    """
    Original candidate cluster algorithm that ensures exact results.
    Uses vectorized operations for speed.
    """
    # Start with the center point
    cluster = [center_idx]

    # Remaining points (not used and not the center)
    available_points = [i for i in range(len(point_data)) if i != center_idx and i not in used_points]

    # Pre-calculate distances from center to all available points for sorting
    distances_to_center = [(i, get_distance(center_idx, i, distance_matrix)) for i in available_points]
    distances_to_center.sort(key=lambda x: x[1])  # Sort by distance to center

    # Process available points in order of distance to center
    available_points = [p for p, _ in distances_to_center]

    # Continue adding points until no more can be added
    while available_points:
        best_point = None
        best_max_dist = float('inf')

        # Try each available point
        for idx in available_points:
            # Check if this point would exceed threshold with any existing point
            valid = True
            max_dist = 0

            for p in cluster:
                dist = get_distance(idx, p, distance_matrix)
                if dist > threshold:
                    valid = False
                    break
                max_dist = max(max_dist, dist)

            if valid and max_dist < best_max_dist:
                best_point = idx
                best_max_dist = max_dist

        # If we found a valid point, add it
        if best_point is not None:
            cluster.append(best_point)
            available_points.remove(best_point)
        else:
            # No more points can be added
            break

    return cluster

def best_cluster_exact(point_data, distance_matrix, threshold, used_points):
    """
    Find the best cluster with exact same logic as original algorithm.
    Optimized for performance with NumPy and early pruning.
    """
    best_cluster = []
    best_size = 0
    best_diameter = float('inf')

    # Process each potential center point
    for i in range(len(point_data)):
        if i in used_points:
            continue

        # Form candidate cluster
        cluster = candidate_cluster_exact(i, point_data, distance_matrix, threshold, used_points)

        # Skip if fewer than 2 points
        if len(cluster) < 2:
            continue

        # Calculate diameter
        max_dist = 0
        for a in range(len(cluster)):
            for b in range(a + 1, len(cluster)):
                dist = get_distance(cluster[a], cluster[b], distance_matrix)
                max_dist = max(max_dist, dist)

        # Update if better
        if len(cluster) > best_size or (len(cluster) == best_size and max_dist < best_diameter):
            best_cluster = cluster
            best_size = len(cluster)
            best_diameter = max_dist

    return best_cluster

def qt_clustering_exact_fast(point_data, distance_matrix, threshold):
    """
    QT Clustering implementation that preserves exact results while optimizing performance.
    """
    used_points = set()
    clusters = []
    n_points = len(point_data)

    # Track progress and performance
    iteration = 0
    largest_cluster = 0

    while len(used_points) < n_points:
        iteration += 1
        start_time = time.time()

        # Find best cluster
        best = best_cluster_exact(point_data, distance_matrix, threshold, used_points)
        largest_cluster = max(largest_cluster, len(best))

        # If no valid multi-point cluster
        if len(best) < 2:
            # Add remaining points as singletons
            remaining = [i for i in range(n_points) if i not in used_points]
            for i in remaining:
                clusters.append([i])
                used_points.add(i)
            print(f"Added {len(remaining)} singleton clusters")
            break
        else:
            # Add cluster and update used points
            clusters.append(best)
            used_points.update(best)

            # Report progress
            iter_time = time.time() - start_time
            print(f"Iteration {iteration}: Found cluster with {len(best)} points in {iter_time:.4f}s. "
                  f"Total: {len(used_points)}/{n_points}")

    print(f"Largest cluster: {largest_cluster} points")
    return clusters

# Main execution
if __name__ == "__main__":
    # Parse command line arguments
    if len(sys.argv) == 2:
        infile = sys.argv[1]
    else:
        infile = "data/point1000.lst"

    print(f"Reading points from: {infile}")

    # Overall timing
    total_start = time.time()

    # Load data
    point_data = readpoints(infile)
    print(f"Loaded {len(point_data)} points in {time.time() - total_start:.4f} seconds")

    # Calculate distance matrix
    start_time = time.time()
    distance_matrix = calculate_distance_matrix(point_data)
    print(f"Distance matrix calculation took {time.time() - start_time:.4f} seconds")

    # Find maximum distance (diameter)
    max_distance = calculate_max_distance(distance_matrix)

    # Set quality threshold to 30% of maximum distance
    quality_threshold = 0.3 * max_distance
    print(f"Maximum distance: {max_distance:.4f}")
    print(f"Quality Threshold (30% of diameter): {quality_threshold:.4f}")

    # Run QT clustering
    print("Starting QT clustering...")
    start_time = time.time()
    clusters = qt_clustering_exact_fast(point_data, distance_matrix, threshold=quality_threshold)
    print(f"QT clustering took {time.time() - start_time:.4f} seconds")

    # Print results
    print(f"\nFound {len(clusters)} clusters:")
    total_points = 0

    for i, cluster in enumerate(clusters):
        total_points += len(cluster)

        # Convert indices to point names
        if i < 5 or i >= len(clusters) - 5 or len(clusters) <= 10:
            point_names = [point_data[idx][0] for idx in cluster]
            if len(cluster) > 10:
                point_names = point_names[:5] + ['...'] + point_names[-5:]
            print(f"Cluster {i+1}: {len(cluster)} points - {point_names}")
        elif i == 5 and len(clusters) > 10:
            print("...")

    # Print total time
    print(f"\nTotal execution time: {time.time() - total_start:.4f} seconds")
    print(f"Total points in clusters: {total_points}/{len(point_data)}")