Unix_python_project/generate_validation_data.py at main · danielgc99/Unix_python_project · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
Script to generate validation data with known cluster structure.
This creates a dataset where we know the expected clustering outcome.
"""
import os
import random
import math
import numpy as np
import json

# Ensure the output directories exist: 'testdata' receives the generated
# point list and 'test' receives the expected-clusters JSON written below.
# exist_ok=True makes each call idempotent and avoids the check-then-create
# race of testing os.path.exists() before calling os.makedirs().
for output_dir in ('testdata', 'test'):
    os.makedirs(output_dir, exist_ok=True)

def generate_validation_dataset(seed=None):
    """
    Generate a dataset with clear clusters that have a known structure.

    Three tight clusters of 5, 7 and 3 points are drawn around fixed centers
    in 3D space (every coordinate within +/-0.5 of its center), followed by
    3 noise points whose coordinates are each drawn from [15, 20]. The points
    are named "Point1".."Point18" in generation order, then shuffled, and the
    expected cluster membership is reported as positions in the shuffled list.

    Args:
        seed: Optional seed for a private random generator, making the
            result reproducible without touching global random state.
            When None (the default) the module-level ``random`` functions
            are used, so existing callers see unchanged behavior.

    Returns:
        A tuple ``(all_points, expected_clusters)``:
        ``all_points`` is a list of ``[name, x, y, z]`` rows in shuffled
        order; ``expected_clusters`` is a list of three sorted lists of
        0-based row indices into ``all_points``, one per cluster. Noise
        points are not listed in any cluster.
    """
    # The ``random`` module exposes the same uniform()/shuffle() API as a
    # Random instance, so either can serve as the generator below.
    rng = random if seed is None else random.Random(seed)

    dimensions = 3
    # (center, number of points) for each clearly separated cluster
    cluster_specs = [
        ([0, 0, 0], 5),       # Cluster 1
        ([10, 10, 10], 7),    # Cluster 2
        ([5, -5, -5], 3),     # Cluster 3
    ]
    noise_count = 3

    all_points = []
    # Membership is tracked by point name because names survive the shuffle,
    # whereas list positions do not.
    names_per_cluster = []

    for center, size in cluster_specs:
        member_names = []
        for _ in range(size):
            # Small radius around the center to ensure clear separation
            coords = [c + rng.uniform(-0.5, 0.5) for c in center]
            name = f"Point{len(all_points) + 1}"
            all_points.append([name, *coords])
            member_names.append(name)
        names_per_cluster.append(member_names)

    # Noise points are placed away from every cluster center and are left
    # out of the expected clusters.
    # NOTE(review): all noise points come from the same 5-unit cube, so they
    # can land close to one another; whether they end up as singleton
    # clusters depends on the clustering threshold used - confirm.
    for _ in range(noise_count):
        coords = [rng.uniform(15, 20) for _ in range(dimensions)]
        all_points.append([f"Point{len(all_points) + 1}", *coords])

    # Shuffle the points to make the file order independent of cluster order
    rng.shuffle(all_points)

    # Translate each cluster's member names into sorted post-shuffle indices
    position_of = {row[0]: idx for idx, row in enumerate(all_points)}
    expected_clusters = [
        sorted(position_of[name] for name in member_names)
        for member_names in names_per_cluster
    ]

    return all_points, expected_clusters

# Build the dataset once; both output files are derived from this one call
all_points, expected_clusters = generate_validation_dataset()

# Output locations: the point list goes under testdata/, and the expected
# cluster indices go under test/ for the tests to use
validation_file = os.path.join('testdata', 'validation_clusters.lst')
expected_file = os.path.join('test', 'expected_clusters.json')

# One point per line: the name followed by its coordinates, space-separated
with open(validation_file, 'w') as f:
    f.writelines(' '.join(map(str, row)) + '\n' for row in all_points)

# Serialize the expected cluster index lists as JSON
with open(expected_file, 'w') as f:
    f.write(json.dumps(expected_clusters))

# Summary of what was produced; points outside every expected cluster are
# reported as noise
clustered_total = sum(map(len, expected_clusters))
print(f"Validation dataset written to {validation_file}")
print(f"Expected clusters written to {expected_file}")
print(f"Expected clusters: {expected_clusters}")
print(f"Total points: {len(all_points)}")
print(f"Points in known clusters: {clustered_total}")
print(f"Noise points: {len(all_points) - clustered_total}")