diff --git a/.gitignore b/.gitignore
index fdd0c6d..1a546cd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,3 @@
-*.png
 *.jpg
 *.jpeg
 *.gif
diff --git a/README.md b/README.md
index df0c441..a24a12b 100644
--- a/README.md
+++ b/README.md
@@ -16,41 +16,33 @@ The deadline is when all intern slots for summer 2026 are filled. We will review
 
 1. **Fork this repository.**
 2. Solve the placement problem using your preferred tools or scripts.  
-3. Run the test script to evaluate your solution and obtain the overlap and wirelength metrics.  
-4. Submit a pull request with your updated leaderboard entry and instructions for me to access your actual submission (it's fine if it's public).  
-
+3. Run the first 10 tests to evaluate your solution and obtain the overlap and wirelength metrics. Report Average Overlap, Wirelength and total Runtime. *Test cases 11 and 12 are extra credit, give them a shot if you have some time.*  
+4. Submit a pull request with your updated leaderboard entry and instructions for me to access your actual submission (it's fine if it's public).
+ 
 Note: You can use any libraries or frameworks you like, but please ensure that your code is well-documented and easy to follow.  
 
 Also, if you think there are any bugs in the provided code, feel free to fix them and mention the changes in your submission.  
 
 You may submit multiple solutions to try and increase your score.
-
+
 We will review submissions on a rolling basis.
 
-## New Leaderboard (sorted by overlap)
-
-| Rank | Name            | Overlap     | Wirelength (um) | Runtime (s) | Notes                |
-|------|-----------------|-------------|-----------------|-------------|----------------------|
-| 1    |   example       | 0.5000      | 0.5             |  10         |   example submission |
-| 2    | Add Yours!      |             |                 |             |                      |
-
-
-
-## Leaderboard (sorted by overlap) (OLD; test suite has been updated; see above)
+## Leaderboard (sorted by overlap)
 
 | Rank | Name            | Overlap     | Wirelength (um) | Runtime (s) | Notes                |
 |------|-----------------|-------------|-----------------|-------------|----------------------|
-| 1    | Shashank Shriram  | 0.0000     | 0.1310          |  11.32      |   🏎️💥               |
-| 2    | Brayden Rudisill  | 0.0000    | 0.2611          |   50.51     |   Timed on a mac air |
-| 3    | manuhalapeth      | 0.0000    | 0.2630          |  196.8      |                      |
-| 4    | Neil Teje         | 0.0000    | 0.2700          | 24.00s      |                      |
-| 5    | Leison Gao      | 0.0000      | 0.2796          | 50.14s      |                      |
-| 6    | William Pan     | 0.0000      | 0.2848          | 155.33s     |                      |
-| 7    | Ashmit Dutta    | 0.0000      | 0.2870          | 995.58      |  Spent my entire morning (12 am - 6 am) doing this :P       |
-| 8    | Pawan Paleja     | 0.0000      | 0.3311         | 1.74s     |   Implemented hint for loss func, cosine annealing on learning rate with warmup, std annealing on lambda weight. Used optuna to tune hyperparam. Tested on gh codespaces 2-core.                   |
+| 0    | Pratul-Saini    |  0.0000     | 0.2537          | 490.06s     | Moved some pre-existing computation to GPU. Created an initialization based on a connectivity heuristic, added warmup on wirelength and cosine annealing on the learning rate. Only ran the first 10 tests for scoring purposes. I separately ran test 11 and placed its results in test-11.txt. Test 12 seems to require optimization on the problem-generation front (computing 10^10 overlap values in the initial metrics), as that is where I see it hang.     |
+| 1    | Brayden Rudisill  | 0.0000    | 0.2611          |   50.51     |   Timed on a mac air |
+| 2    | manuhalapeth      | 0.0000    | 0.2630          |  196.8      |                      |
+| 3    | Neil Teje         | 0.0000    | 0.2700          | 24.00s      |                      |
+| 4    | Leison Gao      | 0.0000      | 0.2796          | 50.14s      |                      |
+| 5    | William Pan     | 0.0000      | 0.2848          | 155.33s     |                      |
+| 6    | Ashmit Dutta    | 0.0000      | 0.2870          | 995.58      |  Spent my entire morning (12 am - 6 am) doing this :P       |
+| 7    | Pawan Paleja     | 0.0000      | 0.3311         | 1.74s     |   Implemented hint for loss func, cosine annealing on learning rate with warmup, std annealing on lambda weight. Used optuna to tune hyperparam. Tested on gh codespaces 2-core.                   |
+| 8    | Shashank Shriram  | 0.0000     | 0.3312          |  11.32      |   🏎️💥               |
 | 9    | Gabriel Del Monte  | 0.0000      | 0.3427          | 606.07      |                                                              |
 | 10    | Aleksey  Valouev| 0.0000      | 0.3577          | 118.98      |                      |        
-| 11   | Mohul Shukla    | 0.0000      | 0.5048          | 54.60s      |                      |
+| 11   | Mohul Shukla    | 0.0000      | 0.5048          | 54.60s      |                      | 
 | 12    | Ryan Hulke      | 0.0000      | 0.5226          | 166.24      |                      |
 | 13    | Neel  Shah      | 0.0000      | 0.5445          | 45.40       |  Zero overlaps on all tests, adaptive schedule + early stop |
 | 14   | Nawel Asgar    | 0.0000     | 0.5675          | 81.49      | Adaptive penalty scaling with cubic gradients and design-size optimization
diff --git a/constants.py b/constants.py
new file mode 100644
index 0000000..b83826f
--- /dev/null
+++ b/constants.py
@@ -0,0 +1,18 @@
+# Configuration constants shared by placement.py and initialization.py
+# Macro parameters (initialization.py treats area >= MIN_MACRO_AREA as a macro)
+MIN_MACRO_AREA = 100.0
+MAX_MACRO_AREA = 10000.0
+
+# Standard cell parameters (areas can be 1, 2, or 3)
+STANDARD_CELL_AREAS = [1.0, 2.0, 3.0]
+STANDARD_CELL_HEIGHT = 1.0
+
+# Pin parameters
+MIN_STANDARD_CELL_PINS = 3
+MAX_STANDARD_CELL_PINS = 6
+PIN_SIZE = 0.1  # All pins are 0.1 x 0.1
+
+
+# Placement parameters
+CELL_SPACING = 1e-3 # Default gap pack_std_cells leaves between neighbouring std cells and between rows
+MAX_ROW_WIDTH = 120.0 # Maximum width of a placement row (for initial spread). NOTE(review): imported by initialization.py but not referenced in its visible code
diff --git a/data.py b/data.py
new file mode 100644
index 0000000..d7947d3
--- /dev/null
+++ b/data.py
@@ -0,0 +1,22 @@
+from enum import IntEnum
+
+# Feature index enums for cleaner code access
+class CellFeatureIdx(IntEnum):
+    """Column indices into the [N, 6] cell feature tensor (one row per cell)."""
+    AREA = 0
+    NUM_PINS = 1
+    X = 2  # Cell centre x (pack_std_cells writes left edge + width / 2)
+    Y = 3  # Cell centre y
+    WIDTH = 4
+    HEIGHT = 5
+
+
+class PinFeatureIdx(IntEnum):
+    """Column indices into the [P, 7] pin feature tensor (one row per pin)."""
+    CELL_IDX = 0  # Index of the owning cell (row of the cell feature tensor)
+    PIN_X = 1  # Relative to cell corner
+    PIN_Y = 2  # Relative to cell corner
+    X = 3  # Absolute position
+    Y = 4  # Absolute position
+    WIDTH = 5
+    HEIGHT = 6
\ No newline at end of file
diff --git a/initialization.py b/initialization.py
new file mode 100644
index 0000000..390a075
--- /dev/null
+++ b/initialization.py
@@ -0,0 +1,304 @@
+from collections import deque
+import torch
+
+from data import PinFeatureIdx, CellFeatureIdx
+from constants import MIN_MACRO_AREA, CELL_SPACING, MAX_ROW_WIDTH
+
+def build_cell_level_edge_list(pin_features, edge_list):
+    """
+    Convert a pin-level edge list to a cell-level edge list.
+
+    Each edge [src_pin_idx, tgt_pin_idx] is mapped to
+    [src_cell_idx, tgt_cell_idx] via pin_features[:, PinFeatureIdx.CELL_IDX].
+
+    Self-edges (both pins on the same cell) and duplicates are removed.
+
+    Args:
+        pin_features: [P, 7] tensor; column 0 is the owning cell index
+        edge_list:    [E, 2] tensor of pin-index pairs [src_pin_idx, tgt_pin_idx]
+    Returns:
+        cell_edge_list: [E', 2] tensor of deduplicated cell-index pairs
+    """
+    if edge_list.shape[0] == 0:
+        return torch.zeros((0, 2), dtype=torch.long)
+
+    pin_to_cell = pin_features[:, PinFeatureIdx.CELL_IDX].long()  # [P]
+    src_cells = pin_to_cell[edge_list[:, 0]]  # [E]
+    tgt_cells = pin_to_cell[edge_list[:, 1]]  # [E]
+
+    # Remove self-edges — pins on the same cell have no placement implication
+    valid     = src_cells != tgt_cells
+    src_cells = src_cells[valid]
+    tgt_cells = tgt_cells[valid]
+
+    # Canonicalise so smaller index is always first, then deduplicate
+    # This collapses both (i,j) and (j,i) into one undirected edge
+    cell_edges = torch.stack([
+        torch.minimum(src_cells, tgt_cells),
+        torch.maximum(src_cells, tgt_cells),
+    ], dim=1)
+
+    return torch.unique(cell_edges, dim=0)  # [E', 2]
+
+
+def build_adjacency_list(cell_edge_list, num_cells):
+    """
+    Build an undirected adjacency list from a cell-level edge list.
+
+    Args:
+        cell_edge_list: [E, 2] tensor of cell-index pairs
+        num_cells:      int, total number of cells
+    Returns:
+        adjacency: List[List[int]], one entry per cell
+    """
+    adjacency = [[] for _ in range(num_cells)]
+    for src, tgt in cell_edge_list.tolist():
+        src, tgt = int(src), int(tgt)
+        adjacency[src].append(tgt)
+        adjacency[tgt].append(src)
+    return adjacency
+
+
+def compute_bfs_scores_from_macros(adjacency, macro_indices, num_cells, decay=0.5):
+    """
+    For each macro, run a BFS outward through the cell graph and assign a
+    dampened reachability score to every cell it reaches.
+
+    Traversal rules:
+      - BFS starts at the macro and expands freely through std cells
+      - When another macro is reached, record its score but do NOT expand through it
+        (macros act as boundaries — paths cannot pass through them)
+      - Score contribution at each cell = decay ^ depth, where depth is the
+        number of edges from the source macro to that cell
+
+    This produces a score matrix where:
+    scores[m][c] = decay ^ shortest_path_depth(macro_m, cell_c); 0 if cell_c is unreachable from macro_m
+    Slicing gives both outputs needed downstream:
+      macro_to_macro[m][j] = scores[m][macro_indices[j]]   — for macro ordering
+      macro_to_std[m][s]   = scores[m][std_indices[s]]     — for std cell assignment
+
+    Args:
+        cell_edge_list: [E, 2] tensor of global cell-index pairs (already cell-level)
+        macro_indices:  List[int], global cell indices of macros
+        num_cells:      int, total number of cells N
+        decay:          float in (0, 1), per-hop dampening factor
+    Returns:
+        scores: torch.Tensor of shape [M, num_cells], dtype float32
+    """
+    M         = len(macro_indices)
+    macro_set = set(macro_indices)
+    scores    = torch.zeros(M, num_cells, dtype=torch.float32)
+
+    for m, macro_idx in enumerate(macro_indices):
+        visited = {macro_idx}
+        scores[m, macro_idx] = 1.0 # source scores itself at decay^0
+        # Queue entries: (cell_index, depth, accumulated_decay)
+        # Carrying accumulated_decay avoids recomputing decay^depth each step
+        queue = deque([(macro_idx, 0, 1.0)])
+
+        while queue:
+            cell, depth, cell_decay = queue.popleft()
+
+            neighbor_decay = cell_decay * decay   # decay^(depth+1)
+
+            for neighbor in adjacency[cell]:
+                if neighbor in visited:
+                    continue
+
+                visited.add(neighbor)
+                scores[m, neighbor] = neighbor_decay
+
+                if neighbor in macro_set:
+                    # Boundary reached — score recorded, do not expand
+                    continue
+
+                # Std cell — expand through it
+                queue.append((neighbor, depth + 1, neighbor_decay))
+    return scores # [M, num_cells]
+
+def order_std_cells(score_matrix, std_indices):
+    """
+    Sort std cells so that cells connected to the same macro
+    are adjacent in the ordering. Within each macro group,
+    strongest affinity comes first.
+    
+    Args:
+        score_matrix: [M, num_cells] tensor
+        std_indices:  List[int], global indices of std cells
+    Returns:
+        ordered: List[int], indices into std_indices in packing order
+    """
+    # get std cell scores only
+    std_scores = score_matrix[:, std_indices]  # [M, S]
+    # identify the primary macro for each std cell
+    primary = torch.argmax(std_scores, dim=0)  # [S]
+    # get the score for each std cell's primary macro - this is affinity
+    affinity = std_scores.gather(0, primary.unsqueeze(0)).squeeze(0)
+
+    # The primary is an integer while the affinity is a normalized value in [0, 1]. This determines the macro grouping and ordering within each group respectively. Not a very strong ordering, but should be sufficient for a simple initialisation.
+    sort_key = primary.float() - affinity / (affinity.max() + 1e-12)
+    # Sort based on the above
+    return torch.argsort(sort_key).tolist()
+
+
+def pack_std_cells(order, std_indices, cell_features,
+                   row_height=1.0, spacing=CELL_SPACING):
+    """
+    Tile std cells into a dense rectangular block following
+    the given order. Left-to-right, top-to-bottom.
+    
+    Args:
+        order:         List[int], indices into std_indices
+        std_indices:   List[int], global cell indices
+        cell_features: [N, 6] tensor
+        row_height:    float
+        spacing:       float
+    Returns:
+        cell_features: updated clone with std positions written
+        bbox:          (x_min, x_max, y_min, y_max) of the block
+    """
+    cell_features = cell_features.clone()
+    widths = cell_features[std_indices, CellFeatureIdx.WIDTH]
+
+    # Compute how many rows we need and the width of each row
+    total_width = widths.sum().item()
+    num_rows = max(1, int(total_width ** 0.5 / row_height))
+    row_width = total_width / num_rows
+
+    # Cursor tracks the center of the leftmost edge of each std cell. Note row_height of all std cells is 1
+    cursor_x = 0.0
+    cursor_y = row_height / 2
+
+    for s in order:
+        idx = std_indices[s]
+        w = widths[s].item()
+
+        # if the next cell doesn't fit in the current row, move cursor to the start of the next row
+        if cursor_x + w > row_width:
+            cursor_y += row_height + spacing
+            cursor_x = 0.0
+
+        cell_features[idx, CellFeatureIdx.X] = cursor_x + w / 2
+        cell_features[idx, CellFeatureIdx.Y] = cursor_y
+        cursor_x += w + spacing
+
+    # define the bounding box of the packed std cells for macro placement reference and virtual macro creation
+    std_x = cell_features[std_indices, CellFeatureIdx.X]
+    std_y = cell_features[std_indices, CellFeatureIdx.Y]
+    std_w = cell_features[std_indices, CellFeatureIdx.WIDTH]
+    std_h = cell_features[std_indices, CellFeatureIdx.HEIGHT]
+    bbox = (
+        (std_x - std_w / 2).min().item(),
+        (std_x + std_w / 2).max().item(),
+        (std_y - std_h / 2).min().item(),
+        (std_y + std_h / 2).max().item(),
+    )
+
+    return cell_features, bbox
+
+def place_macros_around_block(score_matrix, macro_indices, std_indices,
+                              cell_features, bbox, standoff=5.0):
+    """
+    Place each macro outside the std cell block, on the side
+    where its most connected std cells live. Macros can overlap
+    """
+    # extract coordinates of bounding box of std cells
+    x_min, x_max, y_min, y_max = bbox
+
+    # compute center of the bounding box for reference
+    cx = (x_min + x_max) / 2
+    cy = (y_min + y_max) / 2
+
+    # extract coordinates of std cells
+    std_xy = cell_features[std_indices][:, [CellFeatureIdx.X, CellFeatureIdx.Y]]
+
+    cell_features = cell_features.clone()
+
+    for m, macro_idx in enumerate(macro_indices):
+        # Compute the weighted average position of the std cells, using BFS scores as weights.
+        weights = score_matrix[m, std_indices]
+        w_sum = weights.sum().clamp(min=1e-12)
+        target = (weights @ std_xy) / w_sum
+
+        # Compute delta from center of bounding box to macro target vector
+        dx = target[0] - cx
+        dy = target[1] - cy
+        # compute length of delta vector
+        length = (dx**2 + dy**2).sqrt().clamp(min=1e-12)
+        # normalize delta vector
+        dx, dy = dx / length, dy / length
+
+        # extract macro dimensions and compute at least how far horizontally and vertically the macro must be from the bounding box to avoid overlap, plus standoff
+        mw = cell_features[macro_idx, CellFeatureIdx.WIDTH].item()
+        mh = cell_features[macro_idx, CellFeatureIdx.HEIGHT].item()
+        half_w = (x_max - x_min) / 2 + mw / 2 + standoff
+        half_h = (y_max - y_min) / 2 + mh / 2 + standoff
+
+        # compute horizontally and vertically how far the macro must travel alogn the delta vector to avoid overlap
+        t_x = half_w / abs(dx) if abs(dx) > 1e-12 else float('inf')
+        t_y = half_h / abs(dy) if abs(dy) > 1e-12 else float('inf')
+        # Given a box the macro must be contained in, we look for the smallest scaling factor t that moves the macro outside the box along the delta vector
+        t = min(t_x, t_y)
+
+        # shift macro from center of bounding box to projected position outside the bounding box
+        cell_features[macro_idx, CellFeatureIdx.X] = cx + dx * t
+        cell_features[macro_idx, CellFeatureIdx.Y] = cy + dy * t
+
+    return cell_features
+
+def create_virtual_macro(bbox):
+    """
+    Create a single-row cell feature tensor representing
+    the std cell block as one large immovable macro.
+
+    Args:
+        bbox: (x_min, x_max, y_min, y_max) of packed std cells
+    Returns:
+        virtual_macro: [1, 6] tensor with
+            [area, num_pins=0, cx, cy, width, height]
+    """
+    x_min, x_max, y_min, y_max = bbox
+    width = x_max - x_min
+    height = y_max - y_min
+
+    return torch.tensor([[
+        width * height,  # area
+        0.0,             # num_pins (no pins — wirelength ignores it)
+        (x_min + x_max) / 2,  # x center
+        (y_min + y_max) / 2,  # y center
+        width,
+        height,
+    ]])
+
+def initialize_placement(cell_features, pin_features, edge_list):
+    """
+    Full initialization pipeline:
+      1. Build cell-level graph from pin connectivity
+      2. BFS from each macro to score reachability to all cells
+      3. Order std cells by macro affinity (clustering)
+      4. Dense-pack std cells into a rectangular block
+      5. Place macros outside the block, each on its natural side
+    """
+    num_cells = cell_features.shape[0]
+
+    cell_edge_list = build_cell_level_edge_list(pin_features, edge_list)
+    adjacency = build_adjacency_list(cell_edge_list, num_cells)
+
+    macro_indices = [i for i in range(num_cells)
+                     if cell_features[i, CellFeatureIdx.AREA] >= MIN_MACRO_AREA]
+    std_indices = [i for i in range(num_cells)
+                   if cell_features[i, CellFeatureIdx.AREA] < MIN_MACRO_AREA]
+
+    score_matrix = compute_bfs_scores_from_macros(
+        adjacency, macro_indices, num_cells
+    )
+
+    order = order_std_cells(score_matrix, std_indices)
+    cell_features, bbox = pack_std_cells(order, std_indices, cell_features)
+    cell_features = place_macros_around_block(
+        score_matrix, macro_indices, std_indices, cell_features, bbox
+    )
+
+    virtual_macro = create_virtual_macro(bbox)
+
+    return cell_features, virtual_macro
\ No newline at end of file
diff --git a/placement.py b/placement.py
index d70412d..278af21 100644
--- a/placement.py
+++ b/placement.py
@@ -39,46 +39,16 @@
 """
 
 import os
-from enum import IntEnum
 
 import torch
 import torch.optim as optim
 
+from constants import MIN_MACRO_AREA, MAX_MACRO_AREA, STANDARD_CELL_AREAS, STANDARD_CELL_HEIGHT, MIN_STANDARD_CELL_PINS, MAX_STANDARD_CELL_PINS, PIN_SIZE
+from data import CellFeatureIdx, PinFeatureIdx
+from initialization import initialize_placement
 
-# Feature index enums for cleaner code access
-class CellFeatureIdx(IntEnum):
-    """Indices for cell feature tensor columns."""
-    AREA = 0
-    NUM_PINS = 1
-    X = 2
-    Y = 3
-    WIDTH = 4
-    HEIGHT = 5
-
-
-class PinFeatureIdx(IntEnum):
-    """Indices for pin feature tensor columns."""
-    CELL_IDX = 0
-    PIN_X = 1  # Relative to cell corner
-    PIN_Y = 2  # Relative to cell corner
-    X = 3  # Absolute position
-    Y = 4  # Absolute position
-    WIDTH = 5
-    HEIGHT = 6
-
-
-# Configuration constants
-# Macro parameters
-MIN_MACRO_AREA = 100.0
-MAX_MACRO_AREA = 10000.0
-
-# Standard cell parameters (areas can be 1, 2, or 3)
-STANDARD_CELL_AREAS = [1.0, 2.0, 3.0]
-STANDARD_CELL_HEIGHT = 1.0
-
-# Pin count parameters
-MIN_STANDARD_CELL_PINS = 3
-MAX_STANDARD_CELL_PINS = 6
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {DEVICE}")
 
 # Output directory
 OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -154,7 +124,6 @@ def generate_placement_input(num_macros, num_std_cells):
     pin_features = torch.zeros(total_pins, 7)
 
     # Fixed pin size for all pins (square pins)
-    PIN_SIZE = 0.1  # All pins are 0.1 x 0.1
 
     pin_idx = 0
     for cell_idx in range(total_cells):
@@ -195,7 +164,6 @@ def generate_placement_input(num_macros, num_std_cells):
     # Step 7: Generate edges with simple random connectivity
     # Each pin connects to 1-3 random pins (preferring different cells)
     edge_list = []
-    avg_edges_per_pin = 2.0
 
     pin_to_cell = torch.zeros(total_pins, dtype=torch.long)
     pin_idx = 0
@@ -263,7 +231,7 @@ def wirelength_attraction_loss(cell_features, pin_features, edge_list):
         Scalar loss value
     """
     if edge_list.shape[0] == 0:
-        return torch.tensor(0.0, requires_grad=True)
+        return torch.tensor(0.0, device=DEVICE, requires_grad=True)
 
     # Update absolute pin positions based on cell positions
     cell_positions = cell_features[:, 2:4]  # [N, 2]
@@ -299,158 +267,168 @@ def wirelength_attraction_loss(cell_features, pin_features, edge_list):
     return total_wirelength / edge_list.shape[0]  # Normalize by number of edges
 
 
-def overlap_repulsion_loss(cell_features, pin_features, edge_list):
-    """Calculate loss to prevent cell overlaps.
-
-    TODO: IMPLEMENT THIS FUNCTION
-
-    This is the main challenge. You need to implement a differentiable loss function
-    that penalizes overlapping cells. The loss should:
-
-    1. Be zero when no cells overlap
-    2. Increase as overlap area increases
-    3. Use only differentiable PyTorch operations (no if statements on tensors)
-    4. Work efficiently with vectorized operations
-
-    HINTS:
-    - Two axis-aligned rectangles overlap if they overlap in BOTH x and y dimensions
-    - For rectangles centered at (x1, y1) and (x2, y2) with widths (w1, w2) and heights (h1, h2):
-      * x-overlap occurs when |x1 - x2| < (w1 + w2) / 2
-      * y-overlap occurs when |y1 - y2| < (h1 + h2) / 2
-    - Use torch.relu() to compute positive overlaps: overlap_x = relu((w1+w2)/2 - |x1-x2|)
-    - Overlap area = overlap_x * overlap_y
-    - Consider all pairs of cells: use broadcasting with unsqueeze
-    - Use torch.triu() to avoid counting each pair twice (only consider i < j)
-    - Normalize the loss appropriately (by number of pairs or total area)
-
-    RECOMMENDED APPROACH:
-    1. Extract positions, widths, heights from cell_features
-    2. Compute all pairwise distances using broadcasting:
-       positions_i = positions.unsqueeze(1)  # [N, 1, 2]
-       positions_j = positions.unsqueeze(0)  # [1, N, 2]
-       distances = positions_i - positions_j  # [N, N, 2]
-    3. Calculate minimum separation distances for each pair
-    4. Use relu to get positive overlap amounts
-    5. Multiply overlaps in x and y to get overlap areas
-    6. Mask to only consider upper triangle (i < j)
-    7. Sum and normalize
-
-    Args:
-        cell_features: [N, 6] tensor with [area, num_pins, x, y, width, height]
-        pin_features: [P, 7] tensor with pin information (not used here)
-        edge_list: [E, 2] tensor with edges (not used here)
-
-    Returns:
-        Scalar loss value (should be 0 when no overlaps exist)
-    """
+def overlap_repulsion_loss(cell_features):
     N = cell_features.shape[0]
     if N <= 1:
-        return torch.tensor(0.0, requires_grad=True)
+        return torch.tensor(0.0, device=DEVICE, requires_grad=True)
+
+    X = cell_features[:, CellFeatureIdx.X].unsqueeze(1)
+    Y = cell_features[:, CellFeatureIdx.Y].unsqueeze(1)
+    W = cell_features[:, CellFeatureIdx.WIDTH].unsqueeze(1)
+    H = cell_features[:, CellFeatureIdx.HEIGHT].unsqueeze(1)
 
-    # TODO: Implement overlap detection and loss calculation here
-    #
-    # Your implementation should:
-    # 1. Extract cell positions, widths, and heights
-    # 2. Compute pairwise overlaps using vectorized operations
-    # 3. Return a scalar loss that is zero when no overlaps exist
-    #
-    # Delete this placeholder and add your implementation:
+    overlap_x = torch.relu((W + W.T) / 2 - torch.abs(X - X.T))
+    overlap_y = torch.relu((H + H.T) / 2 - torch.abs(Y - Y.T))
+    overlap_area = overlap_x * overlap_y
 
-    # Placeholder - returns a constant loss (REPLACE THIS!)
-    return torch.tensor(1.0, requires_grad=True)
+    mask = torch.triu(torch.ones(N, N, device=DEVICE), diagonal=1)
+    return torch.sum(overlap_area * mask)
 
 
 def train_placement(
     cell_features,
     pin_features,
     edge_list,
-    num_epochs=1000,
-    lr=0.01,
-    lambda_wirelength=1.0,
-    lambda_overlap=10.0,
+    num_epochs=20000,
+    lr=0.2,
+    lambda_wirelength=5.0,
+    lambda_overlap=1000.0,
+    warmup_fraction=0.15,
     verbose=True,
     log_interval=100,
 ):
-    """Train the placement optimization using gradient descent.
-
-    Args:
-        cell_features: [N, 6] tensor with cell properties
-        pin_features: [P, 7] tensor with pin properties
-        edge_list: [E, 2] tensor with edge connectivity
-        num_epochs: Number of optimization iterations
-        lr: Learning rate for Adam optimizer
-        lambda_wirelength: Weight for wirelength loss
-        lambda_overlap: Weight for overlap loss
-        verbose: Whether to print progress
-        log_interval: How often to print progress
+    N = cell_features.shape[0]
 
-    Returns:
-        Dictionary with:
-            - final_cell_features: Optimized cell positions
-            - initial_cell_features: Original cell positions (for comparison)
-            - loss_history: Loss values over time
-    """
-    # Clone features and create learnable positions
     cell_features = cell_features.clone()
     initial_cell_features = cell_features.clone()
 
-    # Make only cell positions require gradients
-    cell_positions = cell_features[:, 2:4].clone().detach()
-    cell_positions.requires_grad_(True)
+    cell_features, virtual_macro = initialize_placement(cell_features, pin_features, edge_list)
+
+    cell_features = cell_features.to(DEVICE)
+    pin_features = pin_features.to(DEVICE)
+    edge_list = edge_list.to(DEVICE)
+
+    macro_indices = [i for i in range(N) if cell_features[i, CellFeatureIdx.AREA] >= MIN_MACRO_AREA]
+
+    # Two modes: optimize all cells jointly, or only macros
+    optimize_all = N <= 500
 
-    # Create optimizer
-    optimizer = optim.Adam([cell_positions], lr=lr)
+    if optimize_all:
+        positions = cell_features[:, 2:4].clone().detach()
+        virtual_macro = None  # No virtual macro — overlap computed on all cells
+    else:
+        positions = cell_features[macro_indices, 2:4].clone().detach()
+        virtual_macro = virtual_macro.to(DEVICE)
+
+    positions.requires_grad_(True)
 
-    # Track loss history
     loss_history = {
         "total_loss": [],
         "wirelength_loss": [],
         "overlap_loss": [],
     }
 
-    # Training loop
-    for epoch in range(num_epochs):
-        optimizer.zero_grad()
+    best_positions = positions.clone().detach()
+    best_wl_loss = float("inf")
 
-        # Create cell_features with current positions
-        cell_features_current = cell_features.clone()
-        cell_features_current[:, 2:4] = cell_positions
+    warmup_epochs = int(num_epochs * warmup_fraction)
+    overlap_epochs = num_epochs - warmup_epochs
 
-        # Calculate losses
-        wl_loss = wirelength_attraction_loss(
-            cell_features_current, pin_features, edge_list
-        )
-        overlap_loss = overlap_repulsion_loss(
-            cell_features_current, pin_features, edge_list
+    # ---- Phase 1: wirelength-only warmup with its own optimizer ----
+    if warmup_epochs > 0:
+        optimizer_wl = optim.Adam([positions], lr=lr)
+        scheduler_wl = optim.lr_scheduler.CosineAnnealingLR(
+            optimizer_wl, T_max=warmup_epochs, eta_min=1e-4
         )
 
-        # Combined loss
-        total_loss = lambda_wirelength * wl_loss + lambda_overlap * overlap_loss
+        for epoch in range(warmup_epochs):
+            optimizer_wl.zero_grad()
 
-        # Backward pass
-        total_loss.backward()
+            cell_features_current = cell_features.clone()
+            if optimize_all:
+                cell_features_current[:, 2:4] = positions
+            else:
+                cell_features_current[macro_indices, 2:4] = positions
+
+            wl_loss = wirelength_attraction_loss(cell_features_current, pin_features, edge_list)
+
+            with torch.no_grad():
+                if virtual_macro is not None:
+                    overlap_cells = torch.cat([cell_features_current[macro_indices], virtual_macro], dim=0)
+                    overlap_loss_val = overlap_repulsion_loss(overlap_cells).item()
+                else:
+                    overlap_loss_val = overlap_repulsion_loss(cell_features_current).item()
+
+            total_loss = lambda_wirelength * wl_loss
+            total_loss.backward()
+            torch.nn.utils.clip_grad_norm_([positions], max_norm=5.0)
+            optimizer_wl.step()
+            scheduler_wl.step()
+
+            loss_history["total_loss"].append(total_loss.item())
+            loss_history["wirelength_loss"].append(wl_loss.item())
+            loss_history["overlap_loss"].append(overlap_loss_val)
+
+            if verbose and (epoch % log_interval == 0 or epoch == warmup_epochs - 1):
+                print(f"Epoch {epoch}/{num_epochs} [WL-warmup]:")
+                print(f"  Wirelength Loss: {wl_loss.item():.10f}")
+                print(f"  Overlap (no grad): {overlap_loss_val:.10f}")
+
+    # ---- Phase 2: joint wirelength + overlap with fresh optimizer ----
+    optimizer = optim.Adam([positions], lr=lr)
+    scheduler = optim.lr_scheduler.CosineAnnealingLR(
+        optimizer, T_max=overlap_epochs, eta_min=1e-4
+    )
+
+    for epoch in range(overlap_epochs):
+        global_epoch = warmup_epochs + epoch
+        phase2_progress = min(1.0, epoch / (overlap_epochs * 0.6))
+        current_lambda_overlap = lambda_overlap * phase2_progress
+        current_lambda_wirelength = lambda_wirelength
+
+        optimizer.zero_grad()
 
-        # Gradient clipping to prevent extreme updates
-        torch.nn.utils.clip_grad_norm_([cell_positions], max_norm=5.0)
+        cell_features_current = cell_features.clone()
+        if optimize_all:
+            cell_features_current[:, 2:4] = positions
+        else:
+            cell_features_current[macro_indices, 2:4] = positions
 
-        # Update positions
+        wl_loss = wirelength_attraction_loss(cell_features_current, pin_features, edge_list)
+
+        if virtual_macro is not None:
+            overlap_cells = torch.cat([cell_features_current[macro_indices], virtual_macro], dim=0)
+            overlap_loss = overlap_repulsion_loss(overlap_cells)
+        else:
+            overlap_loss = overlap_repulsion_loss(cell_features_current)
+
+        if overlap_loss.item() < 1e-6 and wl_loss.item() < best_wl_loss:
+            best_wl_loss = wl_loss.item()
+            best_positions = positions.clone().detach()
+
+        total_loss = current_lambda_wirelength * wl_loss + current_lambda_overlap * overlap_loss
+        total_loss.backward()
+        torch.nn.utils.clip_grad_norm_([positions], max_norm=5.0)
         optimizer.step()
+        scheduler.step()
 
-        # Record losses
         loss_history["total_loss"].append(total_loss.item())
         loss_history["wirelength_loss"].append(wl_loss.item())
         loss_history["overlap_loss"].append(overlap_loss.item())
 
-        # Log progress
-        if verbose and (epoch % log_interval == 0 or epoch == num_epochs - 1):
-            print(f"Epoch {epoch}/{num_epochs}:")
-            print(f"  Total Loss: {total_loss.item():.6f}")
-            print(f"  Wirelength Loss: {wl_loss.item():.6f}")
-            print(f"  Overlap Loss: {overlap_loss.item():.6f}")
+        if verbose and (global_epoch % log_interval == 0 or epoch == overlap_epochs - 1):
+            print(f"Epoch {global_epoch}/{num_epochs} [WL+Overlap]:")
+            print(f"  Total Loss: {total_loss.item():.10f}")
+            print(f"  Wirelength Loss: {wl_loss.item():.10f}")
+            print(f"  Overlap Loss: {overlap_loss.item():.10f}")
 
-    # Create final cell features
-    final_cell_features = cell_features.clone()
-    final_cell_features[:, 2:4] = cell_positions.detach()
+    # Restore best checkpoint
+    cell_features_current = cell_features.clone()
+    if optimize_all:
+        cell_features_current[:, 2:4] = best_positions
+    else:
+        cell_features_current[macro_indices, 2:4] = best_positions
+    final_cell_features = cell_features_current.detach()
 
     return {
         "final_cell_features": final_cell_features,
@@ -458,7 +436,6 @@ def train_placement(
         "loss_history": loss_history,
     }
 
-
 # ======= FINAL EVALUATION CODE (Don't edit this part) =======
 
 def calculate_overlap_metrics(cell_features):
@@ -486,11 +463,10 @@ def calculate_overlap_metrics(cell_features):
             "overlap_percentage": 0.0,
         }
 
-    # Extract cell properties
-    positions = cell_features[:, 2:4].detach().numpy()  # [N, 2]
-    widths = cell_features[:, 4].detach().numpy()  # [N]
-    heights = cell_features[:, 5].detach().numpy()  # [N]
-    areas = cell_features[:, 0].detach().numpy()  # [N]
+    positions = cell_features[:, 2:4].detach().cpu().numpy()
+    widths = cell_features[:, 4].detach().cpu().numpy()
+    heights = cell_features[:, 5].detach().cpu().numpy()
+    areas = cell_features[:, 0].detach().cpu().numpy()
 
     overlap_count = 0
     total_overlap_area = 0.0
@@ -548,9 +524,9 @@ def calculate_cells_with_overlaps(cell_features):
         return set()
 
     # Extract cell properties
-    positions = cell_features[:, 2:4].detach().numpy()
-    widths = cell_features[:, 4].detach().numpy()
-    heights = cell_features[:, 5].detach().numpy()
+    positions = cell_features[:, 2:4].detach().cpu().numpy()
+    widths = cell_features[:, 4].detach().cpu().numpy()
+    heights = cell_features[:, 5].detach().cpu().numpy()
 
     cells_with_overlaps = set()
 
@@ -608,7 +584,7 @@ def calculate_normalized_metrics(cell_features, pin_features, edge_list):
         num_nets = 0
     else:
         # Calculate total wirelength using the loss function (unnormalized)
-        wl_loss = wirelength_attraction_loss(cell_features, pin_features, edge_list)
+        wl_loss = wirelength_attraction_loss(cell_features.to(DEVICE), pin_features.to(DEVICE), edge_list.to(DEVICE))
         total_wirelength = wl_loss.item() * edge_list.shape[0]  # Undo normalization
 
         # Calculate total area
@@ -632,8 +608,6 @@ def calculate_normalized_metrics(cell_features, pin_features, edge_list):
 def plot_placement(
     initial_cell_features,
     final_cell_features,
-    pin_features,
-    edge_list,
     filename="placement_result.png",
 ):
     """Create side-by-side visualization of initial vs final placement.
@@ -657,9 +631,9 @@ def plot_placement(
             (ax2, final_cell_features, "Final Placement"),
         ]:
             N = cell_features.shape[0]
-            positions = cell_features[:, 2:4].detach().numpy()
-            widths = cell_features[:, 4].detach().numpy()
-            heights = cell_features[:, 5].detach().numpy()
+            positions = cell_features[:, 2:4].detach().cpu().numpy()
+            widths = cell_features[:, 4].detach().cpu().numpy()
+            heights = cell_features[:, 5].detach().cpu().numpy()
 
             # Draw cells
             for i in range(N):
@@ -717,6 +691,7 @@ def main():
 
     # Set random seed for reproducibility
     torch.manual_seed(42)
+    torch.cuda.manual_seed_all(42)
 
     # Generate placement problem
     num_macros = 3
@@ -807,10 +782,8 @@ def main():
     plot_placement(
         result["initial_cell_features"],
         result["final_cell_features"],
-        pin_features,
-        edge_list,
         filename="placement_result.png",
     )
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/test-logs/output-seeded-1.txt b/test-logs/output-seeded-1.txt
new file mode 100644
index 0000000..ff28a45
Binary files /dev/null and b/test-logs/output-seeded-1.txt differ
diff --git a/test-logs/output-seeded-2.txt b/test-logs/output-seeded-2.txt
new file mode 100644
index 0000000..7212a1a
Binary files /dev/null and b/test-logs/output-seeded-2.txt differ
diff --git a/test-logs/output-seeded-3.txt b/test-logs/output-seeded-3.txt
new file mode 100644
index 0000000..7ce1309
Binary files /dev/null and b/test-logs/output-seeded-3.txt differ
diff --git a/test-logs/output-seeded-4.txt b/test-logs/output-seeded-4.txt
new file mode 100644
index 0000000..e1aaa4b
Binary files /dev/null and b/test-logs/output-seeded-4.txt differ
diff --git a/test-logs/output-seeded-5.txt b/test-logs/output-seeded-5.txt
new file mode 100644
index 0000000..e30f40c
Binary files /dev/null and b/test-logs/output-seeded-5.txt differ
diff --git a/test-logs/output-seeded-6.txt b/test-logs/output-seeded-6.txt
new file mode 100644
index 0000000..6573b53
Binary files /dev/null and b/test-logs/output-seeded-6.txt differ
diff --git a/test-logs/test-11-v2.txt b/test-logs/test-11-v2.txt
new file mode 100644
index 0000000..db541a5
Binary files /dev/null and b/test-logs/test-11-v2.txt differ
diff --git a/test-logs/test-11.txt b/test-logs/test-11.txt
new file mode 100644
index 0000000..104c2e7
Binary files /dev/null and b/test-logs/test-11.txt differ
diff --git a/test-placement-plots/test_placement_1.png b/test-placement-plots/test_placement_1.png
new file mode 100644
index 0000000..0c96c6f
Binary files /dev/null and b/test-placement-plots/test_placement_1.png differ
diff --git a/test-placement-plots/test_placement_10.png b/test-placement-plots/test_placement_10.png
new file mode 100644
index 0000000..ca528da
Binary files /dev/null and b/test-placement-plots/test_placement_10.png differ
diff --git a/test-placement-plots/test_placement_2.png b/test-placement-plots/test_placement_2.png
new file mode 100644
index 0000000..d646919
Binary files /dev/null and b/test-placement-plots/test_placement_2.png differ
diff --git a/test-placement-plots/test_placement_3.png b/test-placement-plots/test_placement_3.png
new file mode 100644
index 0000000..9751f6b
Binary files /dev/null and b/test-placement-plots/test_placement_3.png differ
diff --git a/test-placement-plots/test_placement_4.png b/test-placement-plots/test_placement_4.png
new file mode 100644
index 0000000..40d4f73
Binary files /dev/null and b/test-placement-plots/test_placement_4.png differ
diff --git a/test-placement-plots/test_placement_5.png b/test-placement-plots/test_placement_5.png
new file mode 100644
index 0000000..97ce060
Binary files /dev/null and b/test-placement-plots/test_placement_5.png differ
diff --git a/test-placement-plots/test_placement_6.png b/test-placement-plots/test_placement_6.png
new file mode 100644
index 0000000..ee7cda9
Binary files /dev/null and b/test-placement-plots/test_placement_6.png differ
diff --git a/test-placement-plots/test_placement_7.png b/test-placement-plots/test_placement_7.png
new file mode 100644
index 0000000..dc6c548
Binary files /dev/null and b/test-placement-plots/test_placement_7.png differ
diff --git a/test-placement-plots/test_placement_8.png b/test-placement-plots/test_placement_8.png
new file mode 100644
index 0000000..9d4aa8e
Binary files /dev/null and b/test-placement-plots/test_placement_8.png differ
diff --git a/test-placement-plots/test_placement_9.png b/test-placement-plots/test_placement_9.png
new file mode 100644
index 0000000..3b50d0f
Binary files /dev/null and b/test-placement-plots/test_placement_9.png differ
diff --git a/test.py b/test.py
index f22ff21..55436bc 100644
--- a/test.py
+++ b/test.py
@@ -27,6 +27,7 @@
     calculate_normalized_metrics,
     generate_placement_input,
     train_placement,
+    plot_placement,
 )
 
 
@@ -46,8 +47,8 @@
     (9, 8, 200, 1009),
     (10, 10, 2000, 1010),
     # Realistic designs
-    (11, 10, 10000, 1011),
-    (12, 10, 100000, 1012),
+    # (11, 10, 10000, 1011),
+    # (12, 10, 100000, 1012),
 ]
 
 
@@ -73,12 +74,15 @@ def run_placement_test(
     if seed:
         # Set seed for reproducibility
         torch.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)
 
     # Generate netlist
     cell_features, pin_features, edge_list = generate_placement_input(
         num_macros, num_std_cells
     )
 
+    initial_cell_features = cell_features.clone()  # Snapshot taken before position initialization; used for the initial-vs-final placement plot
+
     # Initialize positions with random spread
     total_cells = cell_features.shape[0]
     total_area = cell_features[:, 0].sum().item()
@@ -104,6 +108,8 @@ def run_placement_test(
     final_cell_features = result["final_cell_features"]
     metrics = calculate_normalized_metrics(final_cell_features, pin_features, edge_list)
 
+    plot_placement(initial_cell_features, final_cell_features, filename=f"test-placement-plots/test_placement_{test_id}.png")
+
     return {
         "test_id": test_id,
         "num_macros": num_macros,