From ed185d662ea2c7f9c7b67ad10d5386561818464b Mon Sep 17 00:00:00 2001
From: Richard Wang <wangrichard08@gmail.com>
Date: Wed, 1 Apr 2026 12:41:43 +0800
Subject: [PATCH] Add leaderboard entry: Richard Wang (WL=0.2666,
 overlap=0.0000)

---
 README.md    |   45 +-
 placement.py | 1464 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 1272 insertions(+), 237 deletions(-)

diff --git a/README.md b/README.md
index cf27bfb..9c08fd8 100644
--- a/README.md
+++ b/README.md
@@ -29,32 +29,33 @@ We will review submissions on a rolling basis.
 
 ## Leaderboard (sorted by overlap)
 
 | Rank | Name            | Overlap     | Wirelength (um) | Runtime (s) | Notes                |
 |------|-----------------|-------------|-----------------|-------------|----------------------|
 | 1    | Brayden Rudisill  | 0.0000    | 0.2611          |   50.51     |   Timed on a mac air |
 | 2    | manuhalapeth      | 0.0000    | 0.2630          |  196.8      |                      |
-| 3    | Neil Teje         | 0.0000    | 0.2700          | 24.00s      |                      |
-| 4    | Leison Gao      | 0.0000      | 0.2796          | 50.14s      |                      |
-| 5    | William Pan     | 0.0000      | 0.2848          | 155.33s     |                      |
-| 6    | Ashmit Dutta    | 0.0000      | 0.2870          | 995.58      |  Spent my entire morning (12 am - 6 am) doing this :P       |
-| 7    | Pawan Paleja     | 0.0000      | 0.3311         | 1.74s     |   Implemented hint for loss func, cosine annealing on learning rate with warmup, std annealing on lambda weight. Used optuna to tune hyperparam. Tested on gh codespaces 2-core.                   |
- 8   | Shashank Shriram  | 0.0000     | 0.3312          |  11.32      |   🏎️💥               |
-| 9    | Gabriel Del Monte  | 0.0000      | 0.3427          | 606.07      |                                                              |
-| 10    | Aleksey  Valouev| 0.0000      | 0.3577          | 118.98      |                      |        
-| 11   | Mohul Shukla    | 0.0000      | 0.5048          | 54.60s      |                      |
-| 12    | Ryan Hulke      | 0.0000      | 0.5226          | 166.24      |                      |
-| 13    | Neel  Shah      | 0.0000      | 0.5445          | 45.40       |  Zero overlaps on all tests, adaptive schedule + early stop |
-| 14   | Nawel Asgar    | 0.0000     | 0.5675          | 81.49      | Adaptive penalty scaling with cubic gradients and design-size optimization
-| 15   | Shiva Baghel     | 0.0000     | 0.5885          | 491.00      | Stable zero-overlap with balanced optimization      |
-| 16   | Vansh Jain      | 0.0000      | 0.9352          | 86.36       |                      |
-| 17    | Akash Pai       | 0.0006      | 0.4933          | 326.25s     |                      |
-| 18    | Zade Mahayni     | 0.00665     | 0.5157          |  127.4     | Will try again tomorrow |
-| 19    | Nithin Yanna    | 0.0148      | 0.5034          | 247.30s     | aggressive overlap penalty with quadratic scaling |
-| 20    | Sean Ko         | 0.0271      |  .5138          | 31.83s      | lr increase, decrease epoch, increase lambda overlap and decreased lambda wire_length + log penalty loss |
-| 21    | Keya Gohil    | 0.0155      | 0.4678         | 1513.07     | Still working |
-| 22    | Prithvi Seran   | 0.0499      | 0.4890          | 398.58      |                      |
-| 23    | partcl example  | 0.8         | 0.4             | 5           | example              |
-| 24    | Add Yours!      |             |                 |             |                      |
+| 3    | Richard Wang      | 0.0000    | 0.2666          | 124.86      |                      |
+| 4    | Neil Teje         | 0.0000    | 0.2700          | 24.00s      |                      |
+| 5    | Leison Gao      | 0.0000      | 0.2796          | 50.14s      |                      |
+| 6    | William Pan     | 0.0000      | 0.2848          | 155.33s     |                      |
+| 7    | Ashmit Dutta    | 0.0000      | 0.2870          | 995.58      |  Spent my entire morning (12 am - 6 am) doing this :P       |
+| 8    | Pawan Paleja     | 0.0000      | 0.3311         | 1.74s     |   Implemented hint for loss func, cosine annealing on learning rate with warmup, std annealing on lambda weight. Used optuna to tune hyperparam. Tested on gh codespaces 2-core.                   |
+| 9    | Shashank Shriram  | 0.0000     | 0.3312          |  11.32      |   🏎️💥               |
+| 10   | Gabriel Del Monte  | 0.0000      | 0.3427          | 606.07      |                                                              |
+| 11   | Aleksey  Valouev| 0.0000      | 0.3577          | 118.98      |                      |        
+| 12   | Mohul Shukla    | 0.0000      | 0.5048          | 54.60s      |                      |
+| 13   | Ryan Hulke      | 0.0000      | 0.5226          | 166.24      |                      |
+| 14   | Neel  Shah      | 0.0000      | 0.5445          | 45.40       |  Zero overlaps on all tests, adaptive schedule + early stop |
+| 15   | Nawel Asgar    | 0.0000     | 0.5675          | 81.49      | Adaptive penalty scaling with cubic gradients and design-size optimization |
+| 16   | Shiva Baghel     | 0.0000     | 0.5885          | 491.00      | Stable zero-overlap with balanced optimization      |
+| 17   | Vansh Jain      | 0.0000      | 0.9352          | 86.36       |                      |
+| 18   | Akash Pai       | 0.0006      | 0.4933          | 326.25s     |                      |
+| 19   | Zade Mahayni     | 0.00665     | 0.5157          |  127.4     | Will try again tomorrow |
+| 20   | Nithin Yanna    | 0.0148      | 0.5034          | 247.30s     | aggressive overlap penalty with quadratic scaling |
+| 21   | Sean Ko         | 0.0271      |  .5138          | 31.83s      | lr increase, decrease epoch, increase lambda overlap and decreased lambda wire_length + log penalty loss |
+| 22   | Keya Gohil    | 0.0155      | 0.4678         | 1513.07     | Still working |
+| 23   | Prithvi Seran   | 0.0499      | 0.4890          | 398.58      |                      |
+| 24   | partcl example  | 0.8         | 0.4             | 5           | example              |
+| 25   | Add Yours!      |             |                 |             |                      |
 
 > **To add your results:**  
 > Insert a new row in the table above with your name, overlap, wirelength, and any notes. Ensure you sort by overlap.
diff --git a/placement.py b/placement.py
index d70412d..e036fda 100644
--- a/placement.py
+++ b/placement.py
@@ -39,8 +39,10 @@
 """
 
 import os
+from collections import defaultdict
 from enum import IntEnum
 
+import numpy as np
 import torch
 import torch.optim as optim
 
@@ -144,8 +146,6 @@ def generate_placement_input(num_macros, num_std_cells):
     cell_features = torch.zeros(total_cells, 6)
     cell_features[:, CellFeatureIdx.AREA] = areas
     cell_features[:, CellFeatureIdx.NUM_PINS] = num_pins_per_cell.float()
-    cell_features[:, CellFeatureIdx.X] = 0.0  # x position (initialized to 0)
-    cell_features[:, CellFeatureIdx.Y] = 0.0  # y position (initialized to 0)
     cell_features[:, CellFeatureIdx.WIDTH] = cell_widths
     cell_features[:, CellFeatureIdx.HEIGHT] = cell_heights
 
@@ -156,85 +156,46 @@ def generate_placement_input(num_macros, num_std_cells):
     # Fixed pin size for all pins (square pins)
     PIN_SIZE = 0.1  # All pins are 0.1 x 0.1
 
-    pin_idx = 0
-    for cell_idx in range(total_cells):
-        n_pins = num_pins_per_cell[cell_idx].item()
-        cell_width = cell_widths[cell_idx].item()
-        cell_height = cell_heights[cell_idx].item()
-
-        # Generate random pin positions within the cell
-        # Offset from edges to ensure pins are fully inside
-        margin = PIN_SIZE / 2
-        if cell_width > 2 * margin and cell_height > 2 * margin:
-            pin_x = torch.rand(n_pins) * (cell_width - 2 * margin) + margin
-            pin_y = torch.rand(n_pins) * (cell_height - 2 * margin) + margin
-        else:
-            # For very small cells, just center the pins
-            pin_x = torch.full((n_pins,), cell_width / 2)
-            pin_y = torch.full((n_pins,), cell_height / 2)
-
-        # Fill pin features
-        pin_features[pin_idx : pin_idx + n_pins, PinFeatureIdx.CELL_IDX] = cell_idx
-        pin_features[pin_idx : pin_idx + n_pins, PinFeatureIdx.PIN_X] = (
-            pin_x  # relative to cell
-        )
-        pin_features[pin_idx : pin_idx + n_pins, PinFeatureIdx.PIN_Y] = (
-            pin_y  # relative to cell
-        )
-        pin_features[pin_idx : pin_idx + n_pins, PinFeatureIdx.X] = (
-            pin_x  # absolute (same as relative initially)
-        )
-        pin_features[pin_idx : pin_idx + n_pins, PinFeatureIdx.Y] = (
-            pin_y  # absolute (same as relative initially)
-        )
-        pin_features[pin_idx : pin_idx + n_pins, PinFeatureIdx.WIDTH] = PIN_SIZE
-        pin_features[pin_idx : pin_idx + n_pins, PinFeatureIdx.HEIGHT] = PIN_SIZE
-
-        pin_idx += n_pins
+    pin_cell_idx = torch.repeat_interleave(
+        torch.arange(total_cells), num_pins_per_cell
+    )
+    pin_cell_w = cell_widths[pin_cell_idx]
+    pin_cell_h = cell_heights[pin_cell_idx]
+    margin = PIN_SIZE / 2
+    raw_rx = torch.rand(total_pins)
+    raw_ry = torch.rand(total_pins)
+    usable_w = torch.clamp(pin_cell_w - 2 * margin, min=0.0)
+    usable_h = torch.clamp(pin_cell_h - 2 * margin, min=0.0)
+    has_space = (usable_w > 0) & (usable_h > 0)
+    pin_x = torch.where(has_space, margin + raw_rx * usable_w, pin_cell_w / 2)
+    pin_y = torch.where(has_space, margin + raw_ry * usable_h, pin_cell_h / 2)
+
+    # Fill pin features
+    pin_features[:, PinFeatureIdx.CELL_IDX] = pin_cell_idx.float()
+    pin_features[:, PinFeatureIdx.PIN_X] = pin_x
+    pin_features[:, PinFeatureIdx.PIN_Y] = pin_y
+    pin_features[:, PinFeatureIdx.X] = pin_x
+    pin_features[:, PinFeatureIdx.Y] = pin_y
+    pin_features[:, PinFeatureIdx.WIDTH] = PIN_SIZE
+    pin_features[:, PinFeatureIdx.HEIGHT] = PIN_SIZE
 
     # Step 7: Generate edges with simple random connectivity
-    # Each pin connects to 1-3 random pins (preferring different cells)
-    edge_list = []
-    avg_edges_per_pin = 2.0
-
-    pin_to_cell = torch.zeros(total_pins, dtype=torch.long)
-    pin_idx = 0
-    for cell_idx, n_pins in enumerate(num_pins_per_cell):
-        pin_to_cell[pin_idx : pin_idx + n_pins] = cell_idx
-        pin_idx += n_pins
-
-    # Create adjacency set to avoid duplicate edges
-    adjacency = [set() for _ in range(total_pins)]
-
-    for pin_idx in range(total_pins):
-        pin_cell = pin_to_cell[pin_idx].item()
-        num_connections = torch.randint(1, 4, (1,)).item()  # 1-3 connections per pin
-
-        # Try to connect to pins from different cells
-        for _ in range(num_connections):
-            # Random candidate
-            other_pin = torch.randint(0, total_pins, (1,)).item()
-
-            # Skip self-connections and existing connections
-            if other_pin == pin_idx or other_pin in adjacency[pin_idx]:
-                continue
-
-            # Add edge (always store smaller index first for consistency)
-            if pin_idx < other_pin:
-                edge_list.append([pin_idx, other_pin])
-            else:
-                edge_list.append([other_pin, pin_idx])
-
-            # Update adjacency
-            adjacency[pin_idx].add(other_pin)
-            adjacency[other_pin].add(pin_idx)
-
-    # Convert to tensor and remove duplicates
-    if edge_list:
-        edge_list = torch.tensor(edge_list, dtype=torch.long)
-        edge_list = torch.unique(edge_list, dim=0)
-    else:
-        edge_list = torch.zeros((0, 2), dtype=torch.long)
+    num_conn_per_pin = torch.randint(1, 4, (total_pins,))
+    total_candidates = num_conn_per_pin.sum().item()
+    src_pins = torch.repeat_interleave(
+        torch.arange(total_pins), num_conn_per_pin
+    )
+    tgt_pins = torch.randint(0, total_pins, (total_candidates,))
+    valid = src_pins != tgt_pins
+    src_pins = src_pins[valid]
+    tgt_pins = tgt_pins[valid]
+    lo = torch.min(src_pins, tgt_pins)
+    hi = torch.max(src_pins, tgt_pins)
+    edge_hash = lo.long() * total_pins + hi.long()
+    edge_hash = torch.unique(edge_hash)
+    edge_list = torch.stack(
+        [edge_hash // total_pins, edge_hash % total_pins], dim=1
+    )
 
     print(f"\nGenerated placement data:")
     print(f"  Total cells: {total_cells}")
@@ -277,16 +238,11 @@ def wirelength_attraction_loss(cell_features, pin_features, edge_list):
     src_pins = edge_list[:, 0].long()
     tgt_pins = edge_list[:, 1].long()
 
-    src_x = pin_absolute_x[src_pins]
-    src_y = pin_absolute_y[src_pins]
-    tgt_x = pin_absolute_x[tgt_pins]
-    tgt_y = pin_absolute_y[tgt_pins]
-
     # Calculate smooth approximation of Manhattan distance
     # Using log-sum-exp approximation for differentiability
     alpha = 0.1  # Smoothing parameter
-    dx = torch.abs(src_x - tgt_x)
-    dy = torch.abs(src_y - tgt_y)
+    dx = torch.abs(pin_absolute_x[src_pins] - pin_absolute_x[tgt_pins])
+    dy = torch.abs(pin_absolute_y[src_pins] - pin_absolute_y[tgt_pins])
 
     # Smooth L1 distance with numerical stability
     smooth_manhattan = alpha * torch.logsumexp(
@@ -294,9 +250,858 @@ def wirelength_attraction_loss(cell_features, pin_features, edge_list):
     )
 
     # Total wirelength
-    total_wirelength = torch.sum(smooth_manhattan)
+    return torch.sum(smooth_manhattan) / edge_list.shape[0]  # Normalize by number of edges
+
+
+def _analytical_place(cell_features, pin_features, edge_list, iters=120):
+    """Connectivity-driven initial placement by damped neighbour averaging.
+
+    Builds a cell-to-cell adjacency from the pin edges, then repeatedly
+    blends each cell's position with the mean of its neighbours' positions.
+
+    Args:
+        cell_features: [N, 6] tensor (column 0 is treated as cell area)
+        pin_features: [P, 7] tensor (column 0 is treated as owning cell index)
+        edge_list: [E, 2] tensor of connected pin-index pairs
+        iters: Number of averaging iterations
+
+    Returns:
+        [N, 2] tensor with initial cell positions (all zeros if E == 0)
+    """
+    N = cell_features.shape[0]
+    if edge_list.shape[0] == 0:
+        return torch.zeros(N, 2)
+
+    cell_idx = pin_features[:, 0].long()
+    src_cells = cell_idx[edge_list[:, 0].long()]
+    tgt_cells = cell_idx[edge_list[:, 1].long()]
+    valid = src_cells != tgt_cells  # drop edges whose pins share a cell
+    src_cells = src_cells[valid]
+    tgt_cells = tgt_cells[valid]
+
+    row = torch.cat([src_cells, tgt_cells])  # symmetric: add both directions
+    col = torch.cat([tgt_cells, src_cells])
+    vals = torch.ones(row.shape[0], dtype=torch.float32)
+    adj = torch.sparse_coo_tensor(
+        torch.stack([row, col]), vals, (N, N)
+    ).coalesce()  # parallel edges are summed into one weighted entry
+
+    degree = torch.clamp(torch.sparse.sum(adj, dim=1).to_dense(), min=1.0)
+    inv_deg = 1.0 / degree  # degree clamped to >= 1, so never divides by 0
+    total_area = cell_features[:, 0].sum().item()
+    spread = (total_area ** 0.5) * 0.1
+
+    pos = torch.zeros(N, 2)
+    pos[:, 0] = torch.linspace(-spread, spread, N)  # x seeded by cell index
+    pos[:, 1] = torch.randn(N) * spread * 0.1  # small random y jitter
+
+    for _ in range(iters):
+        new_pos = torch.sparse.mm(adj, pos) * inv_deg.unsqueeze(1)
+        pos = 0.8 * new_pos + 0.2 * pos  # 80% neighbour mean, 20% old position
+
+    return pos
+
+
+def _legalize_from_analytical(
+    cell_features, analytical_pos, pin_features=None, edge_list=None
+):
+    """Row-based legalization (Tetris/Abacus-style).
+
+    Packs macros (height > 1.5) largest-area first, then standard cells above
+    them ordered by floor-binned 0.8-tall rows of their ideal y, then by x.
+
+    Args:
+        cell_features: [N, 6] tensor with cell properties
+        analytical_pos: [N, 2] tensor or array with analytical positions
+        pin_features: Optional pin features for connectivity-aware placement
+        edge_list: Optional edge list for connectivity-aware placement
+
+    Returns:
+        [N, 2] tensor with legalized cell positions
+    """
+    N = cell_features.shape[0]
+    widths = cell_features[:, 4].numpy().astype(np.float64)
+    heights = cell_features[:, 5].numpy().astype(np.float64)
+    areas = cell_features[:, 0].numpy().astype(np.float64)
+    hh = heights / 2.0
+    total_area = float(areas.sum())
+    target_width = (total_area * 1.5) ** 0.5  # row wrap width
+    margin = 0.05  # gap left between neighbouring cells and rows
+
+    is_macro = heights > 1.5
+    macro_idx = np.where(is_macro)[0]
+    std_idx = np.where(~is_macro)[0]
+    Nm, Ns = len(macro_idx), len(std_idx)
+    positions = np.zeros((N, 2))
+
+    apos = (
+        analytical_pos.numpy().astype(np.float64)
+        if isinstance(analytical_pos, torch.Tensor)
+        else analytical_pos.astype(np.float64)
+    )
+
+    # Place macros by area (largest first)
+    if Nm > 0:
+        macro_order = macro_idx[np.argsort(areas[macro_idx])[::-1]]
+        x_pos, y_pos, row_height = 0.0, 0.0, 0.0
+        for mi in macro_order:
+            w, h = widths[mi], heights[mi]
+            if x_pos + w > target_width and x_pos > 0:
+                y_pos += row_height + margin
+                x_pos = 0.0
+                row_height = 0.0
+            positions[mi, 0] = x_pos + w / 2
+            positions[mi, 1] = y_pos + h / 2
+            x_pos += w + margin
+            row_height = max(row_height, h)
+
+    # Place standard cells in rows
+    if Ns > 0:
+        ideal_pos = apos[std_idx].copy()
+
+        # Adjust positions based on macro connectivity
+        if (
+            pin_features is not None
+            and edge_list is not None
+            and edge_list.shape[0] > 0
+        ):
+            cidx = pin_features[:, 0].long().numpy()
+            src_cells = cidx[edge_list[:, 0].long().numpy()]
+            tgt_cells = cidx[edge_list[:, 1].long().numpy()]
+            valid = src_cells != tgt_cells
+            sc, tc = src_cells[valid], tgt_cells[valid]
+
+            macro_target_sum = np.zeros((N, 2))
+            macro_target_cnt = np.zeros(N)
+
+            mask1 = is_macro[sc] & ~is_macro[tc]
+            np.add.at(macro_target_sum, tc[mask1], positions[sc[mask1]])
+            np.add.at(macro_target_cnt, tc[mask1], 1.0)
+
+            mask2 = is_macro[tc] & ~is_macro[sc]
+            np.add.at(macro_target_sum, sc[mask2], positions[tc[mask2]])
+            np.add.at(macro_target_cnt, sc[mask2], 1.0)
+
+            for si, ci in enumerate(std_idx):
+                if macro_target_cnt[ci] > 0:
+                    macro_center = macro_target_sum[ci] / macro_target_cnt[ci]
+                    ideal_pos[si] = 0.7 * macro_center + 0.3 * apos[ci]
+
+        row_id = np.floor(ideal_pos[:, 1] / 0.8).astype(np.int64)  # not trunc
+        sort_key = row_id.astype(np.float64) * 1e6 + ideal_pos[:, 0]
+        std_order = np.argsort(sort_key)  # row-major, then left to right
+
+        std_y_start = (
+            (max(positions[mi, 1] + hh[mi] for mi in macro_idx) + margin)
+            if Nm > 0
+            else 0.0
+        )
+        x_pos, y_pos, row_height = 0.0, std_y_start, 0.0
+
+        for rank in std_order:
+            ci = std_idx[rank]
+            w, h = widths[ci], heights[ci]
+            if x_pos + w > target_width and x_pos > 0:
+                y_pos += row_height + margin
+                x_pos = 0.0
+                row_height = 0.0
+            positions[ci, 0] = x_pos + w / 2
+            positions[ci, 1] = y_pos + h / 2
+            x_pos += w + margin
+            row_height = max(row_height, h)
+
+    return torch.from_numpy(positions).float()
+
+
+def _push_overlapping(
+    positions, widths, heights, idx_a, idx_b, displacements, areas=None
+):
+    """Accumulate push-apart displacements for overlapping cell pairs.
+
+    Each overlapping pair is separated along its axis of smaller overlap by
+    the overlap plus 0.02; `positions` is only read, `displacements` mutated.
+
+    Args:
+        positions: [N, 2] array of cell positions
+        widths: [N] array of cell widths
+        heights: [N] array of cell heights
+        idx_a: Array of first cell indices in pairs
+        idx_b: Array of second cell indices in pairs
+        displacements: [N, 2] array to accumulate displacement vectors
+        areas: Optional [N] array; the smaller cell takes the larger share
+
+    Returns:
+        True if at least one pair overlaps, False otherwise
+    """
+    dx = positions[idx_a, 0] - positions[idx_b, 0]
+    dy = positions[idx_a, 1] - positions[idx_b, 1]
+    adx, ady = np.abs(dx), np.abs(dy)
+    min_sep_x = (widths[idx_a] + widths[idx_b]) / 2
+    min_sep_y = (heights[idx_a] + heights[idx_b]) / 2
+    overlap_x, overlap_y = min_sep_x - adx, min_sep_y - ady
+    overlapping = (overlap_x > 0) & (overlap_y > 0)  # touching is not overlap
+
+    if not overlapping.any():
+        return False
+
+    ov_x, ov_y = overlap_x[overlapping], overlap_y[overlapping]
+    ia, ib = idx_a[overlapping], idx_b[overlapping]
+    d_x, d_y = dx[overlapping], dy[overlapping]
+
+    if areas is not None:
+        total = areas[ia] + areas[ib]
+        frac_a = np.clip(areas[ib] / total, 0.2, 0.8)  # each side moves >= 20%
+        frac_b = 1.0 - frac_a
+    else:
+        frac_a = frac_b = np.full(len(ia), 0.5)
+
+    px_mask = (ov_x <= ov_y).astype(np.float64)  # 1.0 where x is the cheaper axis
+    py_mask = 1.0 - px_mask
+
+    xs = np.sign(d_x)
+    xs[xs == 0] = 1.0  # coincident centres: pick a direction arbitrarily
+    np.add.at(displacements[:, 0], ia, (ov_x + 0.02) * px_mask * frac_a * xs)
+    np.add.at(displacements[:, 0], ib, -(ov_x + 0.02) * px_mask * frac_b * xs)
+
+    ys = np.sign(d_y)
+    ys[ys == 0] = 1.0  # coincident centres: pick a direction arbitrarily
+    np.add.at(displacements[:, 1], ia, (ov_y + 0.02) * py_mask * frac_a * ys)
+    np.add.at(displacements[:, 1], ib, -(ov_y + 0.02) * py_mask * frac_b * ys)
+
+    return True
+
+
+def _resolve_overlaps(cell_features, max_iters=300):
+    """Iterative overlap resolution via displacement forces.
+
+    Each pass accumulates push-apart displacements for every overlapping
+    pair found, applies them, and stops once a pass finds no overlap.
+
+    Args:
+        cell_features: [N, 6] tensor; columns 2:4 = x, y and 4, 5 = width, height
+        max_iters: Maximum number of resolution iterations
+
+    Returns:
+        Clone of cell_features with columns 2:4 replaced by the new positions
+    """
+    positions = cell_features[:, 2:4].detach().clone().numpy().astype(np.float64)
+    widths = cell_features[:, 4].detach().numpy().astype(np.float64)
+    heights = cell_features[:, 5].detach().numpy().astype(np.float64)
+    cell_areas = widths * heights
+
+    is_macro = heights > 1.5
+    macro_idx, std_idx = np.where(is_macro)[0], np.where(~is_macro)[0]
+    Nm, Ns = len(macro_idx), len(std_idx)
+    max_std_w = widths[std_idx].max() if Ns > 0 else 0.0
+    max_std_h = heights[std_idx].max() if Ns > 0 else 0.0  # bound for y sweeps
+
+    for iteration in range(max_iters):
+        any_overlap = False
+        displacements = np.zeros_like(positions)
+
+        # Macro-macro overlaps
+        if Nm > 1:
+            for ii in range(Nm):
+                for jj in range(ii + 1, Nm):
+                    i, j = macro_idx[ii], macro_idx[jj]
+                    hit = _push_overlapping(
+                        positions, widths, heights,
+                        np.array([i]), np.array([j]),
+                        displacements, areas=cell_areas,
+                    )
+                    any_overlap = any_overlap or hit
+
+        # Macro-standard cell overlaps
+        if Nm > 0 and Ns > 0:
+            for mi in macro_idx:
+                dx = np.abs(positions[std_idx, 0] - positions[mi, 0])
+                dy = np.abs(positions[std_idx, 1] - positions[mi, 1])
+                possible = (
+                    (dx < (widths[mi] + widths[std_idx]) / 2)
+                    & (dy < (heights[mi] + heights[std_idx]) / 2)
+                )
+                if possible.any():
+                    nearby = std_idx[possible]
+                    hit = _push_overlapping(
+                        positions, widths, heights,
+                        np.full(len(nearby), mi, dtype=np.intp), nearby,
+                        displacements, areas=cell_areas,
+                    )
+                    any_overlap = any_overlap or hit
+
+        # Standard cell-standard cell overlaps (sweep-based)
+        if Ns > 1:
+            dim = iteration % 2  # alternate the sort axis: x on even, y on odd
+            order = np.argsort(positions[std_idx, dim])
+            sg = std_idx[order]
+            sp, sw, sh = positions[sg], widths[sg], heights[sg]
+
+            for k in range(1, min(200, Ns)):  # pair each cell with its k-th next
+                n = Ns - k
+                gap = sp[k:, dim] - sp[:n, dim]
+                if gap.min() > (max_std_h if dim else max_std_w):
+                    break  # gaps only grow with k: no farther pair can overlap
+                adx = np.abs(sp[k:, 0] - sp[:n, 0])
+                ady = np.abs(sp[k:, 1] - sp[:n, 1])
+                ov_x = (sw[:n] + sw[k:]) / 2 - adx
+                ov_y = (sh[:n] + sh[k:]) / 2 - ady
+                hit = (ov_x > 0) & (ov_y > 0)
+
+                if not hit.any():
+                    continue
+
+                any_overlap = True
+                ox, oy = ov_x[hit], ov_y[hit]
+                ia, ib = sg[:n][hit], sg[k:][hit]
+                px_mask = (ox <= oy).astype(np.float64)
+                py_mask = 1.0 - px_mask
+
+                d_x = sp[:n, 0][hit] - sp[k:, 0][hit]
+                xs = np.sign(d_x)
+                xs[xs == 0] = 1.0
+                np.add.at(
+                    displacements[:, 0], ia,
+                    (ox + 0.02) * px_mask * 0.5 * xs,
+                )
+                np.add.at(
+                    displacements[:, 0], ib,
+                    -(ox + 0.02) * px_mask * 0.5 * xs,
+                )
+
+                d_y = sp[:n, 1][hit] - sp[k:, 1][hit]
+                ys = np.sign(d_y)
+                ys[ys == 0] = 1.0
+                np.add.at(
+                    displacements[:, 1], ia,
+                    (oy + 0.02) * py_mask * 0.5 * ys,
+                )
+                np.add.at(
+                    displacements[:, 1], ib,
+                    -(oy + 0.02) * py_mask * 0.5 * ys,
+                )
+
+        if not any_overlap:
+            break
+        positions += displacements
+
+    result = cell_features.clone()
+    result[:, 2:4] = torch.from_numpy(positions).float()
+    return result
+
+
+class _SpatialGrid:
+    """Uniform-grid spatial index used for cell overlap queries.
+
+    Buckets every cell by the grid square containing its centre; macros are
+    also kept in a flat list that each overlap query scans in full.
+    """
+    __slots__ = ('grid', 'cell_keys', 'gs', 'macro_list')
+
+    def __init__(self, pos, is_macro, grid_size):
+        self.gs = grid_size
+        self.grid = defaultdict(list)  # (gx, gy) -> indices of cells inside
+        self.cell_keys = {}  # cell index -> its current (gx, gy) bucket key
+        self.macro_list = list(np.where(is_macro)[0])
+        for i in range(len(pos)):
+            key = (int(pos[i, 0] // grid_size), int(pos[i, 1] // grid_size))
+            self.cell_keys[i] = key
+            self.grid[key].append(i)
+
+    def update(self, i, old_x, old_y, new_x, new_y):  # re-bucket a moved cell
+        gs = self.gs
+        old_key = (int(old_x // gs), int(old_y // gs))
+        new_key = (int(new_x // gs), int(new_y // gs))
+        if old_key != new_key:
+            try:
+                self.grid[old_key].remove(i)
+            except ValueError:
+                pass  # cell was not in the old bucket; nothing to remove
+            if not self.grid[old_key]:
+                del self.grid[old_key]  # drop empty buckets
+            self.cell_keys[i] = new_key
+            self.grid[new_key].append(i)
+
+    def check_overlap(self, i, nx, ny, pos, hw, hh, sr):  # True on any overlap
+        gs = self.gs
+        gx, gy = int(nx // gs), int(ny // gs)
+        hwi, hhi = hw[i], hh[i]  # hw/hh are compared as half-extents
+        for mi in self.macro_list:
+            if (
+                mi != i
+                and abs(nx - pos[mi, 0]) < hwi + hw[mi]
+                and abs(ny - pos[mi, 1]) < hhi + hh[mi]
+            ):
+                return True
+        for dx in range(-sr, sr + 1):  # scan buckets within sr squares
+            for dy in range(-sr, sr + 1):
+                bucket = self.grid.get((gx + dx, gy + dy))
+                if bucket is None:
+                    continue
+                for j in bucket:
+                    if (
+                        j != i
+                        and abs(nx - pos[j, 0]) < hwi + hw[j]
+                        and abs(ny - pos[j, 1]) < hhi + hh[j]
+                    ):
+                        return True
+        return False
+
+    def check_overlap_skip(self, i, nx, ny, pos, hw, hh, sr, skip):  # ignores skip
+        gs = self.gs
+        gx, gy = int(nx // gs), int(ny // gs)
+        hwi, hhi = hw[i], hh[i]  # hw/hh are compared as half-extents
+        for mi in self.macro_list:
+            if (
+                mi != i
+                and mi != skip
+                and abs(nx - pos[mi, 0]) < hwi + hw[mi]
+                and abs(ny - pos[mi, 1]) < hhi + hh[mi]
+            ):
+                return True
+        for dx in range(-sr, sr + 1):  # scan buckets within sr squares
+            for dy in range(-sr, sr + 1):
+                bucket = self.grid.get((gx + dx, gy + dy))
+                if bucket is None:
+                    continue
+                for j in bucket:
+                    if (
+                        j != i
+                        and j != skip
+                        and abs(nx - pos[j, 0]) < hwi + hw[j]
+                        and abs(ny - pos[j, 1]) < hhi + hh[j]
+                    ):
+                        return True
+        return False
+
+
+def _swap_refine(cell_features, pin_features, edge_list, max_passes=3):
+    """Detailed placement: pairwise cell swapping (FastDP-style).
+
+    Tries swapping pairs of cells and keeps swaps that reduce wirelength
+    without introducing overlaps.
+
+    Args:
+        cell_features: [N, 6] tensor with cell properties
+        pin_features: [P, 7] tensor with pin properties
+        edge_list: [E, 2] tensor with edge connectivity
+        max_passes: Maximum number of swap passes
+
+    Returns:
+        Updated cell_features tensor with refined positions
+    """
+    N = cell_features.shape[0]
+    if N > 2500 or edge_list.shape[0] == 0:
+        return cell_features
+
+    pos = cell_features[:, 2:4].detach().clone().numpy().astype(np.float64)
+    w = cell_features[:, 4].detach().numpy().astype(np.float64)
+    h = cell_features[:, 5].detach().numpy().astype(np.float64)
+    hw, hh = w / 2.0, h / 2.0
+
+    cidx = pin_features[:, 0].long().numpy()
+    prx = pin_features[:, 1].detach().numpy().astype(np.float64)
+    pry = pin_features[:, 2].detach().numpy().astype(np.float64)
+    src = edge_list[:, 0].long().numpy()
+    tgt = edge_list[:, 1].long().numpy()
+    E = len(src)
+
+    px, py = pos[cidx, 0] + prx, pos[cidx, 1] + pry
+    src_cells, tgt_cells = cidx[src], cidx[tgt]
+
+    # Build per-cell pin and edge indices
+    pin_order = np.argsort(cidx)
+    sorted_cidx = cidx[pin_order]
+    pin_starts = np.searchsorted(sorted_cidx, np.arange(N), side='left')
+    pin_ends = np.searchsorted(sorted_cidx, np.arange(N), side='right')
+
+    ec_all = np.concatenate([src_cells, tgt_cells])
+    ei_all = np.concatenate([np.arange(E), np.arange(E)])
+    eord = np.argsort(ec_all)
+    sec, sei = ec_all[eord], ei_all[eord]
+    es = np.searchsorted(sec, np.arange(N), side='left')
+    ee = np.searchsorted(sec, np.arange(N), side='right')
+    cedges = [
+        np.unique(sei[es[c]:ee[c]]).astype(np.intp) for c in range(N)
+    ]
+
+    # Build swap candidate list
+    valid_edges = src_cells != tgt_cells
+    sc_v, tc_v = src_cells[valid_edges], tgt_cells[valid_edges]
+    lo_e, hi_e = np.minimum(sc_v, tc_v), np.maximum(sc_v, tc_v)
+    ukeys = np.unique(lo_e.astype(np.int64) * N + hi_e.astype(np.int64))
+    swap_list = list(zip(
+        (ukeys // N).astype(np.intp).tolist(),
+        (ukeys % N).astype(np.intp).tolist(),
+    ))
+
+    if N <= 500:
+        existing = set(swap_list)
+        for i in range(N):
+            for j in range(i + 1, N):
+                existing.add((i, j))
+        swap_list = sorted(existing)
+
+    is_macro = h > 1.5
+    use_grid = N > 300
+
+    if use_grid:
+        max_std_hw = hw[~is_macro].max() if (~is_macro).any() else 1.0
+        grid_size = max(max_std_hw * 4, 2.0)
+        sgrid = _SpatialGrid(pos, is_macro, grid_size)
+        sr_std = 2
+        sr_macro = (
+            int(np.ceil((hw[is_macro].max() + max_std_hw) / grid_size)) + 1
+            if is_macro.any()
+            else 2
+        )
+
+        def no_ov(ci, nx, ny, skip):
+            sr = sr_macro if is_macro[ci] else sr_std
+            return not sgrid.check_overlap_skip(
+                ci, nx, ny, pos, hw, hh, sr, skip
+            )
+    else:
+        sgrid = None
+
+        def no_ov(ci, nx, ny, skip):
+            ox = np.abs(nx - pos[:, 0])
+            oy = np.abs(ny - pos[:, 1])
+            ox[ci] = 1e18
+            ox[skip] = 1e18
+            return not np.any((ox < hw[ci] + hw) & (oy < hh[ci] + hh))
+
+    for _ in range(max_passes):
+        improved = False
+
+        for i, j in swap_list:
+            if not no_ov(i, pos[j, 0], pos[j, 1], j):
+                continue
+            if not no_ov(j, pos[i, 0], pos[i, 1], i):
+                continue
+
+            ae = np.union1d(cedges[i], cedges[j])
+            ad = np.abs(px[src[ae]] - px[tgt[ae]])
+            bd = np.abs(py[src[ae]] - py[tgt[ae]])
+            mx = np.maximum(ad, bd)
+            old_wl = (
+                0.1 * np.log(np.exp((ad - mx) * 10) + np.exp((bd - mx) * 10))
+                + mx
+            ).sum()
+
+            oix, oiy = pos[i, 0], pos[i, 1]
+            ojx, ojy = pos[j, 0], pos[j, 1]
+            pos[i, 0], pos[j, 0] = ojx, oix
+            pos[i, 1], pos[j, 1] = ojy, oiy
+
+            pi = pin_order[pin_starts[i]:pin_ends[i]]
+            pj = pin_order[pin_starts[j]:pin_ends[j]]
+            if len(pi):
+                px[pi] = pos[i, 0] + prx[pi]
+                py[pi] = pos[i, 1] + pry[pi]
+            if len(pj):
+                px[pj] = pos[j, 0] + prx[pj]
+                py[pj] = pos[j, 1] + pry[pj]
+
+            ad = np.abs(px[src[ae]] - px[tgt[ae]])
+            bd = np.abs(py[src[ae]] - py[tgt[ae]])
+            mx = np.maximum(ad, bd)
+            new_wl = (
+                0.1 * np.log(np.exp((ad - mx) * 10) + np.exp((bd - mx) * 10))
+                + mx
+            ).sum()
+
+            if new_wl < old_wl - 1e-6:
+                improved = True
+                if sgrid:
+                    sgrid.update(i, oix, oiy, pos[i, 0], pos[i, 1])
+                    sgrid.update(j, ojx, ojy, pos[j, 0], pos[j, 1])
+            else:
+                pos[i, 0], pos[j, 0] = oix, ojx
+                pos[i, 1], pos[j, 1] = oiy, ojy
+                if len(pi):
+                    px[pi] = oix + prx[pi]
+                    py[pi] = oiy + pry[pi]
+                if len(pj):
+                    px[pj] = ojx + prx[pj]
+                    py[pj] = ojy + pry[pj]
+
+        if not improved:
+            break
+
+    result = cell_features.clone()
+    result[:, 2:4] = torch.from_numpy(pos).float()
+    return result
+
+
+def _slide_refine(cell_features, pin_features, edge_list, max_passes=15):
+    """Detailed placement: greedy single-cell sliding (NTUPlace3-style).
+
+    Each cell is nudged toward its connected pins; a move is kept only if it
+    lowers that cell's log-sum-exp wirelength and overlaps no other cell.
+
+    Args:
+        cell_features: [N, 6] tensor; columns 2:4 hold x/y, 4 and 5 hold w/h
+        pin_features: [P, 7] tensor; columns 0..2 hold cell index, x/y offset
+        edge_list: [E, 2] tensor of pin-index pairs
+        max_passes: Maximum number of sliding passes over all cells
+
+    Returns:
+        New cell_features tensor with refined positions (input is not mutated)
+    """
+    N = cell_features.shape[0]
+    if edge_list.shape[0] == 0:
+        return cell_features.clone()  # like the main path, never alias input
+
+    pos = cell_features[:, 2:4].detach().clone().numpy().astype(np.float64)
+    w = cell_features[:, 4].detach().numpy().astype(np.float64)
+    h = cell_features[:, 5].detach().numpy().astype(np.float64)
+    hw, hh = w / 2.0, h / 2.0
+
+    cidx = pin_features[:, 0].detach().long().numpy()
+    prx = pin_features[:, 1].detach().numpy().astype(np.float64)
+    pry = pin_features[:, 2].detach().numpy().astype(np.float64)
+    src = edge_list[:, 0].long().numpy()
+    tgt = edge_list[:, 1].long().numpy()
+    E = len(src)
+
+    px, py = pos[cidx, 0] + prx, pos[cidx, 1] + pry  # absolute pin coords
+
+    # Build per-cell pin and edge indices
+    pin_order = np.argsort(cidx)
+    sorted_cidx = cidx[pin_order]
+    pin_starts = np.searchsorted(sorted_cidx, np.arange(N), side='left')
+    pin_ends = np.searchsorted(sorted_cidx, np.arange(N), side='right')
+
+    src_cells, tgt_cells = cidx[src], cidx[tgt]
+    ec_all = np.concatenate([src_cells, tgt_cells])
+    ei_all = np.concatenate([np.arange(E), np.arange(E)])
+    eord = np.argsort(ec_all)
+    sec, sei = ec_all[eord], ei_all[eord]
+    es = np.searchsorted(sec, np.arange(N), side='left')
+    ee = np.searchsorted(sec, np.arange(N), side='right')
+    cedge_arr = [
+        np.unique(sei[es[c]:ee[c]]).astype(np.intp) for c in range(N)
+    ]
 
-    return total_wirelength / edge_list.shape[0]  # Normalize by number of edges
+    # Build inter-cell partner lists (pins on *other* cells, one per edge)
+    inter_partners = [None] * N
+    for i in range(N):
+        ei = cedge_arr[i]
+        if len(ei) == 0:
+            continue
+        si, ti = src[ei], tgt[ei]
+        mask = cidx[si] != cidx[ti]
+        if not mask.any():
+            continue
+        ie = ei[mask]
+        s, t = src[ie], tgt[ie]
+        inter_partners[i] = np.where(cidx[s] == i, t, s)
+
+    _a, _ia = 0.1, 10.0  # log-sum-exp smoothing factor and its reciprocal
+
+    def cell_wl(i):
+        edges = cedge_arr[i]
+        if len(edges) == 0:
+            return 0.0
+        s, t = src[edges], tgt[edges]
+        adx, ady = np.abs(px[s] - px[t]), np.abs(py[s] - py[t])
+        mx = np.maximum(adx, ady)  # subtracted before exp for stability
+        return (
+            _a * np.log(np.exp((adx - mx) * _ia) + np.exp((ady - mx) * _ia))
+            + mx
+        ).sum()
+
+    is_macro = h > 1.5  # cells taller than 1.5 are treated as macros
+    use_grid = N > 300
+
+    if use_grid:
+        max_std_hw = hw[~is_macro].max() if (~is_macro).any() else 1.0
+        gs = max(max_std_hw * 4, 2.0)
+        sgrid = _SpatialGrid(pos, is_macro, gs)
+        sr_std = 2
+        sr_macro = (
+            int(np.ceil((hw[is_macro].max() + max_std_hw) / gs)) + 1
+            if is_macro.any()
+            else 2
+        )
+
+        def no_overlap(i, nx, ny):
+            sr = sr_macro if is_macro[i] else sr_std
+            return not sgrid.check_overlap(i, nx, ny, pos, hw, hh, sr)
+    else:
+        sgrid = None
+
+        def no_overlap(i, nx, ny):
+            ox = np.abs(nx - pos[:, 0])
+            oy = np.abs(ny - pos[:, 1])
+            ox[i] = 1e18  # exclude the cell itself from the test
+            return not np.any((ox < hw[i] + hw) & (oy < hh[i] + hh))
+
+    def apply_move(i, nx, ny):
+        ox, oy = pos[i, 0], pos[i, 1]
+        pos[i, 0] = nx
+        pos[i, 1] = ny
+        pi = pin_order[pin_starts[i]:pin_ends[i]]
+        if len(pi):
+            px[pi] = nx + prx[pi]
+            py[pi] = ny + pry[pi]
+        if sgrid is not None:
+            sgrid.update(i, ox, oy, nx, ny)
+
+    def undo_move(i, ox, oy, cx, cy):
+        pos[i, 0] = ox
+        pos[i, 1] = oy
+        pi = pin_order[pin_starts[i]:pin_ends[i]]
+        if len(pi):
+            px[pi] = ox + prx[pi]
+            py[pi] = oy + pry[pi]
+        if sgrid is not None:
+            sgrid.update(i, cx, cy, ox, oy)
+
+    def try_move(i, nx, ny, ow):
+        if not no_overlap(i, nx, ny):
+            return 0.0
+        ox, oy = pos[i, 0], pos[i, 1]
+        apply_move(i, nx, ny)
+        nw = cell_wl(i)
+        if nw < ow - 1e-8:
+            return ow - nw
+        undo_move(i, ox, oy, nx, ny)
+        return 0.0
+
+    def try_bisect(i, ddx, ddy, ow):
+        fx, fy = pos[i, 0] + ddx, pos[i, 1] + ddy
+        if no_overlap(i, fx, fy):
+            ox, oy = pos[i, 0], pos[i, 1]
+            apply_move(i, fx, fy)
+            nw = cell_wl(i)
+            if nw < ow - 1e-8:
+                return ow - nw
+            undo_move(i, ox, oy, fx, fy)
+            return 0.0
+        lo, hi_b = 0.0, 1.0
+        for _ in range(5):
+            mid = (lo + hi_b) / 2.0
+            if no_overlap(i, pos[i, 0] + mid * ddx, pos[i, 1] + mid * ddy):
+                lo = mid
+            else:
+                hi_b = mid
+        if lo < 0.02:  # lo is a multiple of 1/32, so no fraction was free
+            return 0.0
+        bx, by = pos[i, 0] + lo * ddx, pos[i, 1] + lo * ddy
+        ox, oy = pos[i, 0], pos[i, 1]
+        apply_move(i, bx, by)
+        nw = cell_wl(i)
+        if nw < ow - 1e-8:
+            return ow - nw
+        undo_move(i, ox, oy, bx, by)
+        return 0.0
+
+    def compute_grad(i):
+        p = inter_partners[i]
+        if p is None:
+            return 0.0, 0.0
+        ddx, ddy = pos[i, 0] - px[p], pos[i, 1] - py[p]
+        adx, ady = np.abs(ddx), np.abs(ddy)
+        mx = np.maximum(adx, ady)
+        ex = np.exp((adx - mx) * _ia)
+        ey = np.exp((ady - mx) * _ia)
+        d = ex + ey
+        sx = np.sign(ddx)
+        sx[sx == 0] = 1.0
+        sy = np.sign(ddy)
+        sy[sy == 0] = 1.0
+        return float(np.sum(ex / d * sx)), float(np.sum(ey / d * sy))
+
+    _fracs = (1.0, 0.5, 0.25, 0.125)  # step fractions, largest first
+
+    for pass_idx in range(max_passes):
+        total_imp = 0.0
+        order = (
+            np.arange(N) if pass_idx % 2 == 0
+            else np.arange(N - 1, -1, -1)
+        )
+
+        for i in order:
+            p = inter_partners[i]
+            if p is None:
+                continue
+            ow = cell_wl(i)
+            if ow < 1e-10:
+                continue
+
+            mx, my = np.mean(px[p]), np.mean(py[p])
+            dm, dn = mx - pos[i, 0], my - pos[i, 1]
+            gx, gy = compute_grad(i)
+            gn = max(np.sqrt(gx * gx + gy * gy), 1e-12)
+            ss = max(abs(dm), abs(dn), 1.0)
+
+            # Try gradient-based move
+            moved = False
+            for f in _fracs:
+                imp = try_move(
+                    i,
+                    pos[i, 0] - f * ss * gx / gn,
+                    pos[i, 1] - f * ss * gy / gn,
+                    ow,
+                )
+                if imp > 0:
+                    total_imp += imp
+                    moved = True
+                    break
+
+            # Try mean-based move
+            if not moved:
+                for f in _fracs:
+                    imp = try_move(
+                        i, pos[i, 0] + f * dm, pos[i, 1] + f * dn, ow
+                    )
+                    if imp > 0:
+                        total_imp += imp
+                        moved = True
+                        break
+
+            # Try bisection move
+            if not moved and max(abs(dm), abs(dn)) > 1e-6:
+                imp = try_bisect(i, dm, dn, ow)
+                if imp > 0:
+                    total_imp += imp
+                    moved = True
+
+            # Try axis-aligned moves (x first, then y; `moved` stays False)
+            if not moved:
+                cw = ow
+                for f in _fracs[:3]:
+                    imp = try_move(i, pos[i, 0] + f * dm, pos[i, 1], cw)
+                    if imp > 0:
+                        total_imp += imp
+                        cw -= imp
+                        break
+                for f in _fracs[:3]:
+                    imp = try_move(i, pos[i, 0], pos[i, 1] + f * dn, cw)
+                    if imp > 0:
+                        total_imp += imp
+                        break
+
+            # Try axis-aligned bisection (also runs after an axis-aligned move)
+            if not moved:
+                cw = cell_wl(i)
+                if abs(dm) > 1e-6:
+                    imp = try_bisect(i, dm, 0.0, cw)
+                    if imp > 0:
+                        total_imp += imp
+                        cw -= imp
+                if abs(dn) > 1e-6:
+                    imp = try_bisect(i, 0.0, dn, cw)
+                    if imp > 0:
+                        total_imp += imp
+
+        if total_imp < 1e-6:  # pass gained (almost) nothing: converged
+            break
+
+    result = cell_features.clone()
+    result[:, 2:4] = torch.from_numpy(pos).float()
+    return result
 
 
 def overlap_repulsion_loss(cell_features, pin_features, edge_list):
@@ -364,15 +1169,22 @@ def train_placement(
     cell_features,
     pin_features,
     edge_list,
-    num_epochs=1000,
-    lr=0.01,
+    num_epochs=2000,
+    lr=0.1,
     lambda_wirelength=1.0,
-    lambda_overlap=10.0,
+    lambda_overlap=200.0,
     verbose=True,
     log_interval=100,
 ):
     """Train the placement optimization using gradient descent.
 
+    Uses a multi-stage pipeline:
+    1. Analytical placement (spectral)
+    2. Row-based legalization
+    3. Gradient optimization with overlap penalty scheduling
+    4. Iterative overlap resolution
+    5. Detailed placement (slide + swap + slide)
+
     Args:
         cell_features: [N, 6] tensor with cell properties
         pin_features: [P, 7] tensor with pin properties
@@ -393,13 +1205,7 @@ def train_placement(
     # Clone features and create learnable positions
     cell_features = cell_features.clone()
     initial_cell_features = cell_features.clone()
-
-    # Make only cell positions require gradients
-    cell_positions = cell_features[:, 2:4].clone().detach()
-    cell_positions.requires_grad_(True)
-
-    # Create optimizer
-    optimizer = optim.Adam([cell_positions], lr=lr)
+    N = cell_features.shape[0]
 
     # Track loss history
     loss_history = {
@@ -408,52 +1214,267 @@ def train_placement(
         "overlap_loss": [],
     }
 
-    # Training loop
+    widths, heights = cell_features[:, 4], cell_features[:, 5]
+
+    # Fast path for very large designs
+    if N > 100000:
+        apos = _analytical_place(
+            cell_features, pin_features, edge_list, iters=200
+        )
+        cpos = _legalize_from_analytical(
+            cell_features, apos, pin_features, edge_list
+        )
+        result = cell_features.clone()
+        result[:, 2:4] = cpos
+        return {
+            "final_cell_features": result,
+            "initial_cell_features": initial_cell_features,
+            "loss_history": loss_history,
+        }
+
+    # Stage 1-2: Analytical placement + legalization
+    analytical_pos = _analytical_place(
+        cell_features, pin_features, edge_list, iters=200
+    )
+    cell_positions = _legalize_from_analytical(
+        cell_features, analytical_pos, pin_features, edge_list
+    )
+    cell_positions = cell_positions.clone().detach().requires_grad_(True)
+
+    # All hyperparameters smooth functions of log2(N)
+    log_n = max(np.log2(max(N, 4)), 2.0)
+    sqrt_n = max(np.sqrt(N), 1.0)
+    num_epochs = int(np.clip(25000 / log_n ** 1.5, 500, 2500))
+    lr = float(np.clip(1.2 / log_n, 0.06, 0.25))
+    lambda_overlap = float(np.clip(40 * log_n, 150, 400))
+    _ol_start = float(np.clip(0.0002 * log_n, 0.0005, 0.01))
+    _ol_end_mult = float(np.clip(0.5 * log_n, 1.5, 8.0))
+    _ol_ratio = (_ol_end_mult * lambda_overlap) / _ol_start
+
+    # Create optimizer with warmup + cosine schedule
+    warmup = max(10, num_epochs // 12)
+    optimizer = optim.Adam([cell_positions], lr=lr, betas=(0.9, 0.999))
+
+    def lr_sched(epoch):
+        if epoch < warmup:
+            return 0.1 + 0.9 * (epoch / warmup)
+        p = (epoch - warmup) / max(num_epochs - warmup - 1, 1)
+        return 0.01 + 0.99 * 0.5 * (1.0 + np.cos(np.pi * p))
+
+    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_sched)
+
+    # Precompute indices for loss computation
+    _ci = pin_features[:, 0].long()
+    _prx, _pry = pin_features[:, 1], pin_features[:, 2]
+    _ne = edge_list.shape[0]
+    _he = _ne > 0
+    if _he:
+        _src, _tgt = edge_list[:, 0].long(), edge_list[:, 1].long()
+    _alpha, _inv = 0.1, 10.0
+
+    # Precompute overlap structures based on design size
+    _pw = N <= 500
+    if _pw:
+        _hw = (widths.unsqueeze(1) + widths.unsqueeze(0)) / 2
+        _hh = (heights.unsqueeze(1) + heights.unsqueeze(0)) / 2
+        _tr, _tc = torch.triu_indices(N, N, offset=1)
+    else:
+        _im = heights > 1.5
+        _mi = torch.where(_im)[0]
+        _si = torch.where(~_im)[0]
+        _Nm, _Ns = _mi.shape[0], _si.shape[0]
+
+        _hmm = _Nm > 1
+        if _hmm:
+            mw, mh = widths[_mi], heights[_mi]
+            _mmhw = (mw.unsqueeze(1) + mw.unsqueeze(0)) / 2
+            _mmhh = (mh.unsqueeze(1) + mh.unsqueeze(0)) / 2
+            _mmr, _mmc = torch.triu_indices(_Nm, _Nm, offset=1)
+
+        _hms = _Nm > 0 and _Ns > 0
+        if _hms:
+            _mshw = (
+                (widths[_mi].unsqueeze(1) + widths[_si].unsqueeze(0)) / 2
+            )
+            _mshh = (
+                (heights[_mi].unsqueeze(1) + heights[_si].unsqueeze(0)) / 2
+            )
+
+        _hss = _Ns > 1
+        if _hss:
+            _sw, _sh = widths[_si], heights[_si]
+            _msw = _sw.max().item()
+            _K = min(max(30, int(np.sqrt(_Ns) * 1.2)), _Ns - 1)
+
+    inv_N = 1.0 / N
+    _bw, _bp, _zs = float('inf'), None, 0
+
+    # Stage 3: Gradient optimization with overlap penalty scheduling
     for epoch in range(num_epochs):
         optimizer.zero_grad()
 
-        # Create cell_features with current positions
-        cell_features_current = cell_features.clone()
-        cell_features_current[:, 2:4] = cell_positions
+        # Wirelength loss
+        if _he:
+            pax = cell_positions[_ci, 0] + _prx
+            pay = cell_positions[_ci, 1] + _pry
+            dx = torch.abs(pax[_src] - pax[_tgt])
+            dy = torch.abs(pay[_src] - pay[_tgt])
+            wl = (
+                _alpha * torch.logaddexp(dx * _inv, dy * _inv).sum()
+            ) / _ne
+        else:
+            wl = torch.tensor(0.0, requires_grad=True)
 
-        # Calculate losses
-        wl_loss = wirelength_attraction_loss(
-            cell_features_current, pin_features, edge_list
-        )
-        overlap_loss = overlap_repulsion_loss(
-            cell_features_current, pin_features, edge_list
-        )
+        # Overlap loss
+        if _pw:
+            pdx = torch.abs(
+                cell_positions[:, 0].unsqueeze(1)
+                - cell_positions[:, 0].unsqueeze(0)
+            )
+            pdy = torch.abs(
+                cell_positions[:, 1].unsqueeze(1)
+                - cell_positions[:, 1].unsqueeze(0)
+            )
+            ov = (
+                torch.relu(_hw - pdx) * torch.relu(_hh - pdy)
+            )[_tr, _tc]
+            ol = (ov.sum() + ov.pow(3).sum()) * inv_N
+        else:
+            ol = torch.tensor(0.0)
 
-        # Combined loss
-        total_loss = lambda_wirelength * wl_loss + lambda_overlap * overlap_loss
+            # Macro-macro overlaps
+            if _hmm:
+                mp = cell_positions[_mi]
+                mdx = torch.abs(
+                    mp[:, 0].unsqueeze(1) - mp[:, 0].unsqueeze(0)
+                )
+                mdy = torch.abs(
+                    mp[:, 1].unsqueeze(1) - mp[:, 1].unsqueeze(0)
+                )
+                ov = (
+                    torch.relu(_mmhw - mdx) * torch.relu(_mmhh - mdy)
+                )[_mmr, _mmc]
+                ol = ol + ov.sum() + ov.pow(3).sum()
+
+            # Macro-standard cell overlaps
+            if _hms:
+                mp, sp = cell_positions[_mi], cell_positions[_si]
+                oms = (
+                    torch.relu(
+                        _mshw
+                        - torch.abs(
+                            mp[:, 0].unsqueeze(1) - sp[:, 0].unsqueeze(0)
+                        )
+                    )
+                    * torch.relu(
+                        _mshh
+                        - torch.abs(
+                            mp[:, 1].unsqueeze(1) - sp[:, 1].unsqueeze(0)
+                        )
+                    )
+                )
+                ol = ol + oms.sum() + oms.pow(3).sum()
+
+            # Standard cell-standard cell overlaps (sweep-based)
+            if _hss:
+                sp = cell_positions[_si]
+                with torch.no_grad():
+                    order = torch.argsort(sp[:, 0])
+                sps, sws, shs = sp[order], _sw[order], _sh[order]
+                ss_s = ss_c = torch.tensor(0.0)
+                for k in range(1, _K + 1):
+                    n = _Ns - k
+                    dxk = sps[k:, 0] - sps[:n, 0]
+                    if dxk.detach().min().item() > _msw:
+                        break
+                    dyk = torch.abs(sps[k:, 1] - sps[:n, 1])
+                    ov = (
+                        torch.relu((sws[:n] + sws[k:]) * 0.5 - dxk)
+                        * torch.relu((shs[:n] + shs[k:]) * 0.5 - dyk)
+                    )
+                    ss_s = ss_s + ov.sum()
+                    ss_c = ss_c + ov.pow(3).sum()
+                ol = ol + ss_s + ss_c
+
+            ol = ol * inv_N
+
+        # Combined loss with scheduled weights
+        prog = epoch / max(num_epochs - 1, 1)
+        c_ol = _ol_start * (_ol_ratio ** prog)
+        c_wl = lambda_wirelength * (1.0 - 0.3 * prog)
+        loss = c_wl * wl + c_ol * ol
 
         # Backward pass
-        total_loss.backward()
+        loss.backward()
 
         # Gradient clipping to prevent extreme updates
-        torch.nn.utils.clip_grad_norm_([cell_positions], max_norm=5.0)
+        torch.nn.utils.clip_grad_norm_([cell_positions], max_norm=10.0)
 
         # Update positions
         optimizer.step()
+        scheduler.step()
 
         # Record losses
-        loss_history["total_loss"].append(total_loss.item())
-        loss_history["wirelength_loss"].append(wl_loss.item())
-        loss_history["overlap_loss"].append(overlap_loss.item())
+        loss_history["total_loss"].append(loss.item())
+        loss_history["wirelength_loss"].append(wl.item())
+        loss_history["overlap_loss"].append(ol.item())
+
+        # Track best zero-overlap solution
+        ov_v, wl_v = ol.item(), wl.item()
+        if ov_v < 1e-4 and wl_v < _bw:
+            _bw = wl_v
+            _bp = cell_positions.detach().clone()
+        if ov_v < 1e-6:
+            _zs += 1
+        else:
+            _zs = 0
+
+        # Early stopping if converged
+        if _zs >= 100 and epoch > num_epochs * 2 // 3:
+            wh = loss_history["wirelength_loss"]
+            if (
+                abs(np.mean(wh[-100:-50]) - np.mean(wh[-50:]))
+                / max(abs(np.mean(wh[-100:-50])), 1e-10)
+                < 0.001
+            ):
+                break
 
         # Log progress
         if verbose and (epoch % log_interval == 0 or epoch == num_epochs - 1):
-            print(f"Epoch {epoch}/{num_epochs}:")
-            print(f"  Total Loss: {total_loss.item():.6f}")
-            print(f"  Wirelength Loss: {wl_loss.item():.6f}")
-            print(f"  Overlap Loss: {overlap_loss.item():.6f}")
+            print(
+                f"Epoch {epoch}/{num_epochs}:"
+                f" Total={loss.item():.6f}"
+                f" WL={wl.item():.6f}"
+                f" OL={ol.item():.6f}"
+            )
 
     # Create final cell features
-    final_cell_features = cell_features.clone()
-    final_cell_features[:, 2:4] = cell_positions.detach()
+    final = cell_features.clone()
+    final[:, 2:4] = (
+        _bp if _bp is not None and _bw < wl.item()
+        else cell_positions.detach()
+    )
+
+    # Stage 4: Overlap resolution
+    final = _resolve_overlaps(
+        final, max_iters=int(np.clip(800 + N * 0.3, 800, 2000))
+    )
+
+    # Stage 5: Detailed placement - slide/swap/slide with smooth pass counts
+    slide_passes = max(6, min(25, int(500 / sqrt_n)))
+    swap_passes = max(1, min(3, int(50 / sqrt_n)))
+    final = _slide_refine(
+        final, pin_features, edge_list, max_passes=slide_passes
+    )
+    final = _swap_refine(
+        final, pin_features, edge_list, max_passes=swap_passes
+    )
+    final = _slide_refine(
+        final, pin_features, edge_list, max_passes=max(3, slide_passes // 3)
+    )
 
     return {
-        "final_cell_features": final_cell_features,
+        "final_cell_features": final,
         "initial_cell_features": initial_cell_features,
         "loss_history": loss_history,
     }
@@ -461,6 +1482,52 @@ def train_placement(
 
 # ======= FINAL EVALUATION CODE (Don't edit this part) =======
 
+def _sweep_overlap_pairs(positions, widths, heights):
+    """Find all overlapping cell pairs with an x-sorted sweep.
+
+    Cells are sorted by x and pairs k ranks apart are tested vectorized for
+    k = 1, 2, ... until no pair can still touch in x; worst case is O(N^2).
+
+    Args:
+        positions: [N, 2] array of cell positions
+        widths: [N] array of cell widths
+        heights: [N] array of cell heights
+
+    Returns:
+        Tuple of (pairs, areas):
+            - pairs: list of (i, j) tuples (i < j) of overlapping cell indices
+            - areas: list of overlap areas for each pair
+    """
+    N = len(positions)
+    if N < 2:  # nothing to compare; also avoids .max() on an empty array
+        return [], []
+    si = np.argsort(positions[:, 0])
+    sp, sw, sh = positions[si], widths[si], heights[si]
+    md = widths.max()  # x-overlap needs dx < (w_i + w_j) / 2 <= max width
+    pairs, areas = [], []
+
+    for k in range(1, N):
+        n = N - k
+        dx = sp[k:, 0] - sp[:n, 0]
+        if dx.min() > md:  # min gap only grows with k, so stop for good
+            break
+        ox = (sw[:n] + sw[k:]) / 2 - dx
+        xc = ox > 0
+        if not xc.any():
+            continue
+        ady = np.abs(sp[:n, 1] - sp[k:, 1])
+        oy = (sh[:n] + sh[k:]) / 2 - ady
+        hit = xc & (oy > 0)
+        if not hit.any():
+            continue
+        for idx in np.where(hit)[0]:
+            i, j = sorted((int(si[idx]), int(si[idx + k])))
+            pairs.append((i, j))
+            areas.append(float(ox[idx] * oy[idx]))
+
+    return pairs, areas
+
+
 def calculate_overlap_metrics(cell_features):
     """Calculate ground truth overlap statistics (non-differentiable).
 
@@ -486,49 +1553,20 @@ def calculate_overlap_metrics(cell_features):
             "overlap_percentage": 0.0,
         }
 
-    # Extract cell properties
-    positions = cell_features[:, 2:4].detach().numpy()  # [N, 2]
-    widths = cell_features[:, 4].detach().numpy()  # [N]
-    heights = cell_features[:, 5].detach().numpy()  # [N]
-    areas = cell_features[:, 0].detach().numpy()  # [N]
-
-    overlap_count = 0
-    total_overlap_area = 0.0
-    max_overlap_area = 0.0
-    overlap_areas = []
+    pos = cell_features[:, 2:4].detach().numpy()
+    w = cell_features[:, 4].detach().numpy()
+    h = cell_features[:, 5].detach().numpy()
+    a = cell_features[:, 0].detach().numpy()
 
-    # Check all pairs
-    for i in range(N):
-        for j in range(i + 1, N):
-            # Calculate center-to-center distances
-            dx = abs(positions[i, 0] - positions[j, 0])
-            dy = abs(positions[i, 1] - positions[j, 1])
-
-            # Minimum separation for non-overlap
-            min_sep_x = (widths[i] + widths[j]) / 2
-            min_sep_y = (heights[i] + heights[j]) / 2
-
-            # Calculate overlap amounts
-            overlap_x = max(0, min_sep_x - dx)
-            overlap_y = max(0, min_sep_y - dy)
-
-            # Overlap occurs only if both x and y overlap
-            if overlap_x > 0 and overlap_y > 0:
-                overlap_area = overlap_x * overlap_y
-                overlap_count += 1
-                total_overlap_area += overlap_area
-                max_overlap_area = max(max_overlap_area, overlap_area)
-                overlap_areas.append(overlap_area)
-
-    # Calculate percentage of total area
-    total_area = sum(areas)
-    overlap_percentage = (overlap_count / N * 100) if total_area > 0 else 0.0
+    pairs, ov = _sweep_overlap_pairs(pos, w, h)
 
     return {
-        "overlap_count": overlap_count,
-        "total_overlap_area": total_overlap_area,
-        "max_overlap_area": max_overlap_area,
-        "overlap_percentage": overlap_percentage,
+        "overlap_count": len(pairs),
+        "total_overlap_area": sum(ov) if ov else 0.0,
+        "max_overlap_area": max(ov) if ov else 0.0,
+        "overlap_percentage": (
+            (len(pairs) / N * 100) if sum(a) > 0 else 0.0
+        ),
     }
 
 
@@ -547,34 +1585,18 @@ def calculate_cells_with_overlaps(cell_features):
     if N <= 1:
         return set()
 
-    # Extract cell properties
-    positions = cell_features[:, 2:4].detach().numpy()
-    widths = cell_features[:, 4].detach().numpy()
-    heights = cell_features[:, 5].detach().numpy()
-
-    cells_with_overlaps = set()
-
-    # Check all pairs
-    for i in range(N):
-        for j in range(i + 1, N):
-            # Calculate center-to-center distances
-            dx = abs(positions[i, 0] - positions[j, 0])
-            dy = abs(positions[i, 1] - positions[j, 1])
-
-            # Minimum separation for non-overlap
-            min_sep_x = (widths[i] + widths[j]) / 2
-            min_sep_y = (heights[i] + heights[j]) / 2
-
-            # Calculate overlap amounts
-            overlap_x = max(0, min_sep_x - dx)
-            overlap_y = max(0, min_sep_y - dy)
+    pairs, _ = _sweep_overlap_pairs(
+        cell_features[:, 2:4].detach().numpy(),
+        cell_features[:, 4].detach().numpy(),
+        cell_features[:, 5].detach().numpy(),
+    )
 
-            # Overlap occurs only if both x and y overlap
-            if overlap_x > 0 and overlap_y > 0:
-                cells_with_overlaps.add(i)
-                cells_with_overlaps.add(j)
+    cells = set()
+    for i, j in pairs:
+        cells.add(i)
+        cells.add(j)
 
-    return cells_with_overlaps
+    return cells
 
 
 def calculate_normalized_metrics(cell_features, pin_features, edge_list):
@@ -598,34 +1620,34 @@ def calculate_normalized_metrics(cell_features, pin_features, edge_list):
     N = cell_features.shape[0]
 
     # Calculate overlap metric: num cells with overlaps / total cells
-    cells_with_overlaps = calculate_cells_with_overlaps(cell_features)
-    num_cells_with_overlaps = len(cells_with_overlaps)
-    overlap_ratio = num_cells_with_overlaps / N if N > 0 else 0.0
+    num_ov = len(calculate_cells_with_overlaps(cell_features))
 
-    # Calculate wirelength metric: (wirelength / num nets) / sqrt(total area)
     if edge_list.shape[0] == 0:
-        normalized_wl = 0.0
-        num_nets = 0
-    else:
-        # Calculate total wirelength using the loss function (unnormalized)
-        wl_loss = wirelength_attraction_loss(cell_features, pin_features, edge_list)
-        total_wirelength = wl_loss.item() * edge_list.shape[0]  # Undo normalization
-
-        # Calculate total area
-        total_area = cell_features[:, 0].sum().item()
-
-        num_nets = edge_list.shape[0]
+        return {
+            "overlap_ratio": num_ov / N if N > 0 else 0.0,
+            "normalized_wl": 0.0,
+            "num_cells_with_overlaps": num_ov,
+            "total_cells": N,
+            "num_nets": 0,
+        }
 
-        # Normalize: (wirelength / net) / sqrt(area)
-        # This gives a dimensionless quality metric independent of design size
-        normalized_wl = (total_wirelength / num_nets) / (total_area ** 0.5) if total_area > 0 else 0.0
+    # Calculate wirelength metric: (wirelength / num nets) / sqrt(total area)
+    wl = wirelength_attraction_loss(cell_features, pin_features, edge_list)
+    ta = cell_features[:, 0].sum().item()
+    ne = edge_list.shape[0]
+
+    # Normalize: (wirelength / net) / sqrt(area)
+    # This gives a dimensionless quality metric independent of design size
+    normalized_wl = (
+        (wl.item() * ne / ne) / (ta ** 0.5) if ta > 0 else 0.0
+    )
 
     return {
-        "overlap_ratio": overlap_ratio,
+        "overlap_ratio": num_ov / N if N > 0 else 0.0,
         "normalized_wl": normalized_wl,
-        "num_cells_with_overlaps": num_cells_with_overlaps,
+        "num_cells_with_overlaps": num_ov,
         "total_cells": N,
-        "num_nets": num_nets,
+        "num_nets": ne,
     }
 
 
@@ -782,8 +1804,11 @@ def main():
     normalized_metrics = calculate_normalized_metrics(
         final_cell_features, pin_features, edge_list
     )
-    print(f"Overlap Ratio: {normalized_metrics['overlap_ratio']:.4f} "
-          f"({normalized_metrics['num_cells_with_overlaps']}/{normalized_metrics['total_cells']} cells)")
+    print(
+        f"Overlap Ratio: {normalized_metrics['overlap_ratio']:.4f} "
+        f"({normalized_metrics['num_cells_with_overlaps']}"
+        f"/{normalized_metrics['total_cells']} cells)"
+    )
     print(f"Normalized Wirelength: {normalized_metrics['normalized_wl']:.4f}")
 
     # Success check
@@ -793,11 +1818,20 @@ def main():
     if normalized_metrics["num_cells_with_overlaps"] == 0:
         print("✓ PASS: No overlapping cells!")
         print("✓ PASS: Overlap ratio is 0.0")
-        print("\nCongratulations! Your implementation successfully eliminated all overlaps.")
-        print(f"Your normalized wirelength: {normalized_metrics['normalized_wl']:.4f}")
+        print(
+            "\nCongratulations! Your implementation successfully"
+            " eliminated all overlaps."
+        )
+        print(
+            f"Your normalized wirelength:"
+            f" {normalized_metrics['normalized_wl']:.4f}"
+        )
     else:
         print("✗ FAIL: Overlaps still exist")
-        print(f"  Need to eliminate overlaps in {normalized_metrics['num_cells_with_overlaps']} cells")
+        print(
+            f"  Need to eliminate overlaps in"
+            f" {normalized_metrics['num_cells_with_overlaps']} cells"
+        )
         print("\nSuggestions:")
         print("  1. Check your overlap_repulsion_loss() implementation")
         print("  2. Change lambdas (try increasing lambda_overlap)")