diff --git a/README.md b/README.md
index 18aef43..c142a12 100644
--- a/README.md
+++ b/README.md
@@ -130,6 +130,15 @@ The `config` folder contains example configuration files for different architect
 4. **Tensor parallelism works automatically** because PyTorch Lightning handles the distributed setup and PyTorch's Distributed Tensor API shards your model across GPUs without requiring changes to your component code
 
 
+## Development
+
+Format and lint the codebase:
+
+```bash
+uv run ruff format .
+uv run ruff check --fix .
+```
+
 ## Testing
 
 Run the test suite to ensure everything is working correctly:
diff --git a/crosslayer_transcoder/data/__init__.py b/crosslayer_transcoder/data/__init__.py
index 52b22ec..81d524b 100644
--- a/crosslayer_transcoder/data/__init__.py
+++ b/crosslayer_transcoder/data/__init__.py
@@ -5,9 +5,6 @@
 with efficient multiprocessing data generation.
 """
 
-import torch
-import torch.multiprocessing as mp
-
 from .activation_sources import ActivationComputer, DiskActivationSource
 from .data_generator import DataGeneratorProcess
 
diff --git a/crosslayer_transcoder/data/activation_sources.py b/crosslayer_transcoder/data/activation_sources.py
index 3a6e9e2..4ef3650 100644
--- a/crosslayer_transcoder/data/activation_sources.py
+++ b/crosslayer_transcoder/data/activation_sources.py
@@ -79,8 +79,7 @@ def _extract_activations(self, model: Any, tokens: torch.Tensor, mask: torch.Ten
         """
         mlp_ins = []
         mlp_outs = []
-        with model.trace(tokens) as tracer:
-
+        with model.trace(tokens):
             # Extract from all transformer layers
             for i in range(self.n_layers):
                 # MLP input (after layer norm)
@@ -147,7 +146,7 @@ def _setup_file(self) -> None:
             # self.tensor_handle = self.file_handle[self.accessor]
             self.position = 0
         except Exception as e:
-            raise RuntimeError(f"Failed to open activation file {self.file_path}: {e}")
+            raise RuntimeError(f"Failed to open activation file {self.file_path}: {e}") from e
 
     def get_next_batch(self, batch_size: Optional[int] = None) -> torch.Tensor:
         """
diff --git a/crosslayer_transcoder/data/data_generator.py b/crosslayer_transcoder/data/data_generator.py
index 5bd5e2f..8e0fc38 100644
--- a/crosslayer_transcoder/data/data_generator.py
+++ b/crosslayer_transcoder/data/data_generator.py
@@ -8,12 +8,9 @@
 import os
 from typing import Optional
 
-import nnsight
 import torch
 from datasets import load_dataset
-from torch.utils.data import DataLoader
 
-from crosslayer_transcoder.data import text_dataset
 from crosslayer_transcoder.data.activation_sources import ActivationComputer, DiskActivationSource
 from crosslayer_transcoder.data.deployment_policy import DeploymentPolicy
 
@@ -205,7 +202,7 @@ def cleanup(self):
             for child in children:
                 try:
                     child.terminate()
-                except:
+                except Exception:
                     pass
         except ImportError:
             pass  # psutil not available
diff --git a/crosslayer_transcoder/data/deployment_policy.py b/crosslayer_transcoder/data/deployment_policy.py
index 3481343..99cb437 100644
--- a/crosslayer_transcoder/data/deployment_policy.py
+++ b/crosslayer_transcoder/data/deployment_policy.py
@@ -4,7 +4,6 @@
 """
 
 import logging
-import re
 from abc import ABC, abstractmethod
 from enum import Enum
 from typing import Any, Dict, List, Optional, Tuple
diff --git a/crosslayer_transcoder/data/generation_loop.py b/crosslayer_transcoder/data/generation_loop.py
index 021ec92..f5c6225 100644
--- a/crosslayer_transcoder/data/generation_loop.py
+++ b/crosslayer_transcoder/data/generation_loop.py
@@ -13,7 +13,6 @@
 from crosslayer_transcoder.data import text_dataset
 from crosslayer_transcoder.data.activation_sources import ActivationComputer, DiskActivationSource
 from crosslayer_transcoder.data.deployment_policy import (
-    BaseDeploymentPolicy,
     DeploymentPolicy,
     create_deployment_policy,
 )
@@ -138,7 +137,7 @@ def generation_loop(
             # Generate new activations
             gen_start = time.time()
             activations = self._generate_activations()
-            gen_time = time.time() - gen_start
+            time.time() - gen_start  # NOTE(review): result discarded; drop the timer or log it
 
             # Take only as many activations as we have indices
             num_indices = len(indices_to_refresh)
@@ -256,7 +255,7 @@ def refill_from_disk(self, batch_size: int = 40_000):
                 self.shared_buffer.set_activations(indices_to_fill, samples)
 
                 # Calculate refill rate for this batch
-                batch_time = time.time() - batch_start_time
+                time.time() - batch_start_time  # NOTE(review): dead expression; remove or log
                 batch_refilled = len(indices_to_fill)
                 total_refilled += batch_refilled
 
diff --git a/crosslayer_transcoder/data/process_monitor.py b/crosslayer_transcoder/data/process_monitor.py
index a84bd37..7738e5c 100644
--- a/crosslayer_transcoder/data/process_monitor.py
+++ b/crosslayer_transcoder/data/process_monitor.py
@@ -56,7 +56,7 @@ def update_dashboard(self, status: str, buffer_stats: Dict[str, Any], current_de
 
         # Format uptime with more stable display
         if uptime > 60:
-            uptime_str = f"{int(uptime/60)}:{int(uptime%60):02d}"
+            uptime_str = f"{int(uptime / 60)}:{int(uptime % 60):02d}"
         else:
             uptime_str = f"{int(uptime)}s"
 
diff --git a/crosslayer_transcoder/data/shared_memory.py b/crosslayer_transcoder/data/shared_memory.py
index f9405ec..58e617b 100644
--- a/crosslayer_transcoder/data/shared_memory.py
+++ b/crosslayer_transcoder/data/shared_memory.py
@@ -6,14 +6,11 @@
 import atexit
 import logging
 import multiprocessing as mp
-import threading
 import time
 from multiprocessing import shared_memory
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict
 
-import numpy as np
 import torch
-import torch.multiprocessing as torch_mp
 
 # No config imports needed in this module
 
diff --git a/crosslayer_transcoder/main.py b/crosslayer_transcoder/main.py
index 2afce89..d9e4c7f 100755
--- a/crosslayer_transcoder/main.py
+++ b/crosslayer_transcoder/main.py
@@ -17,9 +17,7 @@ class CrossLayerTranscoderCLI(LightningCLI):
     def add_arguments_to_parser(self, parser):
         """Add custom argument linking and configuration."""
         # Link model and data parameters that should be consistent
-        parser.link_arguments(
-            "data.init_args.n_layers", "model.init_args.nonlinearity.init_args.n_layers"
-        )
+        parser.link_arguments("data.init_args.n_layers", "model.init_args.nonlinearity.init_args.n_layers")
         parser.link_arguments("data.init_args.n_layers", "model.init_args.n_layers")
         parser.link_arguments(
             "model.init_args.d_features",
@@ -34,7 +32,7 @@ def main():
     os.environ.setdefault("WANDB_CACHE_DIR", f"{os.getcwd()}/wandb_cache")
 
     # Create CLI with subclass mode to support class_path configuration
-    cli = CrossLayerTranscoderCLI(
+    CrossLayerTranscoderCLI(
         model_class=L.LightningModule,  # Use base class for subclass mode
         datamodule_class=L.LightningDataModule,  # Use base class for subclass mode
         subclass_mode_model=True,  # Enable subclass mode for model
diff --git a/crosslayer_transcoder/metrics/replacement_model_accuracy.py b/crosslayer_transcoder/metrics/replacement_model_accuracy.py
index 2e11c01..d869d47 100644
--- a/crosslayer_transcoder/metrics/replacement_model_accuracy.py
+++ b/crosslayer_transcoder/metrics/replacement_model_accuracy.py
@@ -102,7 +102,7 @@ def update(self, clt, max_batches=20):
         with torch.no_grad():
             for i, (tokens, mask) in enumerate(self.loader):
                 torch.cuda.empty_cache()
-                print(f"computing replacement model", i)
+                print("computing replacement model", i)
                 tokens = self.handle_device(tokens)
                 mask = self.handle_device(mask)
                 if i >= max_batches:
diff --git a/crosslayer_transcoder/model/clt.py b/crosslayer_transcoder/model/clt.py
index cc58891..921f2ed 100644
--- a/crosslayer_transcoder/model/clt.py
+++ b/crosslayer_transcoder/model/clt.py
@@ -78,7 +78,9 @@ def decode(
             "from_layer to_layer -> batch_size to_layer d_acts",
         )
 
-    def forward(self, acts: Float[torch.Tensor, "batch_size n_layers d_acts"]) -> Tuple[
+    def forward(
+        self, acts: Float[torch.Tensor, "batch_size n_layers d_acts"]
+    ) -> Tuple[
         Float[torch.Tensor, "batch_size n_layers d_features"],
         Float[torch.Tensor, "batch_size n_layers d_features"],
         Float[torch.Tensor, "batch_size n_layers d_acts"],
@@ -158,12 +160,12 @@ def __init__(self, d_acts: int, d_features: int, n_layers: int):
         self.d_acts = d_acts
         self.d_features = d_features
         self.n_layers = n_layers
-        self.register_parameter(f"W", nn.Parameter(torch.empty((n_layers, d_features, d_acts))))
+        self.register_parameter("W", nn.Parameter(torch.empty((n_layers, d_features, d_acts))))
         self.reset_parameters()
 
     def reset_parameters(self):
         dec_uniform_thresh = 1 / ((self.d_acts * self.n_layers) ** 0.5)
-        self.get_parameter(f"W").data.uniform_(-dec_uniform_thresh, dec_uniform_thresh)
+        self.get_parameter("W").data.uniform_(-dec_uniform_thresh, dec_uniform_thresh)
 
     @torch.no_grad()
     def forward_layer(
@@ -173,7 +175,7 @@ def forward_layer(
             features = features[:, :, layer, :]
         return einsum(
             features,
-            self.get_parameter(f"W")[layer],
+            self.get_parameter("W")[layer],
             "batch_size seq d_features, d_features d_acts -> batch_size seq d_acts",
         )
 
@@ -199,7 +201,7 @@ def __init__(self, d_acts: int, d_features: int, n_layers: int):
         self.n_layers = n_layers
         for i in range(n_layers):
             self.register_parameter(f"W_{i}", nn.Parameter(torch.empty((i + 1, d_features, d_acts))))
-        self.register_parameter(f"b", nn.Parameter(torch.empty((n_layers, d_acts))))
+        self.register_parameter("b", nn.Parameter(torch.empty((n_layers, d_acts))))
         self.reset_parameters()
 
     def reset_parameters(self):
@@ -273,7 +275,9 @@ def initialize_standardizers(self, batch: Float[torch.Tensor, "batch_size io n_l
         self.input_standardizer.initialize_from_batch(batch)
         self.output_standardizer.initialize_from_batch(batch)
 
-    def forward(self, acts: Float[torch.Tensor, "batch_size n_layers d_acts"]) -> Tuple[
+    def forward(
+        self, acts: Float[torch.Tensor, "batch_size n_layers d_acts"]
+    ) -> Tuple[
         Float[torch.Tensor, "batch_size n_layers d_features"],  # pre_actvs
         Float[torch.Tensor, "batch_size n_layers d_features"],  # features
         Float[torch.Tensor, "batch_size n_layers d_acts"],  # recons_norm
diff --git a/crosslayer_transcoder/model/clt_lightning.py b/crosslayer_transcoder/model/clt_lightning.py
index 39a874a..17305ea 100644
--- a/crosslayer_transcoder/model/clt_lightning.py
+++ b/crosslayer_transcoder/model/clt_lightning.py
@@ -1,18 +1,12 @@
 import gc
-import os
-import subprocess
-import time
 from typing import Optional, Tuple
 
 import lightning as L
-import psutil
 import torch
-import torch.nn as nn
-from einops import einsum
+import wandb
 from jaxtyping import Float
 from torch.distributed.tensor.parallel import parallelize_module
 
-import wandb
 from crosslayer_transcoder.metrics.dead_features import DeadFeatures
 from crosslayer_transcoder.metrics.replacement_model_accuracy import (
     ReplacementModelAccuracy,
@@ -22,8 +16,6 @@
     CrossLayerTranscoder,
     Decoder,
 )
-from crosslayer_transcoder.model.jumprelu import JumpReLU
-from crosslayer_transcoder.model.topk import BatchTopK
 
 
 class CrossLayerTranscoderModule(L.LightningModule):
@@ -55,9 +47,7 @@ def __init__(
     ):
         super().__init__(*args, **kwargs)
 
-        self.save_hyperparameters(
-            ignore=["model", "replacement_model", "dead_features"]
-        )
+        self.save_hyperparameters(ignore=["model", "replacement_model", "dead_features"])
         # torch.cuda.memory._record_memory_history(max_entries=100_000)
 
         # Store pre-constructed modules
@@ -143,9 +133,7 @@ def log_training_metrics(self, features, recons_norm, recons, mlp_out, batch_idx
 
         ss_err = (mlp_out_norm - recons_norm) ** 2
         ss_err = ss_err.sum(dim=0)
-        ss_total = ((mlp_out_norm - mlp_out_norm.mean(dim=0, keepdim=True)) ** 2).sum(
-            dim=0
-        )
+        ss_total = ((mlp_out_norm - mlp_out_norm.mean(dim=0, keepdim=True)) ** 2).sum(dim=0)
         fvu = (ss_err / ss_total).mean()  # (n_layers, d_model)
         self.log("metrics/fraction_of_variance_unexplained", fvu)
         fvu_per_layer = (ss_err / ss_total).mean(dim=-1)
@@ -189,8 +177,7 @@ def log_training_metrics(self, features, recons_norm, recons, mlp_out, batch_idx
         self.log("metrics/recons_standardized_std", recons_norm.std())
         self.log(
             "metrics/L0_avg_per_layer",
-            torch.count_nonzero(active_features)
-            / (features.shape[0] * features.shape[1]),
+            torch.count_nonzero(active_features) / (features.shape[0] * features.shape[1]),
         )
 
         # Magnitude of feature activations - memory efficient version
@@ -216,9 +203,7 @@ def log_training_metrics(self, features, recons_norm, recons, mlp_out, batch_idx
 
         # Log L0 table per layer
         if batch_idx % 500 == 1:
-            l0_per_layer = (
-                torch.count_nonzero(active_features, dim=(0, 2)) / features.shape[0]
-            )
+            l0_per_layer = torch.count_nonzero(active_features, dim=(0, 2)) / features.shape[0]
 
             if self.logger and isinstance(self.logger.experiment, wandb.wandb_run.Run):
                 table = wandb.Table(
@@ -226,11 +211,7 @@ def log_training_metrics(self, features, recons_norm, recons, mlp_out, batch_idx
                     columns=["layer", "L0"],
                 )
                 self.logger.experiment.log(
-                    {
-                        "layers/L0_per_layer": wandb.plot.bar(
-                            table, "layer", "L0", title="L0 per Layer"
-                        )
-                    },
+                    {"layers/L0_per_layer": wandb.plot.bar(table, "layer", "L0", title="L0 per Layer")},
                     step=self.global_step,
                 )
 
@@ -259,15 +240,11 @@ def log_training_metrics(self, features, recons_norm, recons, mlp_out, batch_idx
                 ):
                     for layer in range(dead_log_freqs.shape[0]):
                         self.logger.experiment.log(
-                            {
-                                f"layers/log_feature_density/layer_{layer}": dead_log_freqs[
-                                    layer
-                                ]
-                            },
+                            {f"layers/log_feature_density/layer_{layer}": dead_log_freqs[layer]},
                             step=self.global_step,
                         )
                     self.logger.experiment.log(
-                        {f"training/log_feature_density": dead_log_freqs.flatten()},
+                        {"training/log_feature_density": dead_log_freqs.flatten()},
                         step=self.global_step,
                     )
                 self.log("training/log_feature_density_mean", dead_log_freqs.mean())
@@ -433,9 +410,7 @@ def training_step(self, batch, batch_idx):
         if isinstance(self.model.decoder, CrosslayerDecoder):
             dec_norms = torch.zeros_like(features[:1])
             for l in range(self.model.decoder.n_layers):
-                W = self.model.decoder.get_parameter(
-                    f"W_{l}"
-                )  # (from_layer, d_features, d_acts)
+                W = self.model.decoder.get_parameter(f"W_{l}")  # (from_layer, d_features, d_acts)
                 dec_norms[:, : l + 1] = dec_norms[:, : l + 1] + (W**2).sum(dim=-1)
             dec_norms = dec_norms.sqrt()
 
@@ -443,17 +418,11 @@ def training_step(self, batch, batch_idx):
             dec_norms = torch.sqrt((self.model.decoder.W**2).sum(dim=-1))
 
         weighted_features = features * dec_norms * self.c
-        self.log(
-            "model/weighted_features_mean", weighted_features.detach().mean().cpu()
-        )
+        self.log("model/weighted_features_mean", weighted_features.detach().mean().cpu())
 
         if self.use_tanh:
-            weighted_features = torch.tanh(
-                weighted_features
-            )  # (batch_size, n_layers, d_features)
-        sparsity = (
-            self.current_sparsity_penalty() * weighted_features.sum(dim=[1, 2]).mean()
-        )
+            weighted_features = torch.tanh(weighted_features)  # (batch_size, n_layers, d_features)
+        sparsity = self.current_sparsity_penalty() * weighted_features.sum(dim=[1, 2]).mean()
         self.log("training/sparsity_loss", sparsity)
 
         # Compute Pre-activation Loss
diff --git a/crosslayer_transcoder/model/parallel.py b/crosslayer_transcoder/model/parallel.py
index 0dc9777..f2f8ac1 100644
--- a/crosslayer_transcoder/model/parallel.py
+++ b/crosslayer_transcoder/model/parallel.py
@@ -1,43 +1,25 @@
 # mypy: allow-untyped-defs
 # Copyright (c) Meta Platforms, Inc. and affiliates
-from abc import ABC, abstractmethod
-from collections.abc import Sequence
-from dataclasses import dataclass
-from functools import cached_property, partial
-from typing import Any, Optional, Tuple, Union, cast
+from functools import partial
+from typing import Optional, cast
 
-import lightning as L
 import torch
 import torch.nn as nn
-from einops import einsum, rearrange
-from jaxtyping import Float
-from torch.distributed._tensor.placement_types import Partial, Replicate, Shard
 from torch.distributed.device_mesh import DeviceMesh
 from torch.distributed.tensor import (
-    DeviceMesh,
     DTensor,
-    Replicate,
-    Shard,
     distribute_module,
     distribute_tensor,
 )
 from torch.distributed.tensor._dtensor_spec import DTensorSpec
-from torch.distributed.tensor._op_schema import OpSchema  # OpSpec,
 from torch.distributed.tensor._op_schema import (
     OpStrategy,
     PlacementStrategy,
-    RuntimeSchemaInfo,
-    StrategyType,
-    _is_inplace_op,
 )
-from torch.distributed.tensor._ops.utils import is_tensor_dim_sharded, normalize_dim, register_op_strategy
-from torch.distributed.tensor.parallel import RowwiseParallel, parallelize_module
+from torch.distributed.tensor._ops.utils import register_op_strategy
 from torch.distributed.tensor.parallel.style import ParallelStyle
 from torch.distributed.tensor.placement_types import Placement, Replicate, Shard
-from torch.nn.modules.activation import ReLU
 
-import wandb
-from crosslayer_transcoder.metrics.replacement_model_accuracy import ReplacementModelAccuracy
 from crosslayer_transcoder.model.jumprelu import JumpReLU
 
 aten = torch.ops.aten  # convenience alias
@@ -73,7 +55,7 @@ def select_int_strategy(op_schema):
 
                 if shard_dim == selected_dim:
                     raise NotImplementedError(
-                        "Selecting along a dimension that is sharded " "is not supported on PyTorch-2.7.1."
+                        "Selecting along a dimension that is sharded is not supported on PyTorch-2.7.1."
                     )
                 # shift shard index left if we dropped an earlier dimension
                 if shard_dim > selected_dim:
diff --git a/crosslayer_transcoder/model/standardize.py b/crosslayer_transcoder/model/standardize.py
index 45b1b63..d5e0fc8 100644
--- a/crosslayer_transcoder/model/standardize.py
+++ b/crosslayer_transcoder/model/standardize.py
@@ -1,6 +1,6 @@
-from einops import einsum
 import torch
 import torch.nn as nn
+from einops import einsum
 from jaxtyping import Float
 
 
@@ -8,19 +8,13 @@ class Standardizer(nn.Module):
     def __init__(self, **kwargs):
         super().__init__()
 
-    def initialize_from_batch(
-        self, batch: Float[torch.Tensor, "batch_size io n_layers actv_dim"]
-    ):
+    def initialize_from_batch(self, batch: Float[torch.Tensor, "batch_size io n_layers actv_dim"]):
         pass
 
-    def forward(
-        self, batch: Float[torch.Tensor, "batch_size n_layers actv_dim"], layer="all"
-    ):
+    def forward(self, batch: Float[torch.Tensor, "batch_size n_layers actv_dim"], layer="all"):
         return batch
 
-    def standardize(
-        self, batch: Float[torch.Tensor, "batch_size n_layers actv_dim"], layer="all"
-    ):
+    def standardize(self, batch: Float[torch.Tensor, "batch_size n_layers actv_dim"], layer="all"):
         return batch
 
 
@@ -32,9 +26,7 @@ def __init__(self, n_layers, activation_dim):
         self.is_initialized = False
 
     @torch.no_grad()
-    def initialize_from_batch(
-        self, batch: Float[torch.Tensor, "batch_size io n_layers actv_dim"]
-    ):
+    def initialize_from_batch(self, batch: Float[torch.Tensor, "batch_size io n_layers actv_dim"]):
         batch = batch[:, 0]
         self.mean.data = batch.mean(dim=0)
         self.std.data = batch.std(dim=0)
@@ -80,9 +72,7 @@ def __init__(self, n_layers, activation_dim):
         self.register_buffer("std", torch.empty(n_layers, activation_dim))
         self.is_initialized = False
 
-    def initialize_from_batch(
-        self, batch: Float[torch.Tensor, "batch_size io n_layers actv_dim"]
-    ):
+    def initialize_from_batch(self, batch: Float[torch.Tensor, "batch_size io n_layers actv_dim"]):
         self.mean.data = batch[:, 1].mean(dim=0)
         self.std.data = batch[:, 1].std(dim=0)
         self.std.data.clamp_(min=1e-8)
@@ -137,9 +127,7 @@ def __init__(self):
         super().__init__()
 
     @torch.no_grad()
-    def initialize_from_batch(
-        self, batch: Float[torch.Tensor, "batch_size io n_layers actv_dim"]
-    ):
+    def initialize_from_batch(self, batch: Float[torch.Tensor, "batch_size io n_layers actv_dim"]):
         pass
 
     def forward(
@@ -162,9 +150,7 @@ def __init__(self, n_layers, n_exclude: int = 0):
         self.is_initialized = False
 
     @torch.no_grad()
-    def initialize_from_batch(
-        self, batch: Float[torch.Tensor, "batch_size io n_layers actv_dim"]
-    ):
+    def initialize_from_batch(self, batch: Float[torch.Tensor, "batch_size io n_layers actv_dim"]):
         batch = batch[:, 0]
         # exclude the n_exclude largest and smallest values
         topk = batch.topk(self.n_exclude, dim=-1, sorted=False)
@@ -198,9 +184,7 @@ def __init__(self, n_layers, n_exclude: int = 0):
         self.n_exclude = n_exclude
         self.is_initialized = False
 
-    def initialize_from_batch(
-        self, batch: Float[torch.Tensor, "batch_size io n_layers actv_dim"]
-    ):
+    def initialize_from_batch(self, batch: Float[torch.Tensor, "batch_size io n_layers actv_dim"]):
         batch = batch[:, 1]
         # exclude the n_exclude largest and smallest values
         topk = batch.topk(self.n_exclude, dim=-1, sorted=False)
@@ -233,6 +217,4 @@ def standardize(
         if layer == "all":
             return (mlp_out - self.mean[None, :, None]) / self.std[None, :, None]
         else:
-            return (mlp_out - self.mean[None, layer, None]) / self.std[
-                None, layer, None
-            ]
+            return (mlp_out - self.mean[None, layer, None]) / self.std[None, layer, None]
diff --git a/crosslayer_transcoder/utils/buffer.py b/crosslayer_transcoder/utils/buffer.py
index b60a0e8..f6bfd66 100644
--- a/crosslayer_transcoder/utils/buffer.py
+++ b/crosslayer_transcoder/utils/buffer.py
@@ -1,6 +1,6 @@
+import h5py
 import torch
 from torch.utils.data import Dataset
-import h5py
 
 
 class DiscBuffer(Dataset):
@@ -33,4 +33,4 @@ def __getitem__(self, idx):
     def close(self):
         # Close the HDF5 file when done
         if self.h5:
-            self.h5.close()
\ No newline at end of file
+            self.h5.close()
diff --git a/crosslayer_transcoder/utils/convert_activations_to_float16.py b/crosslayer_transcoder/utils/convert_activations_to_float16.py
index deebad7..1a97daa 100755
--- a/crosslayer_transcoder/utils/convert_activations_to_float16.py
+++ b/crosslayer_transcoder/utils/convert_activations_to_float16.py
@@ -65,7 +65,7 @@ def convert_h5_float32_to_float16(
         new_size_gb = dataset.size * 2 / (1024**3)  # float16 = 2 bytes
         print(f"Original size: {original_size_gb:.2f} GB")
         print(f"New size: {new_size_gb:.2f} GB")
-        print(f"Size reduction: {(1 - new_size_gb/original_size_gb)*100:.1f}%")
+        print(f"Size reduction: {(1 - new_size_gb / original_size_gb) * 100:.1f}%")
 
         # Create output file
         with h5py.File(output_path, "w") as f_out:
@@ -81,7 +81,6 @@ def convert_h5_float32_to_float16(
 
             for start_idx in range(0, total_samples, chunk_size):
                 end_idx = min(start_idx + chunk_size, total_samples)
-                current_chunk_size = end_idx - start_idx
 
                 # Read chunk from input
                 chunk_data = dataset[start_idx:end_idx]
@@ -125,7 +124,7 @@ def convert_h5_float32_to_float16(
     total_time = time.time() - start_time
     final_rate = total_samples / total_time
 
-    print(f"\n✅ Conversion complete!")
+    print("\n✅ Conversion complete!")
     print(f"Total time: {total_time:.1f}s")
     print(f"Average rate: {final_rate:,.0f} samples/s")
     print(f"Output file: {output_path}")
@@ -143,13 +142,9 @@ def main():
     """Main function for command line usage."""
     import argparse
 
-    parser = argparse.ArgumentParser(
-        description="Convert HDF5 activations from float32 to float16"
-    )
+    parser = argparse.ArgumentParser(description="Convert HDF5 activations from float32 to float16")
     parser.add_argument("input_file", help="Path to input HDF5 file (float32)")
-    parser.add_argument(
-        "-o", "--output", help="Path to output HDF5 file (default: input_fp16.h5)"
-    )
+    parser.add_argument("-o", "--output", help="Path to output HDF5 file (default: input_fp16.h5)")
     parser.add_argument(
         "-c",
         "--chunk-size",
diff --git a/crosslayer_transcoder/utils/debug.ipynb b/crosslayer_transcoder/utils/debug.ipynb
index d6939d4..e14cca1 100644
--- a/crosslayer_transcoder/utils/debug.ipynb
+++ b/crosslayer_transcoder/utils/debug.ipynb
@@ -28,7 +28,7 @@
     }
    ],
    "source": [
-    "t = torch.load('../lifetime_active.pt').cpu()\n",
+    "t = torch.load(\"../lifetime_active.pt\").cpu()\n",
     "t.shape"
    ]
   },
@@ -86,7 +86,7 @@
    ],
    "source": [
     "t1 = t2\n",
-    "t2 = torch.load('../lifetime_active.pt').cpu()\n",
+    "t2 = torch.load(\"../lifetime_active.pt\").cpu()\n",
     "plt.figure(figsize=(20, 5))\n",
     "plt.hist((t2 - t1).flatten(), bins=100, range=(0, 100))\n",
     "plt.show()"
diff --git a/crosslayer_transcoder/utils/shuffle_activations.py b/crosslayer_transcoder/utils/shuffle_activations.py
index 8ae963c..43dcc49 100644
--- a/crosslayer_transcoder/utils/shuffle_activations.py
+++ b/crosslayer_transcoder/utils/shuffle_activations.py
@@ -9,7 +9,6 @@
 
 import h5py
 import numpy as np
-from tqdm import tqdm
 
 
 def shuffle_activations(input_path, output_path, seed=42):
@@ -54,7 +53,7 @@ def shuffle_activations(input_path, output_path, seed=42):
     print("Writing shuffled data...")
     with h5py.File(output_path, "w") as f_out:
         # Create dataset and write entire tensor at once
-        shuffled_tensor = f_out.create_dataset(
+        f_out.create_dataset(
             "tensor",
             data=data,  # Write all data at once
             chunks=True,  # Enable chunking for better I/O
@@ -81,7 +80,6 @@ def verify_shuffle(original_path, shuffled_path, num_samples=100):
         h5py.File(original_path, "r") as f_orig,
         h5py.File(shuffled_path, "r") as f_shuf,
     ):
-
         orig_tensor = f_orig["tensor"]
         shuf_tensor = f_shuf["tensor"]
 
@@ -91,16 +89,14 @@ def verify_shuffle(original_path, shuffled_path, num_samples=100):
         batch_size = orig_tensor.shape[0]
 
         # Sample indices for verification
-        test_indices = np.random.choice(
-            batch_size, min(num_samples, batch_size), replace=False
-        )
+        test_indices = np.random.choice(batch_size, min(num_samples, batch_size), replace=False)
 
         # Read only the samples we need for verification - much more efficient
         orig_samples = orig_tensor[test_indices]
         shuf_samples = shuf_tensor[test_indices]
 
         differences_found = 0
-        for i, idx in enumerate(test_indices):
+        for i, _idx in enumerate(test_indices):
             if not np.array_equal(orig_samples[i], shuf_samples[i]):
                 differences_found += 1
 
@@ -112,9 +108,7 @@ def verify_shuffle(original_path, shuffled_path, num_samples=100):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Shuffle activations along batch dimension"
-    )
+    parser = argparse.ArgumentParser(description="Shuffle activations along batch dimension")
     parser.add_argument(
         "--input",
         default="/var/local/glang/activations/clt-activations-10M.h5",
@@ -125,12 +119,8 @@ def verify_shuffle(original_path, shuffled_path, num_samples=100):
         default="/var/local/glang/activations/clt-activations-10M-shuffled.h5",
         help="Output HDF5 file path",
     )
-    parser.add_argument(
-        "--seed", type=int, default=42, help="Random seed for reproducible shuffling"
-    )
-    parser.add_argument(
-        "--verify", action="store_true", help="Verify the shuffle after completion"
-    )
+    parser.add_argument("--seed", type=int, default=42, help="Random seed for reproducible shuffling")
+    parser.add_argument("--verify", action="store_true", help="Verify the shuffle after completion")
 
     args = parser.parse_args()
 
diff --git a/crosslayer_transcoder/utils/store_actvs benchmark.ipynb b/crosslayer_transcoder/utils/store_actvs benchmark.ipynb
index 4206fc2..f554d6b 100644
--- a/crosslayer_transcoder/utils/store_actvs benchmark.ipynb	
+++ b/crosslayer_transcoder/utils/store_actvs benchmark.ipynb	
@@ -24,13 +24,14 @@
     }
    ],
    "source": [
-    "import torch\n",
-    "import nnsight\n",
-    "import datasets\n",
+    "import os\n",
+    "\n",
     "import activation_server.text_dataset as text_dataset\n",
-    "from torch.utils.data import DataLoader\n",
+    "import datasets\n",
     "import h5py\n",
-    "import os\n",
+    "import nnsight\n",
+    "import torch\n",
+    "from torch.utils.data import DataLoader\n",
     "\n",
     "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"\"\n",
     "\n",
@@ -59,7 +60,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model = nnsight.LanguageModel('openai-community/gpt2', device_map='auto', dispatch=True)"
+    "model = nnsight.LanguageModel(\"openai-community/gpt2\", device_map=\"auto\", dispatch=True)"
    ]
   },
   {
@@ -119,7 +120,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset = datasets.load_dataset('Skylion007/openwebtext', split='train')"
+    "dataset = datasets.load_dataset(\"Skylion007/openwebtext\", split=\"train\")"
    ]
   },
   {
@@ -221,6 +222,7 @@
    ],
    "source": [
     "import time\n",
+    "\n",
     "from tqdm import tqdm\n",
     "\n",
     "# Run for 10 seconds to get average throughput\n",
@@ -239,10 +241,10 @@
     "samples_per_sec = total_samples / elapsed\n",
     "batches_per_sec = total_batches / elapsed\n",
     "\n",
-    "print(f\"\\nThroughput:\")\n",
+    "print(\"\\nThroughput:\")\n",
     "print(f\"Samples/sec: {samples_per_sec:.1f}\")\n",
     "print(f\"Batches/sec: {batches_per_sec:.1f}\")\n",
-    "print(f\"Average batch size: {total_samples/total_batches:.1f}\")"
+    "print(f\"Average batch size: {total_samples / total_batches:.1f}\")"
    ]
   },
   {
@@ -253,7 +255,7 @@
    "outputs": [],
    "source": [
     "def extract_activations(model, tokens):\n",
-    "    with model.trace(tokens) as tracer:\n",
+    "    with model.trace(tokens):\n",
     "        mlp_ins = []\n",
     "        mlp_outs = []\n",
     "        for i in range(12):\n",
@@ -303,8 +305,8 @@
    ],
    "source": [
     "import time\n",
-    "from tqdm import tqdm\n",
     "\n",
+    "from tqdm import tqdm\n",
     "\n",
     "# Run benchmark for 10 seconds to measure activation extraction throughput\n",
     "start_time = time.time()\n",
@@ -316,14 +318,14 @@
     "for batch in tqdm(text_dataset_loader):\n",
     "    if time.time() > end_time:\n",
     "        break\n",
-    "        \n",
+    "\n",
     "    # prepend BOS token like in main loop\n",
     "    batch = torch.roll(batch, shifts=1, dims=1)\n",
     "    batch[:, 0] = model.config.bos_token_id\n",
-    "    \n",
+    "\n",
     "    # Extract activations\n",
     "    mlp_acts = extract_activations(model, batch)\n",
-    "    \n",
+    "\n",
     "    total_tokens += batch.numel()\n",
     "    total_batches += 1\n",
     "\n",
@@ -331,10 +333,10 @@
     "tokens_per_sec = total_tokens / elapsed\n",
     "batches_per_sec = total_batches / elapsed\n",
     "\n",
-    "print(f\"\\nActivation Extraction Throughput:\")\n",
+    "print(\"\\nActivation Extraction Throughput:\")\n",
     "print(f\"Tokens/sec: {tokens_per_sec:,.1f}\")\n",
     "print(f\"Batches/sec: {batches_per_sec:.1f}\")\n",
-    "print(f\"Average batch size: {total_tokens/total_batches:.1f} tokens\")\n"
+    "print(f\"Average batch size: {total_tokens / total_batches:.1f} tokens\")"
    ]
   },
   {
@@ -357,8 +359,7 @@
    ],
    "source": [
     "with model.trace(batch):\n",
-    "    print('processing')\n",
-    "    "
+    "    print(\"processing\")"
    ]
   },
   {
@@ -637,14 +638,14 @@
     }
    ],
    "source": [
-    "store_path = '/var/local/glang/activations'\n",
-    "filename = 'clt-activations-10M.h5'\n",
+    "store_path = \"/var/local/glang/activations\"\n",
+    "filename = \"clt-activations-10M.h5\"\n",
     "store_size = 10000000\n",
     "actv_size = model.config.n_embd\n",
     "\n",
     "with h5py.File(os.path.join(store_path, filename), \"w\") as f:\n",
     "    h5_dataset = f.create_dataset(\n",
-    "        'tensor', (store_size, 2, model.config.n_layer, model.config.n_embd), dtype='float32'\n",
+    "        \"tensor\", (store_size, 2, model.config.n_layer, model.config.n_embd), dtype=\"float32\"\n",
     "    )\n",
     "\n",
     "    h5_pointer = 0\n",
@@ -662,9 +663,7 @@
     "        n_acts = mlp_acts.shape[0]\n",
     "\n",
     "        if h5_pointer + n_acts > store_size:\n",
-    "            h5_dataset[h5_pointer:] = (\n",
-    "                mlp_acts[: int(store_size - h5_pointer)].cpu().numpy()\n",
-    "            )\n",
+    "            h5_dataset[h5_pointer:] = mlp_acts[: int(store_size - h5_pointer)].cpu().numpy()\n",
     "            break\n",
     "        else:\n",
     "            h5_dataset[h5_pointer : h5_pointer + n_acts] = mlp_acts.cpu().numpy()\n",
diff --git a/crosslayer_transcoder/utils/store_actvs.ipynb b/crosslayer_transcoder/utils/store_actvs.ipynb
index 6229c23..9188acf 100644
--- a/crosslayer_transcoder/utils/store_actvs.ipynb
+++ b/crosslayer_transcoder/utils/store_actvs.ipynb
@@ -25,13 +25,14 @@
     }
    ],
    "source": [
-    "import torch\n",
-    "import nnsight\n",
+    "import os\n",
+    "\n",
     "import datasets\n",
+    "import h5py\n",
+    "import nnsight\n",
     "import text_dataset\n",
+    "import torch\n",
     "from torch.utils.data import DataLoader\n",
-    "import h5py\n",
-    "import os\n",
     "\n",
     "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"3\"\n",
     "\n",
@@ -60,7 +61,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "model = nnsight.LanguageModel('openai-community/gpt2', device_map='auto', dispatch=True)"
+    "model = nnsight.LanguageModel(\"openai-community/gpt2\", device_map=\"auto\", dispatch=True)"
    ]
   },
   {
@@ -120,7 +121,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "dataset = datasets.load_dataset('Skylion007/openwebtext', split='train')"
+    "dataset = datasets.load_dataset(\"Skylion007/openwebtext\", split=\"train\")"
    ]
   },
   {
@@ -178,7 +179,7 @@
    "outputs": [],
    "source": [
     "def extract_activations(model, tokens):\n",
-    "    with model.trace(tokens) as tracer:\n",
+    "    with model.trace(tokens):\n",
     "        mlp_ins = []\n",
     "        mlp_outs = []\n",
     "        for i in range(12):\n",
@@ -459,14 +460,14 @@
     }
    ],
    "source": [
-    "store_path = '/var/local/glang/activations'\n",
-    "filename = 'clt-activations-10M.h5'\n",
+    "store_path = \"/var/local/glang/activations\"\n",
+    "filename = \"clt-activations-10M.h5\"\n",
     "store_size = 10000000\n",
     "actv_size = model.config.n_embd\n",
     "\n",
     "with h5py.File(os.path.join(store_path, filename), \"w\") as f:\n",
     "    h5_dataset = f.create_dataset(\n",
-    "        'tensor', (store_size, 2, model.config.n_layer, model.config.n_embd), dtype='float32'\n",
+    "        \"tensor\", (store_size, 2, model.config.n_layer, model.config.n_embd), dtype=\"float32\"\n",
     "    )\n",
     "\n",
     "    h5_pointer = 0\n",
@@ -484,9 +485,7 @@
     "        n_acts = mlp_acts.shape[0]\n",
     "\n",
     "        if h5_pointer + n_acts > store_size:\n",
-    "            h5_dataset[h5_pointer:] = (\n",
-    "                mlp_acts[: int(store_size - h5_pointer)].cpu().numpy()\n",
-    "            )\n",
+    "            h5_dataset[h5_pointer:] = mlp_acts[: int(store_size - h5_pointer)].cpu().numpy()\n",
     "            break\n",
     "        else:\n",
     "            h5_dataset[h5_pointer : h5_pointer + n_acts] = mlp_acts.cpu().numpy()\n",
diff --git a/crosslayer_transcoder/utils/store_actvs_benchmark.py b/crosslayer_transcoder/utils/store_actvs_benchmark.py
index bb7c204..a88adfb 100644
--- a/crosslayer_transcoder/utils/store_actvs_benchmark.py
+++ b/crosslayer_transcoder/utils/store_actvs_benchmark.py
@@ -1,15 +1,13 @@
 import os
 import time
 
+import activation_server.text_dataset as text_dataset
 import datasets
-import h5py
 import nnsight
 import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 
-import activation_server.text_dataset as text_dataset
-
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
 
@@ -32,9 +30,7 @@
 
 def create_model(use_fp16=False):
     """Create model with optional FP16"""
-    model = nnsight.LanguageModel(
-        "openai-community/gpt2", device_map="cpu", dispatch=True
-    )
+    model = nnsight.LanguageModel("openai-community/gpt2", device_map="cpu", dispatch=True)
     model.requires_grad_(False)
 
     if use_fp16:
@@ -50,7 +46,7 @@ def create_model(use_fp16=False):
 
 @torch.no_grad()
 def extract_activations(model, tokens):
-    with model.trace(tokens) as tracer:
+    with model.trace(tokens):
         mlp_ins = []
         mlp_outs = []
         for i in range(12):
@@ -108,7 +104,7 @@ def benchmark_config(batch_size, use_fp16=False, duration=30):
             batch[:, 0] = model.config.bos_token_id
 
             # Extract activations
-            mlp_acts = extract_activations(model, batch)
+            extract_activations(model, batch)
 
             total_tokens += batch.numel()
             total_batches += 1
@@ -121,8 +117,8 @@ def benchmark_config(batch_size, use_fp16=False, duration=30):
     tokens_per_sec = total_tokens / elapsed
 
     print(f"Tokens/sec: {tokens_per_sec:,.1f}")
-    print(f"Batches/sec: {total_batches/elapsed:.1f}")
-    print(f"Avg batch size: {total_tokens/total_batches:.1f} tokens")
+    print(f"Batches/sec: {total_batches / elapsed:.1f}")
+    print(f"Avg batch size: {total_tokens / total_batches:.1f} tokens")
 
     return tokens_per_sec
 
@@ -140,11 +136,7 @@ def benchmark_config(batch_size, use_fp16=False, duration=30):
 
     print("\n=== RESULTS ===")
     baseline = results.get("batch_40", 2100)
-    for config, tokens_per_sec in sorted(
-        results.items(), key=lambda x: x[1], reverse=True
-    ):
+    for config, tokens_per_sec in sorted(results.items(), key=lambda x: x[1], reverse=True):
         if tokens_per_sec > 0:
             improvement = tokens_per_sec / baseline
-            print(
-                f"{config:15}: {tokens_per_sec:8,.1f} tokens/sec ({improvement:.1f}x)"
-            )
+            print(f"{config:15}: {tokens_per_sec:8,.1f} tokens/sec ({improvement:.1f}x)")
diff --git a/crosslayer_transcoder/utils/store_actvs_benchmark_optimized.py b/crosslayer_transcoder/utils/store_actvs_benchmark_optimized.py
index 51aace9..0eaf869 100644
--- a/crosslayer_transcoder/utils/store_actvs_benchmark_optimized.py
+++ b/crosslayer_transcoder/utils/store_actvs_benchmark_optimized.py
@@ -1,14 +1,13 @@
 import os
 import time
 
+import activation_server.text_dataset as text_dataset
 import datasets
 import nnsight
 import torch
 from torch.utils.data import DataLoader
 from tqdm import tqdm
 
-import activation_server.text_dataset as text_dataset
-
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
 
 # Set optimal thread count for single model
@@ -21,9 +20,7 @@
 
 def create_model(use_float16=False):
     """Create and optimize model"""
-    model = nnsight.LanguageModel(
-        "openai-community/gpt2", device_map="cpu", dispatch=True
-    )
+    model = nnsight.LanguageModel("openai-community/gpt2", device_map="cpu", dispatch=True)
     model.requires_grad_(False)
 
     if use_float16:
@@ -36,7 +33,7 @@ def extract_activations_optimized(model, tokens, use_float16=False):
     """Optimized activation extraction"""
     # Don't convert tokens to float16 - they must stay as integers for embedding
 
-    with model.trace(tokens) as tracer:
+    with model.trace(tokens):
         mlp_ins = []
         mlp_outs = []
         for i in range(12):
@@ -59,9 +56,7 @@ def extract_activations_optimized(model, tokens, use_float16=False):
 
 def benchmark_single_model(batch_size=40, use_float16=False, duration=30):
     """Benchmark single model with different batch sizes"""
-    print(
-        f"\n=== Single Model Benchmark (batch_size={batch_size}, fp16={use_float16}) ==="
-    )
+    print(f"\n=== Single Model Benchmark (batch_size={batch_size}, fp16={use_float16}) ===")
 
     model = create_model(use_float16)
     dataset = datasets.load_dataset("Skylion007/openwebtext", split="train")
@@ -101,7 +96,7 @@ def benchmark_single_model(batch_size=40, use_float16=False, duration=30):
         batch[:, 0] = model.config.bos_token_id
 
         # Extract activations
-        mlp_acts = extract_activations_optimized(model, batch, use_float16)
+        extract_activations_optimized(model, batch, use_float16)
 
         total_tokens += batch.numel()
         total_batches += 1
@@ -110,8 +105,8 @@ def benchmark_single_model(batch_size=40, use_float16=False, duration=30):
     tokens_per_sec = total_tokens / elapsed
 
     print(f"Tokens/sec: {tokens_per_sec:,.1f}")
-    print(f"Batches/sec: {total_batches/elapsed:.1f}")
-    print(f"Average batch size: {total_tokens/total_batches:.1f} tokens")
+    print(f"Batches/sec: {total_batches / elapsed:.1f}")
+    print(f"Average batch size: {total_tokens / total_batches:.1f} tokens")
 
     return tokens_per_sec
 
@@ -123,9 +118,7 @@ def benchmark_single_model(batch_size=40, use_float16=False, duration=30):
     # Test different batch sizes
     for batch_size in [40, 80, 160, 320]:
         try:
-            results[f"batch_{batch_size}"] = benchmark_single_model(
-                batch_size, False, 20
-            )
+            results[f"batch_{batch_size}"] = benchmark_single_model(batch_size, False, 20)
         except Exception as e:
             print(f"Failed batch_size={batch_size}: {e}")
 
@@ -182,8 +175,6 @@ def benchmark_single_model(batch_size=40, use_float16=False, duration=30):
         print(f"Failed 16 workers: {e}")
 
     print("\n=== RESULTS SUMMARY ===")
-    for config, tokens_per_sec in sorted(
-        results.items(), key=lambda x: x[1], reverse=True
-    ):
+    for config, tokens_per_sec in sorted(results.items(), key=lambda x: x[1], reverse=True):
         improvement = tokens_per_sec / baseline
         print(f"{config:20}: {tokens_per_sec:8,.1f} tokens/sec ({improvement:.1f}x)")
diff --git a/crosslayer_transcoder/utils/utils.py b/crosslayer_transcoder/utils/utils.py
index 3f38020..f0c6fb5 100644
--- a/crosslayer_transcoder/utils/utils.py
+++ b/crosslayer_transcoder/utils/utils.py
@@ -28,11 +28,9 @@ def get_webtext_dataloader(model, dataset_name="Skylion007/openwebtext", batch_s
 
 
 def plot_actvs(actvs):
-    if type(actvs) == list:
+    if isinstance(actvs, list):
         # actvs: list(n_layers), every entry tensor batch_size x seq_len x d_acts
-        actvs = torch.stack(
-            [actv[0, 50, 300:500] for actv in actvs]
-        )  # n_layers x d_acts
+        actvs = torch.stack([actv[0, 50, 300:500] for actv in actvs])  # n_layers x d_acts
     else:  # batch_size x n_layers x d_acts
         actvs = actvs[0, :, 300:500]  # n_layers x d_acts
     actvs = actvs.cpu().numpy()
diff --git a/pyproject.toml b/pyproject.toml
index 9077485..d54bee1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,7 +40,22 @@ dev-dependencies = [
     "jupyterlab>=4.4.2",
     "ipykernel>=6.29.5",
     "pytest>=8.4.1",
+    "ruff>=0.9.0",
 ]
 
-[tool.black]
+[tool.ruff]
 line-length = 110
+target-version = "py312"
+
+[tool.ruff.lint]
+select = ["E", "W", "F", "I", "B"]
+ignore = [
+  "E501",   # line length (handled by formatter)
+  "E402",   # module-import-not-at-top-of-file (intentional for env vars)
+  "F722",   # forward-annotation-syntax-error (jaxtyping uses special syntax)
+  "F821",   # undefined-name (jaxtyping single-axis annotations); NOTE: a global ignore also hides genuine undefined names
+  "E741",   # ambiguous-variable-name (common in ML: l, x, y, etc.)
+]
+
+[tool.ruff.lint.isort]
+known-first-party = ["crosslayer_transcoder"]
diff --git a/setup.sh b/setup.sh
index 48d1062..12b02ef 100755
--- a/setup.sh
+++ b/setup.sh
@@ -5,9 +5,9 @@ RED='\033[0;31m'
 GREEN='\033[0;32m'
 YELLOW='\033[1;33m'
 BLUE='\033[0;34m'
-NC='\033[0m' # No Color
+NC='\033[0m'
 
-echo -e "${BLUE}🚀 Setting up crosslayer-transcoder environment...${NC}"
+echo -e "${BLUE}🚀 Setting up crosslayer-transcoder...${NC}"
 
 # Function to check if command exists
 command_exists() {
@@ -16,120 +16,31 @@ command_exists() {
 
 # Install uv if not present
 if ! command_exists uv; then
-    echo -e "${YELLOW}📦 Installing uv (fast Python package manager)...${NC}"
+    echo -e "${YELLOW}📦 Installing uv...${NC}"
     curl -LsSf https://astral.sh/uv/install.sh | sh
-    
-    # Source the shell configuration to make uv available
     export PATH="$HOME/.local/bin:$HOME/.cargo/bin:$PATH"
     
     if ! command_exists uv; then
-        echo -e "${RED}❌ Failed to install uv. Please install manually: https://docs.astral.sh/uv/getting-started/installation/${NC}"
+        echo -e "${RED}❌ Failed to install uv. See: https://docs.astral.sh/uv/${NC}"
         exit 1
     fi
-    echo -e "${GREEN}✅ uv installed successfully!${NC}"
+    echo -e "${GREEN}✅ uv installed${NC}"
 else
-    echo -e "${GREEN}✅ uv is already installed${NC}"
-fi
-
-# Create .venv and install dependencies
-echo -e "${YELLOW}🔧 Creating virtual environment and installing dependencies...${NC}"
-
-# Remove existing .venv if it exists
-if [ -d ".venv" ]; then
-    echo -e "${YELLOW}🗑️  Removing existing .venv directory...${NC}"
-    rm -rf .venv
+    echo -e "${GREEN}✅ uv already installed${NC}"
 fi
 
-# Use uv to create venv and install dependencies
-# This will automatically:
-# - Install the correct Python version (3.12)
-# - Create .venv directory
-# - Install all dependencies from pyproject.toml
-export PATH="$HOME/.local/bin:$PATH"
-uv sync --dev --python 3.12
+# Install dependencies
+echo -e "${YELLOW}🔧 Installing dependencies...${NC}"
+uv sync
 
 if [ $? -eq 0 ]; then
-    echo -e "${YELLOW}🔧 Setting up permissions and PATH...${NC}"
-    
-    # Fix permissions for all executables in .venv/bin
-    chmod +x .venv/bin/*
-    
-    # Add uv to PATH permanently by updating shell profile
-    UV_PATH_EXPORT='export PATH="$HOME/.local/bin:$PATH"'
-    
-    # Check which shell profile to update
-    UPDATED_PROFILE=false
-    
-    # Try .bashrc first (most common)
-    if [ -f "$HOME/.bashrc" ] || [ "$SHELL" = "/bin/bash" ]; then
-        if [ ! -f "$HOME/.bashrc" ]; then
-            touch "$HOME/.bashrc"
-        fi
-        if ! grep -q "\.local/bin" "$HOME/.bashrc"; then
-            echo "" >> "$HOME/.bashrc"
-            echo "# Added by crosslayer-transcoder setup" >> "$HOME/.bashrc"
-            echo "$UV_PATH_EXPORT" >> "$HOME/.bashrc"
-            echo -e "${GREEN}✅ Added uv to PATH in ~/.bashrc${NC}"
-            UPDATED_PROFILE=true
-        fi
-    fi
-    
-    # Try .zshrc for zsh users
-    if [ -f "$HOME/.zshrc" ]; then
-        if ! grep -q "\.local/bin" "$HOME/.zshrc"; then
-            echo "" >> "$HOME/.zshrc"
-            echo "# Added by crosslayer-transcoder setup" >> "$HOME/.zshrc"
-            echo "$UV_PATH_EXPORT" >> "$HOME/.zshrc"
-            echo -e "${GREEN}✅ Added uv to PATH in ~/.zshrc${NC}"
-            UPDATED_PROFILE=true
-        fi
-    fi
-    
-    # Fallback to .profile (works for all POSIX shells)
-    if [ "$UPDATED_PROFILE" = false ] && [ -f "$HOME/.profile" ]; then
-        if ! grep -q "\.local/bin" "$HOME/.profile"; then
-            echo "" >> "$HOME/.profile"
-            echo "# Added by crosslayer-transcoder setup" >> "$HOME/.profile"
-            echo "$UV_PATH_EXPORT" >> "$HOME/.profile"
-            echo -e "${GREEN}✅ Added uv to PATH in ~/.profile${NC}"
-            UPDATED_PROFILE=true
-        fi
-    fi
-    
-    # Create .bashrc as final fallback
-    if [ "$UPDATED_PROFILE" = false ]; then
-        echo "" >> "$HOME/.bashrc"
-        echo "# Added by crosslayer-transcoder setup" >> "$HOME/.bashrc"
-        echo "$UV_PATH_EXPORT" >> "$HOME/.bashrc"
-        echo -e "${GREEN}✅ Created ~/.bashrc and added uv to PATH${NC}"
-    fi
-    
-    echo -e "${GREEN}✅ Environment setup complete!${NC}"
-    echo ""
-    echo -e "${BLUE}📝 To activate the environment, run:${NC}"
-    echo -e "${YELLOW}   source .venv/bin/activate${NC}"
-    echo -e "${BLUE}   (or simply: .venv/bin/activate)${NC}"
+    echo -e "${GREEN}✅ Setup complete!${NC}"
     echo ""
-    echo -e "${BLUE}📝 To run Jupyter Lab:${NC}"
-    echo -e "${YELLOW}   .venv/bin/jupyter lab${NC}"
-    echo ""
-    echo -e "${BLUE}📝 To run Python scripts:${NC}"
-    echo -e "${YELLOW}   .venv/bin/python your_script.py${NC}"
-    echo ""
-    echo -e "${BLUE}📝 To add new dependencies:${NC}"
-    echo -e "${YELLOW}   uv add package_name${NC}"
-    echo ""
-    echo -e "${BLUE}📝 To make uv available globally, restart your terminal or run:${NC}"
-    if [ -f "$HOME/.bashrc" ]; then
-        echo -e "${YELLOW}   source ~/.bashrc${NC}"
-    elif [ -f "$HOME/.zshrc" ]; then
-        echo -e "${YELLOW}   source ~/.zshrc${NC}"
-    elif [ -f "$HOME/.profile" ]; then
-        echo -e "${YELLOW}   source ~/.profile${NC}"
-    else
-        echo -e "${YELLOW}   source ~/.bashrc${NC}"
-    fi
+    echo -e "${BLUE}Usage:${NC}"
+    echo -e "  ${YELLOW}uv run python script.py${NC}    - Run a Python script"
+    echo -e "  ${YELLOW}uv add package_name${NC}        - Add a dependency"
+    echo -e "  ${YELLOW}uv sync --no-dev${NC}           - Sync without dev tools (ruff, pytest, jupyter are installed by default)"
 else
-    echo -e "${RED}❌ Setup failed. Please check the error messages above.${NC}"
+    echo -e "${RED}❌ Setup failed${NC}"
     exit 1
-fi 
\ No newline at end of file
+fi
diff --git a/tests/benchmark_data_loader.py b/tests/benchmark_data_loader.py
index c029195..eda33c3 100644
--- a/tests/benchmark_data_loader.py
+++ b/tests/benchmark_data_loader.py
@@ -3,10 +3,11 @@
 Comprehensive data loader benchmark for ActivationDataModule.
 Tests both shared memory and simple buffer modes.
 """
+
 import os
 import sys
 import time
-from typing import Any, Dict, Optional
+from typing import Dict, Optional
 
 # Add project root to Python path for reliable imports
 project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -80,8 +81,7 @@ def benchmark_loader(loader, mode_name: str, duration: float = 30.0) -> float:
                 elapsed = time.time() - start_time
                 current_mb_s = (total_bytes / (1024 * 1024)) / elapsed
                 print(
-                    f"   Batch {batch_count}: {batch_time*1000:.1f}ms, "
-                    f"Running avg: {current_mb_s:.1f} MB/s"
+                    f"   Batch {batch_count}: {batch_time * 1000:.1f}ms, Running avg: {current_mb_s:.1f} MB/s"
                 )
 
     except Exception as e:
@@ -104,7 +104,7 @@ def benchmark_loader(loader, mode_name: str, duration: float = 30.0) -> float:
     # Show buffer stats if available
     if hasattr(loader, "get_stats"):
         stats = loader.get_stats()
-        print(f"   Final buffer status:")
+        print("   Final buffer status:")
         print(f"     Valid samples: {stats['valid_samples']:,} ({stats['valid_percentage']:.1f}%)")
         print(f"     Buffer memory: {stats['total_memory_gb']:.2f} GB")
 
@@ -252,7 +252,7 @@ def main():
     batch_size = 5000
     duration = 30.0
 
-    print(f"⚙️  Configuration:")
+    print("⚙️  Configuration:")
     print(f"   Batch size: {batch_size:,}")
     print(f"   Test duration: {duration}s per mode")
     print(f"   PyTorch version: {torch.__version__}")
@@ -270,7 +270,7 @@ def main():
         results["DataModule Simple"] = result
 
     # Final comparison
-    print(f"\n🏆 FINAL COMPARISON")
+    print("\n🏆 FINAL COMPARISON")
     print("=" * 60)
     if results:
         max_name_len = max(len(name) for name in results.keys())
@@ -295,7 +295,7 @@ def main():
     else:
         print("   ❌ No successful benchmarks completed")
 
-    print(f"\n✅ Benchmark complete!")
+    print("\n✅ Benchmark complete!")
 
 
 if __name__ == "__main__":
diff --git a/tests/benchmark_text_dataloader.py b/tests/benchmark_text_dataloader.py
index d29d25f..4b32498 100644
--- a/tests/benchmark_text_dataloader.py
+++ b/tests/benchmark_text_dataloader.py
@@ -8,11 +8,10 @@
 
 import nnsight
 import torch
+from data import text_dataset
 from datasets import load_dataset
 from torch.utils.data import DataLoader
 
-from data import text_dataset
-
 
 def benchmark_textdataset(
     dataset_name: str = "Skylion007/openwebtext",
@@ -35,7 +34,7 @@ def benchmark_textdataset(
         max_batches: Maximum batches to process
         max_duration: Maximum time to run benchmark (seconds)
     """
-    print(f"=== TextDataset Benchmark ===")
+    print("=== TextDataset Benchmark ===")
     print(f"Dataset: {dataset_name}")
     print(f"Model: {model_name}")
     print(f"Batch size: {batch_size}")
@@ -175,9 +174,7 @@ def benchmark_textdataset(
     print(f"Dataset load: {dataset_load_time:.2f}s")
     print(f"TextDataset: {textdataset_time:.2f}s")
     print(f"DataLoader init: {dataloader_time:.2f}s")
-    print(
-        f"Total setup: {model_load_time + dataset_load_time + textdataset_time + dataloader_time:.2f}s"
-    )
+    print(f"Total setup: {model_load_time + dataset_load_time + textdataset_time + dataloader_time:.2f}s")
 
 
 def compare_workers():
@@ -187,7 +184,7 @@ def compare_workers():
     worker_configs = [0, 1, 2, 4]
 
     for num_workers in worker_configs:
-        print(f"\n{'='*50}")
+        print(f"\n{'=' * 50}")
         print(f"Testing with {num_workers} workers")
         print("=" * 50)
 
@@ -201,25 +198,15 @@ def compare_workers():
 if __name__ == "__main__":
     import argparse
 
-    parser = argparse.ArgumentParser(
-        description="Benchmark TextDataset token throughput"
-    )
-    parser.add_argument(
-        "--dataset", default="Skylion007/openwebtext", help="Dataset name"
-    )
+    parser = argparse.ArgumentParser(description="Benchmark TextDataset token throughput")
+    parser.add_argument("--dataset", default="Skylion007/openwebtext", help="Dataset name")
     parser.add_argument("--model", default="openai-community/gpt2", help="Model name")
     parser.add_argument("--batch-size", type=int, default=32, help="Batch size")
     parser.add_argument("--seq-len", type=int, default=1024, help="Sequence length")
     parser.add_argument("--workers", type=int, default=0, help="Number of workers")
-    parser.add_argument(
-        "--max-batches", type=int, default=10, help="Max batches to process"
-    )
-    parser.add_argument(
-        "--max-duration", type=float, default=60.0, help="Max duration (seconds)"
-    )
-    parser.add_argument(
-        "--compare-workers", action="store_true", help="Compare different worker counts"
-    )
+    parser.add_argument("--max-batches", type=int, default=10, help="Max batches to process")
+    parser.add_argument("--max-duration", type=float, default=60.0, help="Max duration (seconds)")
+    parser.add_argument("--compare-workers", action="store_true", help="Compare different worker counts")
 
     args = parser.parse_args()
 
diff --git a/tests/test_dead_features.py b/tests/test_dead_features.py
index 6afbf10..2d6cd7b 100644
--- a/tests/test_dead_features.py
+++ b/tests/test_dead_features.py
@@ -51,9 +51,9 @@ def test_initialization(self, metric_params):
 
         assert metric.n_features == n_features
         assert metric.n_layers == n_layers
-        assert metric.return_per_layer == False
-        assert metric.return_neuron_indices == False
-        assert metric.return_log_freqs == False
+        assert not metric.return_per_layer
+        assert not metric.return_neuron_indices
+        assert not metric.return_log_freqs
         assert metric.n_active.shape == (n_layers, n_features)
         assert torch.all(metric.n_active == 0)
         assert metric.n_total.shape == (1,)
@@ -69,9 +69,9 @@ def test_initialization_with_flags(self, metric_params):
             return_neuron_indices=True,
         )
 
-        assert metric.return_per_layer == True
-        assert metric.return_neuron_indices == True
-        assert metric.return_log_freqs == False
+        assert metric.return_per_layer
+        assert metric.return_neuron_indices
+        assert not metric.return_log_freqs
 
 
 class TestDeadFeaturesZeroInput:
diff --git a/tests/test_deployment_policy.py b/tests/test_deployment_policy.py
index cb55dae..60b952e 100644
--- a/tests/test_deployment_policy.py
+++ b/tests/test_deployment_policy.py
@@ -4,10 +4,9 @@
 Tests that deployment policies correctly handle model instantiation and device management.
 """
 
-from unittest.mock import Mock, patch
+from unittest.mock import Mock
 
 import pytest
-import torch
 
 from crosslayer_transcoder.data import ActivationDataModule
 from crosslayer_transcoder.data.deployment_policy import (
@@ -67,13 +66,13 @@ def test_gpu_only_device_parsing(self):
         policy = create_deployment_policy(DeploymentPolicy.GPU_ONLY, device_map="cuda:3")
         devices, is_multi_gpu = policy._parse_device_map()
         assert devices == ["cuda:3"]
-        assert is_multi_gpu == False
+        assert not is_multi_gpu
 
         # Test multi-GPU
         policy = create_deployment_policy(DeploymentPolicy.GPU_ONLY, device_map="cuda:0,1,2,3")
         devices, is_multi_gpu = policy._parse_device_map()
         assert devices == ["cuda:0", "cuda:1", "cuda:2", "cuda:3"]
-        assert is_multi_gpu == True
+        assert is_multi_gpu
 
     def test_gpu_only_device_tracking_logic(self):
         """Test that GPU-only policy correctly calculates device tracking without loading models."""
diff --git a/tests/test_process_monitor.py b/tests/test_process_monitor.py
index dd8f416..e0a32ae 100644
--- a/tests/test_process_monitor.py
+++ b/tests/test_process_monitor.py
@@ -3,8 +3,6 @@
 Tests for process monitor functionality.
 """
 
-from unittest.mock import Mock, patch
-
 import pytest
 
 from crosslayer_transcoder.data.process_monitor import ProcessMonitor, WandBProcessMonitor
@@ -55,7 +53,7 @@ def test_device_code_mapping(self):
 
     def test_dashboard_device_string_generation(self):
         """Test that dashboard correctly displays device strings."""
-        monitor = ProcessMonitor()
+        ProcessMonitor()
 
         # Test different device types for dashboard display
         test_cases = [
diff --git a/tests/test_standardization_folding.py b/tests/test_standardization_folding.py
index 8136775..20b960c 100644
--- a/tests/test_standardization_folding.py
+++ b/tests/test_standardization_folding.py
@@ -8,7 +8,6 @@
     DimensionwiseOutputStandardizer,
 )
 
-
 torch.manual_seed(42)
 DTYPE = torch.float64
 BATCH_SIZE, D_ACTS, D_FEATS, N_LAYERS = 100, 768, 32, 12
@@ -29,9 +28,7 @@ def assert_allclose(lhs, rhs, rtol=1e-7, atol=1e-9):
 def test_math_sanity_check():
     encoder = Encoder(d_acts=D_ACTS, d_features=D_FEATS, n_layers=N_LAYERS).to(DTYPE)
 
-    input_std = DimensionwiseInputStandardizer(
-        n_layers=N_LAYERS, activation_dim=D_ACTS
-    ).to(DTYPE)
+    input_std = DimensionwiseInputStandardizer(n_layers=N_LAYERS, activation_dim=D_ACTS).to(DTYPE)
     input_std.initialize_from_batch(batch=BATCH)
 
     resid = BATCH[:, 0]
@@ -68,9 +65,7 @@ def test_math_sanity_check():
 
 def test_encoder_standarization_folding():
     encoder = Encoder(d_acts=D_ACTS, d_features=D_FEATS, n_layers=N_LAYERS).to(DTYPE)
-    input_std = DimensionwiseInputStandardizer(
-        n_layers=N_LAYERS, activation_dim=D_ACTS
-    ).to(DTYPE)
+    input_std = DimensionwiseInputStandardizer(n_layers=N_LAYERS, activation_dim=D_ACTS).to(DTYPE)
 
     input_std.initialize_from_batch(batch=BATCH)
 
@@ -92,12 +87,8 @@ def test_encoder_standarization_folding():
 
 
 def test_decoder_standarization_folding():
-    decoder = CrosslayerDecoder(
-        d_acts=D_ACTS, d_features=D_FEATS, n_layers=N_LAYERS
-    ).to(DTYPE)
-    output_std = DimensionwiseOutputStandardizer(
-        n_layers=N_LAYERS, activation_dim=D_ACTS
-    ).to(DTYPE)
+    decoder = CrosslayerDecoder(d_acts=D_ACTS, d_features=D_FEATS, n_layers=N_LAYERS).to(DTYPE)
+    output_std = DimensionwiseOutputStandardizer(n_layers=N_LAYERS, activation_dim=D_ACTS).to(DTYPE)
 
     output_std.initialize_from_batch(batch=BATCH)
 
@@ -119,8 +110,6 @@ def test_decoder_standarization_folding():
     b_dec_folded = output_std.fold_in_decoder_bias(decoder.b)
     folded_params["b"] = torch.nn.Parameter(b_dec_folded.clone().detach())
 
-    recons_folded = functional_call(
-        decoder, folded_params, test_features, {"layer": "all"}
-    )
+    recons_folded = functional_call(decoder, folded_params, test_features, {"layer": "all"})
 
     assert_allclose(recons, recons_folded)
diff --git a/tests/test_text_dataset.py b/tests/test_text_dataset.py
index 8ed8f74..e0d3b4c 100644
--- a/tests/test_text_dataset.py
+++ b/tests/test_text_dataset.py
@@ -10,7 +10,7 @@
 from datasets import load_dataset
 from torch.utils.data import DataLoader
 
-from crosslayer_transcoder.data.text_dataset import TextDataset, worker_init_fn
+from crosslayer_transcoder.data.text_dataset import TextDataset
 
 
 class MockDataset:
@@ -65,7 +65,7 @@ def test_init(self, text_dataset, dataset, model):
         assert text_dataset.to_tokens == model.tokenizer
         assert text_dataset.batch_size == 4
         assert text_dataset.seq_len == 128
-        assert text_dataset.drop_last_batch == True
+        assert text_dataset.drop_last_batch
         assert text_dataset.hf_text_accessor == "text"
         assert text_dataset.token_pointer == 0
         assert text_dataset.batch_pointer == 0
@@ -186,7 +186,10 @@ def test_with_dataloader(self, text_dataset):
         """Test TextDataset with PyTorch DataLoader."""
         # Test with single worker
         dataloader = DataLoader(
-            text_dataset, batch_size=None, shuffle=False, num_workers=0  # TextDataset handles batching
+            text_dataset,
+            batch_size=None,
+            shuffle=False,
+            num_workers=0,  # TextDataset handles batching
         )
 
         batch, mask = next(iter(dataloader))
@@ -350,9 +353,9 @@ def test_custom_english_sentences_roundtrip(self, model):
 
             # The tokens should match (at least the first part)
             min_len = min(len(manual_tokens), len(batch_tokens))
-            assert torch.equal(
-                torch.tensor(manual_tokens[:min_len]), batch_tokens[:min_len]
-            ), f"Token mismatch for sentence {i}: expected {manual_tokens[:min_len]}, got {batch_tokens[:min_len].tolist()}"
+            assert torch.equal(torch.tensor(manual_tokens[:min_len]), batch_tokens[:min_len]), (
+                f"Token mismatch for sentence {i}: expected {manual_tokens[:min_len]}, got {batch_tokens[:min_len].tolist()}"
+            )
 
     def test_short_sequences_padding_and_mask(self, model):
         """Test that short sequences (< seq_len) are properly padded and masked."""
@@ -393,30 +396,30 @@ def test_short_sequences_padding_and_mask(self, model):
 
             # Check that the actual tokens match manual tokenization
             actual_tokens = batch[i][:expected_length]
-            assert torch.equal(
-                actual_tokens, torch.tensor(manual_tokens)
-            ), f"Token mismatch for sentence {i}: expected {manual_tokens}, got {actual_tokens.tolist()}"
+            assert torch.equal(actual_tokens, torch.tensor(manual_tokens)), (
+                f"Token mismatch for sentence {i}: expected {manual_tokens}, got {actual_tokens.tolist()}"
+            )
 
             # Check mask behavior
             sentence_mask = mask[i]
 
             # First `expected_length` positions should be True
-            assert torch.all(
-                sentence_mask[:expected_length]
-            ), f"Mask should be True for first {expected_length} positions in sentence {i}"
+            assert torch.all(sentence_mask[:expected_length]), (
+                f"Mask should be True for first {expected_length} positions in sentence {i}"
+            )
 
             # Remaining positions should be False (padding)
             if expected_length < 50:
-                assert torch.all(
-                    ~sentence_mask[expected_length:]
-                ), f"Mask should be False for padding positions after {expected_length} in sentence {i}"
+                assert torch.all(~sentence_mask[expected_length:]), (
+                    f"Mask should be False for padding positions after {expected_length} in sentence {i}"
+                )
 
             # Check that padded positions contain zeros
             if expected_length < 50:
                 padding_tokens = batch[i][expected_length:]
-                assert torch.all(
-                    padding_tokens == 0
-                ), f"Padding positions should be zero for sentence {i}, got {padding_tokens[:5].tolist()}..."
+                assert torch.all(padding_tokens == 0), (
+                    f"Padding positions should be zero for sentence {i}, got {padding_tokens[:5].tolist()}..."
+                )
 
             # Verify roundtrip with only valid tokens
             valid_tokens = batch[i][mask[i]]
@@ -436,16 +439,16 @@ def test_short_sequences_padding_and_mask(self, model):
             assert len(decoded_text) > 0
 
             # Check that no padding tokens were included in decoding
-            assert (
-                len(valid_tokens) == expected_length
-            ), f"Valid tokens length {len(valid_tokens)} should match expected {expected_length}"
+            assert len(valid_tokens) == expected_length, (
+                f"Valid tokens length {len(valid_tokens)} should match expected {expected_length}"
+            )
 
         # Additional verification: ensure no errors are raised
         # and the function handles short sequences gracefully
         try:
             # This should work without any issues
             next(text_dataset)
-            assert False, "Should have raised StopIteration since we exhausted the dataset"
+            raise AssertionError("Should have raised StopIteration since we exhausted the dataset")
         except StopIteration:
             # This is expected behavior
             pass
diff --git a/tests/test_topk.py b/tests/test_topk.py
index 2ce6e10..d89c6e0 100644
--- a/tests/test_topk.py
+++ b/tests/test_topk.py
@@ -244,6 +244,6 @@ def test_nnsight_compatibility(nnsight_model, topk_fixture, request):
         with nnsight_model.trace("test"):
             actvs = nnsight_model.transformer.h[0].mlp.output
             topk.to(actvs.device)
-            topk_result = topk(actvs)
+            topk(actvs)
     except Exception as e:
         pytest.fail(f"nnsight compatibility test failed for {topk_fixture}: {e}")
diff --git a/uv.lock b/uv.lock
index 5cd028c..6514728 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = "==3.12.*"
 resolution-markers = [
     "sys_platform == 'linux'",
@@ -357,6 +357,7 @@ dev = [
     { name = "ipykernel" },
     { name = "jupyterlab" },
     { name = "pytest" },
+    { name = "ruff" },
 ]
 
 [package.metadata]
@@ -383,6 +384,7 @@ dev = [
     { name = "ipykernel", specifier = ">=6.29.5" },
     { name = "jupyterlab", specifier = ">=4.4.2" },
     { name = "pytest", specifier = ">=8.4.1" },
+    { name = "ruff", specifier = ">=0.9.0" },
 ]
 
 [[package]]
@@ -1894,6 +1896,32 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/99/f2/c2d64f6564f32af913bf5f3f7ae41c7c263c5ae4c4e8f1a17af8af66cd46/rpds_py-0.25.1-cp312-cp312-win_arm64.whl", hash = "sha256:6d50841c425d16faf3206ddbba44c21aa3310a0cebc3c1cdfc3e3f4f9f6f5728", size = 225399, upload-time = "2025-05-21T12:43:53.351Z" },
 ]
 
+[[package]]
+name = "ruff"
+version = "0.14.11"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d4/77/9a7fe084d268f8855d493e5031ea03fa0af8cc05887f638bf1c4e3363eb8/ruff-0.14.11.tar.gz", hash = "sha256:f6dc463bfa5c07a59b1ff2c3b9767373e541346ea105503b4c0369c520a66958", size = 5993417, upload-time = "2026-01-08T19:11:58.322Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f0/a6/a4c40a5aaa7e331f245d2dc1ac8ece306681f52b636b40ef87c88b9f7afd/ruff-0.14.11-py3-none-linux_armv6l.whl", hash = "sha256:f6ff2d95cbd335841a7217bdfd9c1d2e44eac2c584197ab1385579d55ff8830e", size = 12951208, upload-time = "2026-01-08T19:12:09.218Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/5c/360a35cb7204b328b685d3129c08aca24765ff92b5a7efedbdd6c150d555/ruff-0.14.11-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6f6eb5c1c8033680f4172ea9c8d3706c156223010b8b97b05e82c59bdc774ee6", size = 13330075, upload-time = "2026-01-08T19:12:02.549Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/9e/0cc2f1be7a7d33cae541824cf3f95b4ff40d03557b575912b5b70273c9ec/ruff-0.14.11-py3-none-macosx_11_0_arm64.whl", hash = "sha256:f2fc34cc896f90080fca01259f96c566f74069a04b25b6205d55379d12a6855e", size = 12257809, upload-time = "2026-01-08T19:12:00.366Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/e5/5faab97c15bb75228d9f74637e775d26ac703cc2b4898564c01ab3637c02/ruff-0.14.11-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53386375001773ae812b43205d6064dae49ff0968774e6befe16a994fc233caa", size = 12678447, upload-time = "2026-01-08T19:12:13.899Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/33/e9767f60a2bef779fb5855cab0af76c488e0ce90f7bb7b8a45c8a2ba4178/ruff-0.14.11-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a697737dce1ca97a0a55b5ff0434ee7205943d4874d638fe3ae66166ff46edbe", size = 12758560, upload-time = "2026-01-08T19:11:42.55Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/84/4c6cf627a21462bb5102f7be2a320b084228ff26e105510cd2255ea868e5/ruff-0.14.11-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6845ca1da8ab81ab1dce755a32ad13f1db72e7fba27c486d5d90d65e04d17b8f", size = 13599296, upload-time = "2026-01-08T19:11:30.371Z" },
+    { url = "https://files.pythonhosted.org/packages/88/e1/92b5ed7ea66d849f6157e695dc23d5d6d982bd6aa8d077895652c38a7cae/ruff-0.14.11-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:e36ce2fd31b54065ec6f76cb08d60159e1b32bdf08507862e32f47e6dde8bcbf", size = 15048981, upload-time = "2026-01-08T19:12:04.742Z" },
+    { url = "https://files.pythonhosted.org/packages/61/df/c1bd30992615ac17c2fb64b8a7376ca22c04a70555b5d05b8f717163cf9f/ruff-0.14.11-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:590bcc0e2097ecf74e62a5c10a6b71f008ad82eb97b0a0079e85defe19fe74d9", size = 14633183, upload-time = "2026-01-08T19:11:40.069Z" },
+    { url = "https://files.pythonhosted.org/packages/04/e9/fe552902f25013dd28a5428a42347d9ad20c4b534834a325a28305747d64/ruff-0.14.11-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:53fe71125fc158210d57fe4da26e622c9c294022988d08d9347ec1cf782adafe", size = 14050453, upload-time = "2026-01-08T19:11:37.555Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/93/f36d89fa021543187f98991609ce6e47e24f35f008dfe1af01379d248a41/ruff-0.14.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a35c9da08562f1598ded8470fcfef2afb5cf881996e6c0a502ceb61f4bc9c8a3", size = 13757889, upload-time = "2026-01-08T19:12:07.094Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/9f/c7fb6ecf554f28709a6a1f2a7f74750d400979e8cd47ed29feeaa1bd4db8/ruff-0.14.11-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:0f3727189a52179393ecf92ec7057c2210203e6af2676f08d92140d3e1ee72c1", size = 13955832, upload-time = "2026-01-08T19:11:55.064Z" },
+    { url = "https://files.pythonhosted.org/packages/db/a0/153315310f250f76900a98278cf878c64dfb6d044e184491dd3289796734/ruff-0.14.11-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:eb09f849bd37147a789b85995ff734a6c4a095bed5fd1608c4f56afc3634cde2", size = 12586522, upload-time = "2026-01-08T19:11:35.356Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/2b/a73a2b6e6d2df1d74bf2b78098be1572191e54bec0e59e29382d13c3adc5/ruff-0.14.11-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:c61782543c1231bf71041461c1f28c64b961d457d0f238ac388e2ab173d7ecb7", size = 12724637, upload-time = "2026-01-08T19:11:47.796Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/41/09100590320394401cd3c48fc718a8ba71c7ddb1ffd07e0ad6576b3a3df2/ruff-0.14.11-py3-none-musllinux_1_2_i686.whl", hash = "sha256:82ff352ea68fb6766140381748e1f67f83c39860b6446966cff48a315c3e2491", size = 13145837, upload-time = "2026-01-08T19:11:32.87Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/d8/e035db859d1d3edf909381eb8ff3e89a672d6572e9454093538fe6f164b0/ruff-0.14.11-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:728e56879df4ca5b62a9dde2dd0eb0edda2a55160c0ea28c4025f18c03f86984", size = 13850469, upload-time = "2026-01-08T19:12:11.694Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/02/bb3ff8b6e6d02ce9e3740f4c17dfbbfb55f34c789c139e9cd91985f356c7/ruff-0.14.11-py3-none-win32.whl", hash = "sha256:337c5dd11f16ee52ae217757d9b82a26400be7efac883e9e852646f1557ed841", size = 12851094, upload-time = "2026-01-08T19:11:45.163Z" },
+    { url = "https://files.pythonhosted.org/packages/58/f1/90ddc533918d3a2ad628bc3044cdfc094949e6d4b929220c3f0eb8a1c998/ruff-0.14.11-py3-none-win_amd64.whl", hash = "sha256:f981cea63d08456b2c070e64b79cb62f951aa1305282974d4d5216e6e0178ae6", size = 14001379, upload-time = "2026-01-08T19:11:52.591Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/1c/1dbe51782c0e1e9cfce1d1004752672d2d4629ea46945d19d731ad772b3b/ruff-0.14.11-py3-none-win_arm64.whl", hash = "sha256:649fb6c9edd7f751db276ef42df1f3df41c38d67d199570ae2a7bd6cbc3590f0", size = 12938644, upload-time = "2026-01-08T19:11:50.027Z" },
+]
+
 [[package]]
 name = "safetensors"
 version = "0.5.3"