diff --git a/Env implementation.ipynb b/Env implementation.ipynb
new file mode 100644
index 0000000..7693985
--- /dev/null
+++ b/Env implementation.ipynb	
@@ -0,0 +1,677 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d782b3e7-70da-4b64-9433-ad37e9024052",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "import gym4real\n",
+    "import gymnasium as gym\n",
+    "from gymnasium import spaces\n",
+    "from gym4real.envs.wds.utils import parameter_generator\n",
+    "import wntr\n",
+    "import wntr.sim\n",
+    "from gym4real.envs.wds.reward_scaling_wrapper import RewardScalingWrapper\n",
+    "from dueling_dqn import DQN_Implementation, Double_DQN_Implementation, Dueling_DQN_Implementation, Distributional_DQN_Implementation\n",
+    "from Normalise import NormaliseObservation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "2fc21950-b80c-483a-92ba-f516048d13bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config_path = os.path.join(os.getcwd(), \"gym4real\", \"envs\", \"wds\", \"world_anytown.yaml\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "e9145afc-58a2-4c2e-a830-ce90638e41d9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/world_anytown.yaml'"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "config_path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "16c71143-c3da-46b0-bb54-3507d283ef9a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "base_params = parameter_generator(\n",
+    "    hydraulic_step=3600,\n",
+    "    duration=604800,\n",
+    "    seed=42,\n",
+    "    world_options=config_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "c631e482-afc9-4577-ab5c-b4144860d3e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Environment using SMA\n",
+    "base_params['demand_moving_average'] = True  # Turn on SMA \n",
+    "base_params['demand_exp_moving_average'] = False  # Turn off EMA "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "88dfc98b-d0ef-475d-a522-cd25478418d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def make_env():\n",
+    "    env = gym.make('gym4real/wds-v0', settings=base_params)\n",
+    "    env = RewardScalingWrapper(env)\n",
+    "    env = NormaliseObservation(env)\n",
+    "    return env"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "992a51a0-e773-4a28-8aa3-00ab6b59799f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/anaconda3/envs/rl-env/lib/python3.12/site-packages/gymnasium/spaces/box.py:235: UserWarning: \u001b[33mWARN: Box low's precision lowered by casting to float32, current low.dtype=float64\u001b[0m\n",
+      "  gym.logger.warn(\n",
+      "/opt/anaconda3/envs/rl-env/lib/python3.12/site-packages/gymnasium/spaces/box.py:305: UserWarning: \u001b[33mWARN: Box high's precision lowered by casting to float32, current high.dtype=float64\u001b[0m\n",
+      "  gym.logger.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "dqn_sma_env = make_env()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "0b3d6e1c-d1da-449a-96ad-2ceb5c78315d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# dueling_sma_model = Dueling_DQN_Implementation(\n",
+    "#     env=dqn_sma_env,\n",
+    "#     learning_rate=3e-4,\n",
+    "#     tensorboard_log=\"./wds_dueling_logs/\"\n",
+    "# )\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "e114f6d5-18f1-4c0d-910d-4900fdc27eea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dist_sma_model = Distributional_DQN_Implementation(\n",
+    "    env=dqn_sma_env,\n",
+    "    learning_rate=3e-4,\n",
+    "    tensorboard_log=\"./wds_dist_logs/\",\n",
+    "    n_atoms=51,\n",
+    "    v_min=-10,\n",
+    "    v_max=10\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "7384c050-22bc-47f9-a39b-d7a38e6a8179",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Train for 200000 steps | device=cpu\n",
+      "Resetting the environment...\n",
+      "Step: 1111 | Episode Reward: 153.997 | Epsilon: 1.000\n",
+      "Resetting the environment...\n",
+      "Step: 2282 | Episode Reward: 151.169 | Epsilon: 0.995\n",
+      "Resetting the environment...\n",
+      "Step: 3408 | Episode Reward: 151.317 | Epsilon: 0.990\n",
+      "Resetting the environment...\n",
+      "Step: 4545 | Episode Reward: 153.626 | Epsilon: 0.985\n",
+      "Resetting the environment...\n",
+      "Step: 5693 | Episode Reward: 153.360 | Epsilon: 0.980\n",
+      "Resetting the environment...\n",
+      "Step: 6842 | Episode Reward: 148.326 | Epsilon: 0.975\n",
+      "Resetting the environment...\n",
+      "Step: 8000 | Episode Reward: 152.249 | Epsilon: 0.970\n",
+      "Resetting the environment...\n",
+      "Step: 9242 | Episode Reward: 133.343 | Epsilon: 0.966\n",
+      "Resetting the environment...\n",
+      "Step: 10433 | Episode Reward: 146.349 | Epsilon: 0.961\n",
+      "Resetting the environment...\n",
+      "Step: 11503 | Episode Reward: 132.059 | Epsilon: 0.956\n",
+      "Resetting the environment...\n",
+      "Step: 12634 | Episode Reward: 152.668 | Epsilon: 0.951\n",
+      "Resetting the environment...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n",
+      "  warnings.warn(self.ENgeterror(ierr))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Step: 13794 | Episode Reward: 140.755 | Epsilon: 0.946\n",
+      "Resetting the environment...\n",
+      "Step: 14950 | Episode Reward: 155.198 | Epsilon: 0.942\n",
+      "Resetting the environment...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n",
+      "  warnings.warn(self.ENgeterror(ierr))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Step: 16071 | Episode Reward: 144.100 | Epsilon: 0.937\n",
+      "Resetting the environment...\n",
+      "Step: 17212 | Episode Reward: 143.889 | Epsilon: 0.932\n",
+      "Resetting the environment...\n",
+      "Step: 18652 | Episode Reward: 109.176 | Epsilon: 0.928\n",
+      "Resetting the environment...\n",
+      "Step: 19772 | Episode Reward: 147.424 | Epsilon: 0.923\n",
+      "Resetting the environment...\n",
+      "Step: 20861 | Episode Reward: 160.615 | Epsilon: 0.918\n",
+      "Resetting the environment...\n",
+      "Step: 21947 | Episode Reward: 160.407 | Epsilon: 0.914\n",
+      "Resetting the environment...\n",
+      "Step: 23080 | Episode Reward: 148.234 | Epsilon: 0.909\n",
+      "Resetting the environment...\n",
+      "Step: 24235 | Episode Reward: 146.665 | Epsilon: 0.905\n",
+      "Resetting the environment...\n",
+      "Step: 25325 | Episode Reward: 160.068 | Epsilon: 0.900\n",
+      "Resetting the environment...\n",
+      "Step: 26565 | Episode Reward: 131.963 | Epsilon: 0.896\n",
+      "Resetting the environment...\n",
+      "Step: 27670 | Episode Reward: 159.023 | Epsilon: 0.891\n",
+      "Resetting the environment...\n",
+      "Step: 28764 | Episode Reward: 151.119 | Epsilon: 0.887\n",
+      "Resetting the environment...\n",
+      "Step: 29858 | Episode Reward: 162.179 | Epsilon: 0.882\n",
+      "Resetting the environment...\n",
+      "Step: 31007 | Episode Reward: 152.638 | Epsilon: 0.878\n",
+      "Resetting the environment...\n",
+      "Step: 32537 | Episode Reward: 111.251 | Epsilon: 0.873\n",
+      "Resetting the environment...\n",
+      "Step: 34091 | Episode Reward: 99.022 | Epsilon: 0.869\n",
+      "Resetting the environment...\n",
+      "Step: 35679 | Episode Reward: 107.237 | Epsilon: 0.865\n",
+      "Resetting the environment...\n",
+      "Step: 36792 | Episode Reward: 152.692 | Epsilon: 0.860\n",
+      "Resetting the environment...\n",
+      "Step: 37896 | Episode Reward: 159.965 | Epsilon: 0.856\n",
+      "Resetting the environment...\n",
+      "Step: 39000 | Episode Reward: 150.952 | Epsilon: 0.852\n",
+      "Resetting the environment...\n",
+      "Step: 40143 | Episode Reward: 153.381 | Epsilon: 0.848\n",
+      "Resetting the environment...\n",
+      "Step: 41290 | Episode Reward: 152.382 | Epsilon: 0.843\n",
+      "Resetting the environment...\n",
+      "Step: 42435 | Episode Reward: 158.051 | Epsilon: 0.839\n",
+      "Resetting the environment...\n",
+      "Step: 44044 | Episode Reward: 99.158 | Epsilon: 0.835\n",
+      "Resetting the environment...\n",
+      "Step: 45131 | Episode Reward: 147.270 | Epsilon: 0.831\n",
+      "Resetting the environment...\n",
+      "Step: 46194 | Episode Reward: 159.870 | Epsilon: 0.827\n",
+      "Resetting the environment...\n",
+      "Step: 47297 | Episode Reward: 160.283 | Epsilon: 0.822\n",
+      "Resetting the environment...\n",
+      "Step: 48430 | Episode Reward: 147.430 | Epsilon: 0.818\n",
+      "Resetting the environment...\n",
+      "Step: 49523 | Episode Reward: 152.502 | Epsilon: 0.814\n",
+      "Resetting the environment...\n",
+      "Step: 51084 | Episode Reward: 103.382 | Epsilon: 0.810\n",
+      "Resetting the environment...\n",
+      "Step: 52200 | Episode Reward: 152.779 | Epsilon: 0.806\n",
+      "Resetting the environment...\n",
+      "Step: 53297 | Episode Reward: 155.141 | Epsilon: 0.802\n",
+      "Resetting the environment...\n",
+      "Step: 54411 | Episode Reward: 151.961 | Epsilon: 0.798\n",
+      "Resetting the environment...\n",
+      "Step: 55521 | Episode Reward: 153.797 | Epsilon: 0.794\n",
+      "Resetting the environment...\n",
+      "Step: 56610 | Episode Reward: 163.142 | Epsilon: 0.790\n",
+      "Resetting the environment...\n",
+      "Step: 57731 | Episode Reward: 149.497 | Epsilon: 0.786\n",
+      "Resetting the environment...\n",
+      "Step: 59046 | Episode Reward: 132.982 | Epsilon: 0.782\n",
+      "Resetting the environment...\n",
+      "Step: 60135 | Episode Reward: 152.386 | Epsilon: 0.778\n",
+      "Resetting the environment...\n",
+      "Step: 61243 | Episode Reward: 154.186 | Epsilon: 0.774\n",
+      "Resetting the environment...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n",
+      "  warnings.warn(self.ENgeterror(ierr))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Step: 62886 | Episode Reward: 98.508 | Epsilon: 0.771\n",
+      "Resetting the environment...\n",
+      "Step: 64085 | Episode Reward: 162.620 | Epsilon: 0.767\n",
+      "Resetting the environment...\n",
+      "Step: 65178 | Episode Reward: 153.998 | Epsilon: 0.763\n",
+      "Resetting the environment...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n",
+      "  warnings.warn(self.ENgeterror(ierr))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Step: 66793 | Episode Reward: 103.972 | Epsilon: 0.759\n",
+      "Resetting the environment...\n",
+      "Step: 67840 | Episode Reward: 161.439 | Epsilon: 0.755\n",
+      "Resetting the environment...\n",
+      "Step: 69390 | Episode Reward: 112.187 | Epsilon: 0.751\n",
+      "Resetting the environment...\n",
+      "Step: 70459 | Episode Reward: 161.083 | Epsilon: 0.748\n",
+      "Resetting the environment...\n",
+      "Step: 71525 | Episode Reward: 151.796 | Epsilon: 0.744\n",
+      "Resetting the environment...\n",
+      "Step: 72633 | Episode Reward: 155.591 | Epsilon: 0.740\n",
+      "Resetting the environment...\n",
+      "Step: 73738 | Episode Reward: 140.940 | Epsilon: 0.737\n",
+      "Resetting the environment...\n",
+      "Step: 74829 | Episode Reward: 143.928 | Epsilon: 0.733\n",
+      "Resetting the environment...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n",
+      "  warnings.warn(self.ENgeterror(ierr))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Step: 76395 | Episode Reward: 101.345 | Epsilon: 0.729\n",
+      "Resetting the environment...\n",
+      "Step: 77607 | Episode Reward: 142.561 | Epsilon: 0.726\n",
+      "Resetting the environment...\n",
+      "Step: 78815 | Episode Reward: 141.396 | Epsilon: 0.722\n",
+      "Resetting the environment...\n",
+      "Step: 79853 | Episode Reward: 163.146 | Epsilon: 0.718\n",
+      "Resetting the environment...\n",
+      "Step: 80942 | Episode Reward: 161.397 | Epsilon: 0.715\n",
+      "Resetting the environment...\n",
+      "Step: 82028 | Episode Reward: 146.968 | Epsilon: 0.711\n",
+      "Resetting the environment...\n",
+      "Step: 83128 | Episode Reward: 153.940 | Epsilon: 0.708\n",
+      "Resetting the environment...\n",
+      "Step: 84341 | Episode Reward: 142.633 | Epsilon: 0.704\n",
+      "Resetting the environment...\n",
+      "Step: 85518 | Episode Reward: 160.601 | Epsilon: 0.701\n",
+      "Resetting the environment...\n",
+      "Step: 86619 | Episode Reward: 154.950 | Epsilon: 0.697\n",
+      "Resetting the environment...\n",
+      "Step: 87728 | Episode Reward: 151.248 | Epsilon: 0.694\n",
+      "Resetting the environment...\n",
+      "Step: 88824 | Episode Reward: 156.445 | Epsilon: 0.690\n",
+      "Resetting the environment...\n",
+      "Step: 89858 | Episode Reward: 163.781 | Epsilon: 0.687\n",
+      "Resetting the environment...\n",
+      "Step: 91035 | Episode Reward: 148.585 | Epsilon: 0.683\n",
+      "Resetting the environment...\n",
+      "Step: 92108 | Episode Reward: 162.602 | Epsilon: 0.680\n",
+      "Resetting the environment...\n",
+      "Step: 93187 | Episode Reward: 144.714 | Epsilon: 0.676\n",
+      "Resetting the environment...\n",
+      "Step: 94250 | Episode Reward: 162.002 | Epsilon: 0.673\n",
+      "Resetting the environment...\n",
+      "Step: 95763 | Episode Reward: 115.001 | Epsilon: 0.670\n",
+      "Resetting the environment...\n",
+      "Step: 96822 | Episode Reward: 162.466 | Epsilon: 0.666\n",
+      "Resetting the environment...\n",
+      "Step: 97932 | Episode Reward: 155.504 | Epsilon: 0.663\n",
+      "Resetting the environment...\n",
+      "Step: 99028 | Episode Reward: 156.306 | Epsilon: 0.660\n",
+      "Resetting the environment...\n",
+      "Step: 100099 | Episode Reward: 157.606 | Epsilon: 0.656\n",
+      "Resetting the environment...\n",
+      "Step: 101170 | Episode Reward: 156.524 | Epsilon: 0.653\n",
+      "Resetting the environment...\n",
+      "Step: 102249 | Episode Reward: 156.795 | Epsilon: 0.650\n",
+      "Resetting the environment...\n",
+      "Step: 103308 | Episode Reward: 152.345 | Epsilon: 0.647\n",
+      "Resetting the environment...\n",
+      "Step: 104405 | Episode Reward: 146.136 | Epsilon: 0.643\n",
+      "Resetting the environment...\n",
+      "Step: 105506 | Episode Reward: 151.228 | Epsilon: 0.640\n",
+      "Resetting the environment...\n",
+      "Step: 106586 | Episode Reward: 157.143 | Epsilon: 0.637\n",
+      "Resetting the environment...\n",
+      "Step: 107687 | Episode Reward: 155.193 | Epsilon: 0.634\n",
+      "Resetting the environment...\n",
+      "Step: 108745 | Episode Reward: 163.514 | Epsilon: 0.631\n",
+      "Resetting the environment...\n",
+      "Step: 109821 | Episode Reward: 157.198 | Epsilon: 0.627\n",
+      "Resetting the environment...\n",
+      "Step: 110888 | Episode Reward: 142.207 | Epsilon: 0.624\n",
+      "Resetting the environment...\n",
+      "Step: 111970 | Episode Reward: 157.823 | Epsilon: 0.621\n",
+      "Resetting the environment...\n",
+      "Step: 113059 | Episode Reward: 151.426 | Epsilon: 0.618\n",
+      "Resetting the environment...\n",
+      "Step: 114143 | Episode Reward: 153.774 | Epsilon: 0.615\n",
+      "Resetting the environment...\n",
+      "Step: 115263 | Episode Reward: 159.760 | Epsilon: 0.612\n",
+      "Resetting the environment...\n",
+      "Step: 116315 | Episode Reward: 160.141 | Epsilon: 0.609\n",
+      "Resetting the environment...\n",
+      "Step: 117375 | Episode Reward: 145.956 | Epsilon: 0.606\n",
+      "Resetting the environment...\n",
+      "Step: 118520 | Episode Reward: 152.579 | Epsilon: 0.603\n",
+      "Resetting the environment...\n",
+      "Step: 119615 | Episode Reward: 162.900 | Epsilon: 0.600\n",
+      "Resetting the environment...\n",
+      "Step: 120710 | Episode Reward: 159.072 | Epsilon: 0.597\n",
+      "Resetting the environment...\n",
+      "Step: 122248 | Episode Reward: 114.978 | Epsilon: 0.594\n",
+      "Resetting the environment...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n",
+      "  warnings.warn(self.ENgeterror(ierr))\n",
+      "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: System may be hydraulically unstable.\n",
+      "  warnings.warn(self.ENgeterror(ierr))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Step: 123742 | Episode Reward: 105.305 | Epsilon: 0.591\n",
+      "Resetting the environment...\n",
+      "Step: 124838 | Episode Reward: 153.693 | Epsilon: 0.588\n",
+      "Resetting the environment...\n",
+      "Step: 125927 | Episode Reward: 156.106 | Epsilon: 0.585\n",
+      "Resetting the environment...\n",
+      "Step: 126972 | Episode Reward: 163.293 | Epsilon: 0.582\n",
+      "Resetting the environment...\n",
+      "Step: 128030 | Episode Reward: 158.597 | Epsilon: 0.579\n",
+      "Resetting the environment...\n",
+      "Step: 129064 | Episode Reward: 163.821 | Epsilon: 0.576\n",
+      "Resetting the environment...\n",
+      "Step: 130153 | Episode Reward: 161.117 | Epsilon: 0.573\n",
+      "Resetting the environment...\n",
+      "Step: 131246 | Episode Reward: 154.042 | Epsilon: 0.570\n",
+      "Resetting the environment...\n",
+      "Step: 132334 | Episode Reward: 157.167 | Epsilon: 0.568\n",
+      "Resetting the environment...\n",
+      "Step: 133400 | Episode Reward: 152.942 | Epsilon: 0.565\n",
+      "Resetting the environment...\n",
+      "Step: 134676 | Episode Reward: 144.907 | Epsilon: 0.562\n",
+      "Resetting the environment...\n",
+      "Step: 135752 | Episode Reward: 157.574 | Epsilon: 0.559\n",
+      "Resetting the environment...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n",
+      "  warnings.warn(self.ENgeterror(ierr))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Step: 136918 | Episode Reward: 143.511 | Epsilon: 0.556\n",
+      "Resetting the environment...\n",
+      "Step: 137993 | Episode Reward: 158.570 | Epsilon: 0.554\n",
+      "Resetting the environment...\n",
+      "Step: 139027 | Episode Reward: 164.542 | Epsilon: 0.551\n",
+      "Resetting the environment...\n",
+      "Step: 140077 | Episode Reward: 161.509 | Epsilon: 0.548\n",
+      "Resetting the environment...\n",
+      "Step: 141478 | Episode Reward: 119.271 | Epsilon: 0.545\n",
+      "Resetting the environment...\n",
+      "Step: 142565 | Episode Reward: 165.614 | Epsilon: 0.543\n",
+      "Resetting the environment...\n",
+      "Step: 143643 | Episode Reward: 150.953 | Epsilon: 0.540\n",
+      "Resetting the environment...\n",
+      "Step: 144710 | Episode Reward: 160.797 | Epsilon: 0.537\n",
+      "Resetting the environment...\n",
+      "Step: 145783 | Episode Reward: 154.920 | Epsilon: 0.534\n",
+      "Resetting the environment...\n",
+      "Step: 146812 | Episode Reward: 166.769 | Epsilon: 0.532\n",
+      "Resetting the environment...\n",
+      "Step: 147836 | Episode Reward: 161.131 | Epsilon: 0.529\n",
+      "Resetting the environment...\n",
+      "Step: 149294 | Episode Reward: 119.741 | Epsilon: 0.526\n",
+      "Resetting the environment...\n",
+      "Step: 150353 | Episode Reward: 145.694 | Epsilon: 0.524\n",
+      "Resetting the environment...\n",
+      "Step: 151429 | Episode Reward: 150.523 | Epsilon: 0.521\n",
+      "Resetting the environment...\n",
+      "Step: 152482 | Episode Reward: 161.591 | Epsilon: 0.519\n",
+      "Resetting the environment...\n",
+      "Step: 153548 | Episode Reward: 156.874 | Epsilon: 0.516\n",
+      "Resetting the environment...\n",
+      "Step: 154621 | Episode Reward: 154.044 | Epsilon: 0.513\n",
+      "Resetting the environment...\n",
+      "Step: 155681 | Episode Reward: 164.463 | Epsilon: 0.511\n",
+      "Resetting the environment...\n",
+      "Step: 156725 | Episode Reward: 164.537 | Epsilon: 0.508\n",
+      "Resetting the environment...\n",
+      "Step: 157748 | Episode Reward: 164.193 | Epsilon: 0.506\n",
+      "Resetting the environment...\n",
+      "Step: 158773 | Episode Reward: 166.099 | Epsilon: 0.503\n",
+      "Resetting the environment...\n",
+      "Step: 159859 | Episode Reward: 161.974 | Epsilon: 0.501\n",
+      "Resetting the environment...\n",
+      "Step: 160914 | Episode Reward: 163.825 | Epsilon: 0.498\n",
+      "Resetting the environment...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n",
+      "  warnings.warn(self.ENgeterror(ierr))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Step: 162276 | Episode Reward: 112.850 | Epsilon: 0.496\n",
+      "Resetting the environment...\n",
+      "Step: 163324 | Episode Reward: 158.721 | Epsilon: 0.493\n",
+      "Resetting the environment...\n",
+      "Step: 164401 | Episode Reward: 162.109 | Epsilon: 0.491\n",
+      "Resetting the environment...\n",
+      "Step: 165435 | Episode Reward: 155.681 | Epsilon: 0.488\n",
+      "Resetting the environment...\n",
+      "Step: 166822 | Episode Reward: 119.148 | Epsilon: 0.486\n",
+      "Resetting the environment...\n",
+      "Step: 167884 | Episode Reward: 153.597 | Epsilon: 0.483\n",
+      "Resetting the environment...\n",
+      "Step: 168911 | Episode Reward: 158.481 | Epsilon: 0.481\n",
+      "Resetting the environment...\n",
+      "Step: 169952 | Episode Reward: 162.755 | Epsilon: 0.479\n",
+      "Resetting the environment...\n",
+      "Step: 171002 | Episode Reward: 157.843 | Epsilon: 0.476\n",
+      "Resetting the environment...\n",
+      "Step: 172050 | Episode Reward: 162.917 | Epsilon: 0.474\n",
+      "Resetting the environment...\n",
+      "Step: 173112 | Episode Reward: 159.995 | Epsilon: 0.471\n",
+      "Resetting the environment...\n",
+      "Step: 174359 | Episode Reward: 144.803 | Epsilon: 0.469\n",
+      "Resetting the environment...\n",
+      "Step: 175443 | Episode Reward: 153.168 | Epsilon: 0.467\n",
+      "Resetting the environment...\n",
+      "Step: 176495 | Episode Reward: 150.665 | Epsilon: 0.464\n",
+      "Resetting the environment...\n",
+      "Step: 177547 | Episode Reward: 156.770 | Epsilon: 0.462\n",
+      "Resetting the environment...\n",
+      "Step: 178588 | Episode Reward: 165.277 | Epsilon: 0.460\n",
+      "Resetting the environment...\n",
+      "Step: 179612 | Episode Reward: 163.793 | Epsilon: 0.458\n",
+      "Resetting the environment...\n",
+      "Step: 180638 | Episode Reward: 163.216 | Epsilon: 0.455\n",
+      "Resetting the environment...\n",
+      "Step: 181695 | Episode Reward: 154.532 | Epsilon: 0.453\n",
+      "Resetting the environment...\n",
+      "Step: 182951 | Episode Reward: 143.096 | Epsilon: 0.451\n",
+      "Resetting the environment...\n",
+      "Step: 184021 | Episode Reward: 153.395 | Epsilon: 0.448\n",
+      "Resetting the environment...\n",
+      "Step: 185066 | Episode Reward: 164.928 | Epsilon: 0.446\n",
+      "Resetting the environment...\n",
+      "Step: 186137 | Episode Reward: 163.071 | Epsilon: 0.444\n",
+      "Resetting the environment...\n",
+      "Step: 187301 | Episode Reward: 152.901 | Epsilon: 0.442\n",
+      "Resetting the environment...\n",
+      "Step: 188349 | Episode Reward: 158.262 | Epsilon: 0.440\n",
+      "Resetting the environment...\n",
+      "Step: 189421 | Episode Reward: 162.564 | Epsilon: 0.437\n",
+      "Resetting the environment...\n",
+      "Step: 190614 | Episode Reward: 146.785 | Epsilon: 0.435\n",
+      "Resetting the environment...\n",
+      "Step: 191643 | Episode Reward: 163.195 | Epsilon: 0.433\n",
+      "Resetting the environment...\n",
+      "Step: 192686 | Episode Reward: 158.665 | Epsilon: 0.431\n",
+      "Resetting the environment...\n",
+      "Step: 193749 | Episode Reward: 151.001 | Epsilon: 0.429\n",
+      "Resetting the environment...\n",
+      "Step: 194765 | Episode Reward: 164.780 | Epsilon: 0.427\n",
+      "Resetting the environment...\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n",
+      "  warnings.warn(self.ENgeterror(ierr))\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Step: 195802 | Episode Reward: 154.002 | Epsilon: 0.424\n",
+      "Resetting the environment...\n",
+      "Step: 196843 | Episode Reward: 156.423 | Epsilon: 0.422\n",
+      "Resetting the environment...\n",
+      "Step: 197891 | Episode Reward: 152.905 | Epsilon: 0.420\n",
+      "Resetting the environment...\n",
+      "Step: 198920 | Episode Reward: 160.201 | Epsilon: 0.418\n",
+      "Resetting the environment...\n",
+      "Training finished\n",
+      "Saving model to distributional-dqn-normalisation-sma.zip\n",
+      "Model saved\n"
+     ]
+    }
+   ],
+   "source": [
+    "dist_sma_model.learn(total_timesteps=200000)\n",
+    "dist_sma_model.save(\"distributional-dqn-normalisation-sma\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "010c0bfc-d2e7-4fd1-bbab-f5d3cc18325e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Normalise.py b/Normalise.py
new file mode 100644
index 0000000..768e057
--- /dev/null
+++ b/Normalise.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+# Credits:  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
+#           https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
+#
+# Inspiration: https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/vec_env/vec_normalize.py
+
+# Core idea is:
+# Keep running statistics of observations. Before feeding the observations to the network.
+
+# xhat = (x - μ) . (σ^2 + ε)^(-1/2)
+# ^, is just z score norm with online statistics
+
+import numpy as np
+import gymnasium as gym
+
+'''
+
+Draft before I do things:
+
+m_a = self.var * self.count  # sum of squared deviations (old)
+m_b = batch_var * batch_count  # sum of squared deviations (new)
+M2 = m_a + m_b + delta**2 * self.count * batch_count / total_count
+self.var = M2 / total_count
+
+'''
+
+class RunningMeanStd:
+
+    '''
+        Goal: Track running mean and variance using Welford's parallel online algorithm
+    '''
+
+    def __init__(self, shape=()):
+        self.mean = np.zeros(shape, dtype=np.float64)
+        self.var = np.ones(shape, dtype=np.float64)
+        self.count = 1e-4 # Div by 0 bad, so initially setting it to this
+
+    # Update the stats with a batch of samples
+    def update(self, x):
+        x = np.asarray(x)
+        if x.ndim == 1:
+            x = x.reshape(1, -1) # As its a single obs, itd be a batch of 1
+
+        batch_mean = x.mean(axis=0)
+        batch_var = x.var(axis=0)
+        batch_count = x.shape[0]
+
+        self.update_from_moments(batch_mean, batch_var, batch_count)
+
+    # Chan's parallel algorithm for combining statistics.
+    def update_from_moments(self, batch_mean, batch_var, batch_count):
+        delta = batch_mean - self.mean
+        total_count = self.count + batch_count
+
+        self.mean = self.mean + delta * batch_count / total_count # New mea being the weighted combination
+
+        # New var being the parallel algorithm
+        m_a = self.var * self.count
+        m_b = batch_var * batch_count
+        M2 = m_a + m_b + (delta ** 2) * self.count * batch_count / total_count # Sum of squared deviations from mean!
+        self.var = M2 / total_count
+
+        self.count = total_count
+
+    def normalise(self, x):
+        return (x - self.mean) / np.sqrt(self.var + 1e-8) # z-score norm
+
+
+class NormaliseObservation(gym.Wrapper):
+    '''
+        Gym wrapper for obs norm using running statistics.
+        
+        Aim:
+            During training: updates stats and normalises
+            During evaluation: normalises only (set training=False)
+    '''
+
+    def __init__(self, env):
+        super().__init__(env)
+        self.rms = RunningMeanStd(shape=env.observation_space.shape)
+        self.training = True
+
+    def step(self, action):
+        obs, reward, terminated, truncated, info = self.env.step(action)
+        if self.training:
+            self.rms.update(obs)
+        return self.rms.normalise(obs), reward, terminated, truncated, info
+
+    def reset(self, **kwargs):
+        obs, info = self.env.reset(**kwargs)
+        if self.training:
+            self.rms.update(obs)
+        return self.rms.normalise(obs), info
\ No newline at end of file
diff --git a/distributional-dqn-normalisation-ema.zip b/distributional-dqn-normalisation-ema.zip
new file mode 100644
index 0000000..d95354b
Binary files /dev/null and b/distributional-dqn-normalisation-ema.zip differ
diff --git a/distributional-dqn-normalisation-sma.zip b/distributional-dqn-normalisation-sma.zip
new file mode 100644
index 0000000..3a26381
Binary files /dev/null and b/distributional-dqn-normalisation-sma.zip differ
diff --git a/dueling-dqn-normalisation-ema.zip b/dueling-dqn-normalisation-ema.zip
new file mode 100644
index 0000000..efa43da
Binary files /dev/null and b/dueling-dqn-normalisation-ema.zip differ
diff --git a/dueling-dqn-normalisation-sma.zip b/dueling-dqn-normalisation-sma.zip
new file mode 100644
index 0000000..5e1ca8c
Binary files /dev/null and b/dueling-dqn-normalisation-sma.zip differ
diff --git a/dueling-dqn-sma.zip b/dueling-dqn-sma.zip
new file mode 100644
index 0000000..9039d67
Binary files /dev/null and b/dueling-dqn-sma.zip differ
diff --git a/dueling_dqn.py b/dueling_dqn.py
new file mode 100644
index 0000000..d3e6364
--- /dev/null
+++ b/dueling_dqn.py
@@ -0,0 +1,662 @@
+import torch
+import torch.nn as nn
+import torch.optim as optim
+import random
+from collections import deque
+import numpy as np
+import os
+import io
+import zipfile
+from torch.utils.tensorboard import SummaryWriter
+
+# Following code implements a DQN and DDQN agent from scratch
+# DDQN Class uses inheritance from original DQN Class
+# Uses epsilon decay, more random and exploratory behaviour initially and eventually follows experience 
+
+class Neural_Network(nn.Module):
+    
+    # Create Q-Network which takes in states and outputs state-action pairs
+    def __init__(self, state_dimension, action_dimension, hidden=128):
+        super(Neural_Network, self).__init__()
+        self.net = nn.Sequential(
+            nn.Linear(state_dimension, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, action_dimension))
+        
+    # Calculate Forward pass prediction by network
+    def forward(self, x):
+        return self.net(x)
+
+import torch
+import torch.nn as nn
+
+class Dueling_Neural_Network(nn.Module):
+    """
+    Dueling architecture:
+    Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a))
+    """
+    def __init__(self, state_dimension, action_dimension, hidden=128):
+        super(Dueling_Neural_Network, self).__init__()
+
+        # shared feature extractor
+        self.feature = nn.Sequential(
+            nn.Linear(state_dimension, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, hidden),
+            nn.ReLU(),
+        )
+
+        # value stream outputs scalar V(s)
+        self.value_stream = nn.Sequential(
+            nn.Linear(hidden, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, 1),
+        )
+
+        # advantage stream outputs A(s,a) for each action
+        self.adv_stream = nn.Sequential(
+            nn.Linear(hidden, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, action_dimension),
+        )
+
+    def forward(self, x):
+        features = self.feature(x)
+        values = self.value_stream(features)          # shape: (B, 1)
+        advantages = self.adv_stream(features)        # shape: (B, A)
+
+        # subtract mean advantage for identifiability
+        advantages = advantages - advantages.mean(dim=1, keepdim=True)
+
+        q_values = values + advantages                # broadcast (B,1) + (B,A)
+        return q_values
+
+
+class Replay_Buffer:
+
+    # Initialise buffer to have max size N. Any new experiences will replace old experiences if N > max_size
+    def __init__(self, N):
+        self.buffer = deque(maxlen=N)
+
+    # Add experience to replay buffer
+    def store_experience(self, state, action, reward, next_state, done):
+        self.buffer.append((state, action, reward, next_state, done))
+
+    # Create random sample batch of experience from replay buffer (size = bath_size)
+    def mini_batch(self, batch_size):
+        batch = random.sample(self.buffer, batch_size)
+        state, action, reward, next_state, done = zip(*batch)
+        return (np.array(state), np.array(action), np.array(reward), np.array(next_state), np.array(done))
+
+    def __len__(self):
+        return len(self.buffer)
+
+class DQN_Implementation:
+    
+    def __init__(self, env, learning_rate=3e-4, buffer_size=50000, batch_size=64, gamma=0.99, tensorboard_log=None):
+        # Initialise parameters env, lr, batch_size, gamma and tensorboard_logs for metrics
+        # Buffer Size of 50000 allows for more episodes to be stored, helps with learning unexpected events
+        self.env = env
+        self.learning_rate = learning_rate
+        self.buffer_size= buffer_size
+        self.batch_size = batch_size
+        self.gamma = gamma
+        self.tensorboard_log = tensorboard_log
+
+        # Hyperparameters, using epsilon decay to improve learning over episodes
+        # Initially agent acts completely random, as training increases follows optimal policy as experience increases
+        self.epsilon = 1.0
+        self.end_epsilon = 0.05
+        self.decay_rate = 0.005
+        self.update_frequency = 1000 
+        self.steps_completed = 0
+
+        self.experience_memory = Replay_Buffer(buffer_size)  # Initialise Replay Buffer
+        
+        self.state_dimension = env.observation_space.shape[0]  # State Space Dimension
+        self.action_dimension = env.action_space.n  # Action Space Dimension
+
+        self.policy_network = Neural_Network(self.state_dimension, self.action_dimension)  # Initialise policy action-value network
+        self.target_network = Neural_Network(self.state_dimension, self.action_dimension)  # Initialise target action-value network
+        
+        self.target_network.load_state_dict(self.policy_network.state_dict())  # target network has same initial parmaeters as policy network
+        self.target_network.eval() 
+
+        self.optim = optim.Adam(self.policy_network.parameters(), lr=self.learning_rate)  # Initialise Adam Optimizer for LR
+
+        # Initialise Logging for Tensorboard
+        self.writer = None
+        if self.tensorboard_log:
+            os.makedirs(self.tensorboard_log, exist_ok=True)
+            self.writer = SummaryWriter(log_dir=self.tensorboard_log)
+
+    def action(self, state):
+        if random.random() < self.epsilon:  # Randomly select action if prob < epsilon
+            return random.randrange(self.action_dimension)
+        else:  # otherwise choose action to maximise q (epsilon greedy)
+            with torch.no_grad():  # To improve speed and memory for PyTorch
+                q_values = self.policy_network(torch.FloatTensor(state).unsqueeze(0))
+                return q_values.argmax().item()
+
+    def update_network(self):
+        if len(self.experience_memory) < self.batch_size:
+            return None
+
+        states, actions, rewards, next_states, dones = self.experience_memory.mini_batch(self.batch_size)
+
+        states = torch.FloatTensor(states)
+        actions = torch.LongTensor(actions).unsqueeze(1)
+        rewards = torch.FloatTensor(rewards).unsqueeze(1)
+        next_states = torch.FloatTensor(next_states)
+        dones = torch.FloatTensor(dones).unsqueeze(1)
+
+        # Calculate Policy Network Q-Values Q(s, a)
+        policy_q = self.policy_network(states).gather(1, actions)
+
+        with torch.no_grad():
+            # Calculate Target Network Q-Values using Bellman Equation; if terminal state then target_q = rewards
+            target_q = rewards + (self.gamma * self.target_network(next_states).max(1)[0].unsqueeze(1) * (1 - dones))
+
+        huber_loss = nn.HuberLoss(delta=1.0)(policy_q, target_q)  # We normalise rewards with an upper bound of 1 hence delta of 1.0
+        
+        # Weights only updated for Policy Network using gradient descent, not for Target Network
+        self.optim.zero_grad()
+        huber_loss.backward()
+        self.optim.step()
+        
+        return huber_loss.item()
+
+    def learn(self, total_timesteps):
+        print(f"Train for {total_timesteps} steps")
+        state, _ = self.env.reset()
+        episode_reward, episode_count = 0, 0
+        
+        for step in range(1, total_timesteps + 1):
+            self.steps_completed += 1
+            
+            action = self.action(state)
+            
+            next_state, reward, terminated, truncated, _ = self.env.step(action)
+            episode_reward += reward
+            
+            flag = terminated
+            # Store experience in Replay Buffer
+            self.experience_memory.store_experience(state, action, reward, next_state, flag)
+            
+            state = next_state
+
+            loss_val = self.update_network()
+            # If episode finishes or terminates after special event (overflow)
+            if terminated or truncated:
+                if self.writer:
+                    self.writer.add_scalar("rollout/episode_reward_mean", episode_reward, self.steps_completed)
+                    self.writer.add_scalar("train/epsilon", self.epsilon, self.steps_completed)
+                
+                print(f"Step: {self.steps_completed} | Episode Reward: {episode_reward:.3f} | Epsilon: {self.epsilon:.3f}")
+                
+                state, _ = self.env.reset()
+                episode_reward = 0
+                episode_count += 1
+
+                # Epsilon Decay formula
+                self.epsilon = max(self.end_epsilon, self.epsilon * (1-self.decay_rate))
+
+            # Every 100 steps add the train/loss value to the log
+            if loss_val is not None and step % 100 == 0 and self.writer:
+                self.writer.add_scalar("train/loss", loss_val, self.steps_completed)
+
+            # Every 1000 steps update the target network
+            if self.steps_completed % self.update_frequency == 0:
+                self.target_network.load_state_dict(self.policy_network.state_dict())
+
+        print("Training finished")
+        if self.writer:
+            self.writer.flush()
+            self.writer.close()
+            
+    # Save model and .zip file
+    def save(self, path):
+        if not path.endswith(".zip"):
+            path += ".zip"
+            
+        print(f"Saving model to {path}")
+        
+        buffer = io.BytesIO()
+        torch.save({
+            'model_state_dictionary': self.policy_network.state_dict(),
+            'optimizer_state_dictionary': self.optim.state_dict(),
+            'epsilon': self.epsilon}, buffer)
+        buffer.seek(0)
+        
+        with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED) as zf:
+            zf.writestr("policy.pth", buffer.read())
+            
+        print("Model saved")
+
+class Double_DQN_Implementation(DQN_Implementation):
+
+    # Only change required is update_network method when calculating target_q from Bellman Equation
+    def update_network(self):
+        if len(self.experience_memory) < self.batch_size:
+            return None
+
+        states, actions, rewards, next_states, dones = self.experience_memory.mini_batch(self.batch_size)
+
+        states = torch.FloatTensor(states)
+        actions = torch.LongTensor(actions).unsqueeze(1)
+        rewards = torch.FloatTensor(rewards).unsqueeze(1)
+        next_states = torch.FloatTensor(next_states)
+        dones = torch.FloatTensor(dones).unsqueeze(1)
+
+        # Calculate Policy Network Q-Values Q(s, a)
+        policy_q = self.policy_network(states).gather(1, actions)
+
+        with torch.no_grad():
+            # Difference between DQN and DDQN
+            # Select best actions from policy network, calculate target_q values with these action from the target network
+            best_next_actions = self.policy_network(next_states).argmax(1).unsqueeze(1)
+            target_q = rewards + (self.gamma * self.target_network(next_states).gather(1, best_next_actions) * (1 - dones))
+
+        huber_loss = nn.HuberLoss(delta=1.0)(policy_q, target_q)  # We normalise rewards with an upper bound of 1 hence delta of 1.0
+        
+        # Weights only updated for Policy Network using gradient descent, not for Target Network
+        self.optim.zero_grad()
+        huber_loss.backward()
+        self.optim.step()
+        
+        return huber_loss.item()
+
+
+class Dueling_DQN_Implementation:
+    
+    def __init__(self, env, learning_rate=3e-4, buffer_size=50000, batch_size=64,
+                 gamma=0.99, tensorboard_log=None, hidden=128):
+        
+        self.env = env
+        self.learning_rate = learning_rate
+        self.buffer_size = buffer_size
+        self.batch_size = batch_size
+        self.gamma = gamma
+        self.tensorboard_log = tensorboard_log
+
+       
+        self.epsilon = 1.0
+        self.end_epsilon = 0.05
+        self.decay_rate = 0.005
+        self.update_frequency = 1000
+        self.steps_completed = 0
+
+        
+        self.experience_memory = Replay_Buffer(buffer_size)
+
+        self.state_dimension = env.observation_space.shape[0]
+        self.action_dimension = env.action_space.n
+
+        
+        self.policy_network = Dueling_Neural_Network(self.state_dimension, self.action_dimension, hidden=hidden)
+        self.target_network = Dueling_Neural_Network(self.state_dimension, self.action_dimension, hidden=hidden)
+
+        self.target_network.load_state_dict(self.policy_network.state_dict())
+        self.target_network.eval()
+
+        self.optim = optim.Adam(self.policy_network.parameters(), lr=self.learning_rate)
+
+        
+        self.writer = None
+        if self.tensorboard_log:
+            os.makedirs(self.tensorboard_log, exist_ok=True)
+            self.writer = SummaryWriter(log_dir=self.tensorboard_log)
+
+    def action(self, state):
+        if random.random() < self.epsilon:
+            return random.randrange(self.action_dimension)
+        else:
+            with torch.no_grad():
+                q_values = self.policy_network(torch.FloatTensor(state).unsqueeze(0))
+                return q_values.argmax().item()
+
+    def update_network(self):
+        if len(self.experience_memory) < self.batch_size:
+            return None
+
+        states, actions, rewards, next_states, dones = self.experience_memory.mini_batch(self.batch_size)
+
+        states = torch.FloatTensor(states)
+        actions = torch.LongTensor(actions).unsqueeze(1)
+        rewards = torch.FloatTensor(rewards).unsqueeze(1)
+        next_states = torch.FloatTensor(next_states)
+        dones = torch.FloatTensor(dones).unsqueeze(1)
+
+        
+        policy_q = self.policy_network(states).gather(1, actions)
+
+        with torch.no_grad():
+            # vanilla DQN target (max over target net)
+            next_q_max = self.target_network(next_states).max(1)[0].unsqueeze(1)
+            target_q = rewards + (self.gamma * next_q_max * (1 - dones))
+
+        huber_loss = nn.HuberLoss(delta=1.0)(policy_q, target_q)
+
+        self.optim.zero_grad()
+        huber_loss.backward()
+        self.optim.step()
+
+        return huber_loss.item()
+
+    def learn(self, total_timesteps):
+        print(f"Train for {total_timesteps} steps")
+        state, _ = self.env.reset()
+        episode_reward = 0
+
+        for step in range(1, total_timesteps + 1):
+            self.steps_completed += 1
+
+            act = self.action(state)
+            next_state, reward, terminated, truncated, _ = self.env.step(act)
+
+            episode_reward += reward
+
+            done_flag = terminated
+            self.experience_memory.store_experience(state, act, reward, next_state, done_flag)
+
+            state = next_state
+
+            loss_val = self.update_network()
+
+            if terminated or truncated:
+                if self.writer:
+                    self.writer.add_scalar("rollout/episode_reward_mean", episode_reward, self.steps_completed)
+                    self.writer.add_scalar("train/epsilon", self.epsilon, self.steps_completed)
+
+                print(f"Step: {self.steps_completed} | Episode Reward: {episode_reward:.3f} | Epsilon: {self.epsilon:.3f}")
+
+                state, _ = self.env.reset()
+                episode_reward = 0
+
+                self.epsilon = max(self.end_epsilon, self.epsilon * (1 - self.decay_rate))
+
+            if loss_val is not None and step % 100 == 0 and self.writer:
+                self.writer.add_scalar("train/loss", loss_val, self.steps_completed)
+
+            if self.steps_completed % self.update_frequency == 0:
+                self.target_network.load_state_dict(self.policy_network.state_dict())
+
+        print("Training finished")
+        if self.writer:
+            self.writer.flush()
+            self.writer.close()
+
+    def save(self, path):
+        if not path.endswith(".zip"):
+            path += ".zip"
+
+        print(f"Saving model to {path}")
+
+        buffer = io.BytesIO()
+        torch.save({
+            'model_state_dictionary': self.policy_network.state_dict(),
+            'optimizer_state_dictionary': self.optim.state_dict(),
+            'epsilon': self.epsilon
+        }, buffer)
+        buffer.seek(0)
+
+        with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED) as zf:
+            zf.writestr("policy.pth", buffer.read())
+
+        print("Model saved")
+
+########################################################################################################################################
+# Distributional DQN
+
+class Distributional_Neural_Network(nn.Module):
+    def __init__(self, state_dimension, action_dimension, n_atoms=51, hidden=128):
+        super().__init__()
+        self.action_dimension = action_dimension
+        self.n_atoms = n_atoms
+
+        self.net = nn.Sequential(
+            nn.Linear(state_dimension, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, hidden),
+            nn.ReLU(),
+            nn.Linear(hidden, action_dimension * n_atoms)
+        )
+
+    def forward(self, x):
+        logits = self.net(x)  # (B, A*N)
+        logits = logits.view(-1, self.action_dimension, self.n_atoms)  # (B, A, N)
+        return logits
+
+class Distributional_DQN_Implementation:
+    def __init__(
+        self,
+        env,
+        learning_rate=3e-4,
+        buffer_size=50000,
+        batch_size=64,
+        gamma=0.99,
+        tensorboard_log=None,
+        n_atoms=51,
+        v_min=-10.0,
+        v_max=10.0,
+        hidden=128,
+        update_frequency=1000,
+        decay_rate=0.005,
+        end_epsilon=0.05,
+        device=None
+    ):
+        self.env = env
+        self.learning_rate = learning_rate
+        self.buffer_size = buffer_size
+        self.batch_size = batch_size
+        self.gamma = gamma
+        self.tensorboard_log = tensorboard_log
+
+        # Epsilon-greedy params
+        self.epsilon = 1.0
+        self.end_epsilon = end_epsilon
+        self.decay_rate = decay_rate
+
+        self.update_frequency = update_frequency
+        self.steps_completed = 0
+
+        self.experience_memory = Replay_Buffer(buffer_size)
+
+        self.state_dimension = env.observation_space.shape[0]
+        self.action_dimension = env.action_space.n
+
+        # Distribution support (atoms)
+        self.n_atoms = n_atoms
+        self.v_min = float(v_min)
+        self.v_max = float(v_max)
+        self.delta_z = (self.v_max - self.v_min) / (self.n_atoms - 1)
+
+        z = torch.linspace(self.v_min, self.v_max, self.n_atoms)
+        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.z = z.to(self.device)  # (N,)
+
+        self.policy_network = Distributional_Neural_Network(
+            self.state_dimension, self.action_dimension, n_atoms=self.n_atoms, hidden=hidden
+        ).to(self.device)
+
+        self.target_network = Distributional_Neural_Network(
+            self.state_dimension, self.action_dimension, n_atoms=self.n_atoms, hidden=hidden
+        ).to(self.device)
+
+        self.target_network.load_state_dict(self.policy_network.state_dict())
+        self.target_network.eval()
+
+        self.optim = optim.Adam(self.policy_network.parameters(), lr=self.learning_rate)
+
+        # TensorBoard
+        self.writer = None
+        if self.tensorboard_log:
+            os.makedirs(self.tensorboard_log, exist_ok=True)
+            self.writer = SummaryWriter(log_dir=self.tensorboard_log)
+
+    def _dist(self, logits):
+        # Convert logits -> probability distribution over atoms
+        return torch.softmax(logits, dim=-1)  # (..., N)
+
+    def _expected_q(self, logits):
+        # E[Z] for each action: sum p(z_i) * z_i
+        probs = self._dist(logits)  # (B, A, N)
+        q = torch.sum(probs * self.z.view(1, 1, -1), dim=-1)  # (B, A)
+        return q
+
+    def action(self, state):
+        if random.random() < self.epsilon:
+            return random.randrange(self.action_dimension)
+        else:
+            with torch.no_grad():
+                s = torch.FloatTensor(state).unsqueeze(0).to(self.device)
+                logits = self.policy_network(s)  # (1, A, N)
+                q_vals = self._expected_q(logits)  # (1, A)
+                return q_vals.argmax(dim=1).item()
+
+    @torch.no_grad()
+    def _project_distribution(self, rewards, dones, next_logits):
+        """
+        C51 projection:
+        Tz = r + gamma*(1-done)*z
+        Project onto fixed support [v_min, v_max]
+        """
+        # next_logits: (B, A, N)
+        next_probs = self._dist(next_logits)  # (B, A, N)
+
+        # Greedy action by expected value under target distributions
+        next_q = torch.sum(next_probs * self.z.view(1, 1, -1), dim=-1)  # (B, A)
+        next_actions = next_q.argmax(dim=1, keepdim=True)  # (B, 1)
+
+        # Pick distribution for best action: (B, N)
+        next_probs_a = next_probs.gather(1, next_actions.unsqueeze(-1).expand(-1, -1, self.n_atoms)).squeeze(1)
+
+        # Bellman update on atoms
+        # rewards, dones: (B, 1)
+        Tz = rewards + self.gamma * (1.0 - dones) * self.z.view(1, -1)  # (B, N)
+        Tz = torch.clamp(Tz, self.v_min, self.v_max)
+
+        # Compute projection locations
+        b = (Tz - self.v_min) / self.delta_z  # (B, N)
+        l = torch.floor(b).long()
+        u = torch.ceil(b).long()
+
+        # Fix possible numerical issues where l==u
+        l = torch.clamp(l, 0, self.n_atoms - 1)
+        u = torch.clamp(u, 0, self.n_atoms - 1)
+
+        m = torch.zeros_like(next_probs_a)  # (B, N)
+
+        # Distribute probability mass
+        # m[l] += p * (u - b)
+        # m[u] += p * (b - l)
+        offset = torch.arange(next_probs_a.size(0), device=self.device).unsqueeze(1)  # (B,1)
+
+        m.view(-1).index_add_(
+            0,
+            (l + offset * self.n_atoms).view(-1),
+            (next_probs_a * (u.float() - b)).view(-1)
+        )
+        m.view(-1).index_add_(
+            0,
+            (u + offset * self.n_atoms).view(-1),
+            (next_probs_a * (b - l.float())).view(-1)
+        )
+
+        # m is the projected target distribution (B, N)
+        return m
+
+    def update_network(self):
+        if len(self.experience_memory) < self.batch_size:
+            return None
+
+        states, actions, rewards, next_states, dones = self.experience_memory.mini_batch(self.batch_size)
+
+        states = torch.FloatTensor(states).to(self.device)
+        actions = torch.LongTensor(actions).unsqueeze(1).to(self.device)   # (B,1)
+        rewards = torch.FloatTensor(rewards).unsqueeze(1).to(self.device)  # (B,1)
+        next_states = torch.FloatTensor(next_states).to(self.device)
+        dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device)      # (B,1)
+
+        # Current logits for all actions
+        logits = self.policy_network(states)  # (B, A, N)
+        # Select logits for taken actions: (B, N)
+        logits_a = logits.gather(1, actions.unsqueeze(-1).expand(-1, -1, self.n_atoms)).squeeze(1)
+
+        # Target projected distribution m: (B, N)
+        with torch.no_grad():
+            next_logits = self.target_network(next_states)  # (B, A, N)
+            target_dist = self._project_distribution(rewards, dones, next_logits)  # (B, N)
+
+        # Cross-entropy loss: - sum m * log p
+        log_probs = torch.log_softmax(logits_a, dim=-1)  # (B, N)
+        loss = -(target_dist * log_probs).sum(dim=-1).mean()
+
+        self.optim.zero_grad()
+        loss.backward()
+        self.optim.step()
+
+        return loss.item()
+    def save(self, path):
+        if not path.endswith(".zip"):
+            path += ".zip"
+
+        print(f"Saving model to {path}")
+
+        buffer = io.BytesIO()
+        torch.save({
+            'model_state_dictionary': self.policy_network.state_dict(),
+            'optimizer_state_dictionary': self.optim.state_dict(),
+            'epsilon': self.epsilon
+        }, buffer)
+        buffer.seek(0)
+
+        with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED) as zf:
+            zf.writestr("policy.pth", buffer.read())
+
+        print("Model saved")
+
+
+    def learn(self, total_timesteps):
+        print(f"Train for {total_timesteps} steps | device={self.device}")
+        state, _ = self.env.reset()
+        episode_reward = 0.0
+
+        for step in range(1, total_timesteps + 1):
+            self.steps_completed += 1
+
+            act = self.action(state)
+            next_state, reward, terminated, truncated, _ = self.env.step(act)
+
+            episode_reward += reward
+
+            done_flag = terminated  # follow your convention
+            self.experience_memory.store_experience(state, act, reward, next_state, done_flag)
+            state = next_state
+
+            loss_val = self.update_network()
+
+            if terminated or truncated:
+                if self.writer:
+                    self.writer.add_scalar("rollout/episode_reward_mean", episode_reward, self.steps_completed)
+                    self.writer.add_scalar("train/epsilon", self.epsilon, self.steps_completed)
+
+                print(f"Step: {self.steps_completed} | Episode Reward: {episode_reward:.3f} | Epsilon: {self.epsilon:.3f}")
+
+                state, _ = self.env.reset()
+                episode_reward = 0.0
+
+                self.epsilon = max(self.end_epsilon, self.epsilon * (1 - self.decay_rate))
+
+            if loss_val is not None and step % 100 == 0 and self.writer:
+                self.writer.add_scalar("train/loss", loss_val, self.steps_completed)
+
+            if self.steps_completed % self.update_frequency == 0:
+                self.target_network.load_state_dict(self.policy_network.state_dict())
+
+        print("Training finished")
+        if self.writer:
+            self.writer.flush()
+            self.writer.close()
\ No newline at end of file
diff --git a/wds_dist_logs.zip b/wds_dist_logs.zip
new file mode 100644
index 0000000..d775da8
Binary files /dev/null and b/wds_dist_logs.zip differ
diff --git a/wds_dueling_logs.zip b/wds_dueling_logs.zip
new file mode 100644
index 0000000..4934e08
Binary files /dev/null and b/wds_dueling_logs.zip differ