diff --git a/Env implementation.ipynb b/Env implementation.ipynb new file mode 100644 index 0000000..7693985 --- /dev/null +++ b/Env implementation.ipynb @@ -0,0 +1,677 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 7, + "id": "d782b3e7-70da-4b64-9433-ad37e9024052", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import gym4real\n", + "import gymnasium as gym\n", + "from gymnasium import spaces\n", + "from gym4real.envs.wds.utils import parameter_generator\n", + "import wntr\n", + "import wntr.sim\n", + "from gym4real.envs.wds.reward_scaling_wrapper import RewardScalingWrapper\n", + "from dueling_dqn import DQN_Implementation, Double_DQN_Implementation, Dueling_DQN_Implementation, Distributional_DQN_Implementation\n", + "from Normalise import NormaliseObservation" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "2fc21950-b80c-483a-92ba-f516048d13bd", + "metadata": {}, + "outputs": [], + "source": [ + "config_path = os.path.join(os.getcwd(), \"gym4real\", \"envs\", \"wds\", \"world_anytown.yaml\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "e9145afc-58a2-4c2e-a830-ce90638e41d9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/world_anytown.yaml'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "config_path" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "16c71143-c3da-46b0-bb54-3507d283ef9a", + "metadata": {}, + "outputs": [], + "source": [ + "base_params = parameter_generator(\n", + " hydraulic_step=3600,\n", + " duration=604800,\n", + " seed=42,\n", + " world_options=config_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c631e482-afc9-4577-ab5c-b4144860d3e4", + "metadata": {}, + "outputs": [], + "source": [ + "# Environment using SMA\n", + "base_params['demand_moving_average'] = True # Turn on SMA \n", + "base_params['demand_exp_moving_average'] = False # Turn off EMA " + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "88dfc98b-d0ef-475d-a522-cd25478418d3", + "metadata": {}, + "outputs": [], + "source": [ + "def make_env():\n", + " env = gym.make('gym4real/wds-v0', settings=base_params)\n", + " env = RewardScalingWrapper(env)\n", + " env = NormaliseObservation(env)\n", + " return env" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "992a51a0-e773-4a28-8aa3-00ab6b59799f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/anaconda3/envs/rl-env/lib/python3.12/site-packages/gymnasium/spaces/box.py:235: UserWarning: \u001b[33mWARN: Box low's precision lowered by casting to float32, current low.dtype=float64\u001b[0m\n", + " gym.logger.warn(\n", + "/opt/anaconda3/envs/rl-env/lib/python3.12/site-packages/gymnasium/spaces/box.py:305: UserWarning: \u001b[33mWARN: Box high's precision lowered by casting to float32, current high.dtype=float64\u001b[0m\n", + " gym.logger.warn(\n" + ] + } + ], + "source": [ + "dqn_sma_env = make_env()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0b3d6e1c-d1da-449a-96ad-2ceb5c78315d", + "metadata": {}, + "outputs": [], + "source": [ + "# dueling_sma_model = Dueling_DQN_Implementation(\n", + "# env=dqn_sma_env,\n", + "# learning_rate=3e-4,\n", + "# tensorboard_log=\"./wds_dueling_logs/\"\n", + "# )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "e114f6d5-18f1-4c0d-910d-4900fdc27eea", + "metadata": {}, + "outputs": [], + "source": [ + "dist_sma_model = Distributional_DQN_Implementation(\n", + " env=dqn_sma_env,\n", + " learning_rate=3e-4,\n", + " tensorboard_log=\"./wds_dist_logs/\",\n", + " n_atoms=51,\n", + " v_min=-10,\n", + " v_max=10\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "7384c050-22bc-47f9-a39b-d7a38e6a8179", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train for 200000 steps | device=cpu\n", + "Resetting the environment...\n", + "Step: 1111 | Episode Reward: 153.997 | Epsilon: 1.000\n", + "Resetting the environment...\n", + "Step: 2282 | Episode Reward: 151.169 | Epsilon: 0.995\n", + "Resetting the environment...\n", + "Step: 3408 | Episode Reward: 151.317 | Epsilon: 0.990\n", + "Resetting the environment...\n", + "Step: 4545 | Episode Reward: 153.626 | Epsilon: 0.985\n", + "Resetting the environment...\n", + "Step: 5693 | Episode Reward: 153.360 | Epsilon: 0.980\n", + "Resetting the environment...\n", + "Step: 6842 | Episode Reward: 148.326 | Epsilon: 0.975\n", + "Resetting the environment...\n", + "Step: 8000 | Episode Reward: 152.249 | Epsilon: 0.970\n", + "Resetting the environment...\n", + "Step: 9242 | Episode Reward: 133.343 | Epsilon: 0.966\n", + "Resetting the environment...\n", + "Step: 10433 | Episode Reward: 146.349 | Epsilon: 0.961\n", + "Resetting the environment...\n", + "Step: 11503 | Episode Reward: 132.059 | Epsilon: 0.956\n", + "Resetting the environment...\n", + "Step: 12634 | Episode Reward: 152.668 | Epsilon: 0.951\n", + "Resetting the environment...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n", + " warnings.warn(self.ENgeterror(ierr))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step: 13794 | Episode Reward: 140.755 | Epsilon: 0.946\n", + "Resetting the environment...\n", + "Step: 14950 | Episode Reward: 155.198 | Epsilon: 0.942\n", + "Resetting the environment...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n", + " warnings.warn(self.ENgeterror(ierr))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step: 16071 | Episode Reward: 144.100 | Epsilon: 0.937\n", + "Resetting the environment...\n", + "Step: 17212 | Episode Reward: 143.889 | Epsilon: 0.932\n", + "Resetting the environment...\n", + "Step: 18652 | Episode Reward: 109.176 | Epsilon: 0.928\n", + "Resetting the environment...\n", + "Step: 19772 | Episode Reward: 147.424 | Epsilon: 0.923\n", + "Resetting the environment...\n", + "Step: 20861 | Episode Reward: 160.615 | Epsilon: 0.918\n", + "Resetting the environment...\n", + "Step: 21947 | Episode Reward: 160.407 | Epsilon: 0.914\n", + "Resetting the environment...\n", + "Step: 23080 | Episode Reward: 148.234 | Epsilon: 0.909\n", + "Resetting the environment...\n", + "Step: 24235 | Episode Reward: 146.665 | Epsilon: 0.905\n", + "Resetting the environment...\n", + "Step: 25325 | Episode Reward: 160.068 | Epsilon: 0.900\n", + "Resetting the environment...\n", + "Step: 26565 | Episode Reward: 131.963 | Epsilon: 0.896\n", + "Resetting the environment...\n", + "Step: 27670 | Episode Reward: 159.023 | Epsilon: 0.891\n", + "Resetting the environment...\n", + "Step: 28764 | Episode Reward: 151.119 | Epsilon: 0.887\n", + "Resetting the environment...\n", + "Step: 29858 | Episode Reward: 162.179 | Epsilon: 0.882\n", + "Resetting the environment...\n", + "Step: 31007 | Episode Reward: 152.638 | Epsilon: 0.878\n", + "Resetting the environment...\n", + "Step: 32537 | Episode Reward: 111.251 | Epsilon: 0.873\n", + "Resetting the environment...\n", + "Step: 34091 | Episode Reward: 99.022 | Epsilon: 0.869\n", + "Resetting the environment...\n", + "Step: 35679 | Episode Reward: 107.237 | Epsilon: 0.865\n", + "Resetting the environment...\n", + "Step: 36792 | Episode Reward: 152.692 | Epsilon: 0.860\n", + "Resetting the environment...\n", + "Step: 37896 | Episode Reward: 159.965 | Epsilon: 0.856\n", + "Resetting the environment...\n", + "Step: 39000 | Episode Reward: 150.952 | Epsilon: 0.852\n", + "Resetting the environment...\n", + "Step: 40143 | Episode Reward: 153.381 | Epsilon: 0.848\n", + "Resetting the environment...\n", + "Step: 41290 | Episode Reward: 152.382 | Epsilon: 0.843\n", + "Resetting the environment...\n", + "Step: 42435 | Episode Reward: 158.051 | Epsilon: 0.839\n", + "Resetting the environment...\n", + "Step: 44044 | Episode Reward: 99.158 | Epsilon: 0.835\n", + "Resetting the environment...\n", + "Step: 45131 | Episode Reward: 147.270 | Epsilon: 0.831\n", + "Resetting the environment...\n", + "Step: 46194 | Episode Reward: 159.870 | Epsilon: 0.827\n", + "Resetting the environment...\n", + "Step: 47297 | Episode Reward: 160.283 | Epsilon: 0.822\n", + "Resetting the environment...\n", + "Step: 48430 | Episode Reward: 147.430 | Epsilon: 0.818\n", + "Resetting the environment...\n", + "Step: 49523 | Episode Reward: 152.502 | Epsilon: 0.814\n", + "Resetting the environment...\n", + "Step: 51084 | Episode Reward: 103.382 | Epsilon: 0.810\n", + "Resetting the environment...\n", + "Step: 52200 | Episode Reward: 152.779 | Epsilon: 0.806\n", + "Resetting the environment...\n", + "Step: 53297 | Episode Reward: 155.141 | Epsilon: 0.802\n", + "Resetting the environment...\n", + "Step: 54411 | Episode Reward: 151.961 | Epsilon: 0.798\n", + "Resetting the environment...\n", + "Step: 55521 | Episode Reward: 153.797 | Epsilon: 0.794\n", + "Resetting the environment...\n", + "Step: 56610 | Episode Reward: 163.142 | Epsilon: 0.790\n", + "Resetting the environment...\n", + "Step: 57731 | Episode Reward: 149.497 | Epsilon: 0.786\n", + "Resetting the environment...\n", + "Step: 59046 | Episode Reward: 132.982 | Epsilon: 0.782\n", + "Resetting the environment...\n", + "Step: 60135 | Episode Reward: 152.386 | Epsilon: 0.778\n", + "Resetting the environment...\n", + "Step: 61243 | Episode Reward: 154.186 | Epsilon: 0.774\n", + "Resetting the environment...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n", + " warnings.warn(self.ENgeterror(ierr))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step: 62886 | Episode Reward: 98.508 | Epsilon: 0.771\n", + "Resetting the environment...\n", + "Step: 64085 | Episode Reward: 162.620 | Epsilon: 0.767\n", + "Resetting the environment...\n", + "Step: 65178 | Episode Reward: 153.998 | Epsilon: 0.763\n", + "Resetting the environment...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n", + " warnings.warn(self.ENgeterror(ierr))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step: 66793 | Episode Reward: 103.972 | Epsilon: 0.759\n", + "Resetting the environment...\n", + "Step: 67840 | Episode Reward: 161.439 | Epsilon: 0.755\n", + "Resetting the environment...\n", + "Step: 69390 | Episode Reward: 112.187 | Epsilon: 0.751\n", + "Resetting the environment...\n", + "Step: 70459 | Episode Reward: 161.083 | Epsilon: 0.748\n", + "Resetting the environment...\n", + "Step: 71525 | Episode Reward: 151.796 | Epsilon: 0.744\n", + "Resetting the environment...\n", + "Step: 72633 | Episode Reward: 155.591 | Epsilon: 0.740\n", + "Resetting the environment...\n", + "Step: 73738 | Episode Reward: 140.940 | Epsilon: 0.737\n", + "Resetting the environment...\n", + "Step: 74829 | Episode Reward: 143.928 | Epsilon: 0.733\n", + "Resetting the environment...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n", + " warnings.warn(self.ENgeterror(ierr))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step: 76395 | Episode Reward: 101.345 | Epsilon: 0.729\n", + "Resetting the environment...\n", + "Step: 77607 | Episode Reward: 142.561 | Epsilon: 0.726\n", + "Resetting the environment...\n", + "Step: 78815 | Episode Reward: 141.396 | Epsilon: 0.722\n", + "Resetting the environment...\n", + "Step: 79853 | Episode Reward: 163.146 | Epsilon: 0.718\n", + "Resetting the environment...\n", + "Step: 80942 | Episode Reward: 161.397 | Epsilon: 0.715\n", + "Resetting the environment...\n", + "Step: 82028 | Episode Reward: 146.968 | Epsilon: 0.711\n", + "Resetting the environment...\n", + "Step: 83128 | Episode Reward: 153.940 | Epsilon: 0.708\n", + "Resetting the environment...\n", + "Step: 84341 | Episode Reward: 142.633 | Epsilon: 0.704\n", + "Resetting the environment...\n", + "Step: 85518 | Episode Reward: 160.601 | Epsilon: 0.701\n", + "Resetting the environment...\n", + "Step: 86619 | Episode Reward: 154.950 | Epsilon: 0.697\n", + "Resetting the environment...\n", + "Step: 87728 | Episode Reward: 151.248 | Epsilon: 0.694\n", + "Resetting the environment...\n", + "Step: 88824 | Episode Reward: 156.445 | Epsilon: 0.690\n", + "Resetting the environment...\n", + "Step: 89858 | Episode Reward: 163.781 | Epsilon: 0.687\n", + "Resetting the environment...\n", + "Step: 91035 | Episode Reward: 148.585 | Epsilon: 0.683\n", + "Resetting the environment...\n", + "Step: 92108 | Episode Reward: 162.602 | Epsilon: 0.680\n", + "Resetting the environment...\n", + "Step: 93187 | Episode Reward: 144.714 | Epsilon: 0.676\n", + "Resetting the environment...\n", + "Step: 94250 | Episode Reward: 162.002 | Epsilon: 0.673\n", + "Resetting the environment...\n", + "Step: 95763 | Episode Reward: 115.001 | Epsilon: 0.670\n", + "Resetting the environment...\n", + "Step: 96822 | Episode Reward: 162.466 | Epsilon: 0.666\n", + "Resetting the environment...\n", + "Step: 97932 | Episode Reward: 155.504 | Epsilon: 0.663\n", + "Resetting the environment...\n", + "Step: 99028 | Episode Reward: 156.306 | Epsilon: 0.660\n", + "Resetting the environment...\n", + "Step: 100099 | Episode Reward: 157.606 | Epsilon: 0.656\n", + "Resetting the environment...\n", + "Step: 101170 | Episode Reward: 156.524 | Epsilon: 0.653\n", + "Resetting the environment...\n", + "Step: 102249 | Episode Reward: 156.795 | Epsilon: 0.650\n", + "Resetting the environment...\n", + "Step: 103308 | Episode Reward: 152.345 | Epsilon: 0.647\n", + "Resetting the environment...\n", + "Step: 104405 | Episode Reward: 146.136 | Epsilon: 0.643\n", + "Resetting the environment...\n", + "Step: 105506 | Episode Reward: 151.228 | Epsilon: 0.640\n", + "Resetting the environment...\n", + "Step: 106586 | Episode Reward: 157.143 | Epsilon: 0.637\n", + "Resetting the environment...\n", + "Step: 107687 | Episode Reward: 155.193 | Epsilon: 0.634\n", + "Resetting the environment...\n", + "Step: 108745 | Episode Reward: 163.514 | Epsilon: 0.631\n", + "Resetting the environment...\n", + "Step: 109821 | Episode Reward: 157.198 | Epsilon: 0.627\n", + "Resetting the environment...\n", + "Step: 110888 | Episode Reward: 142.207 | Epsilon: 0.624\n", + "Resetting the environment...\n", + "Step: 111970 | Episode Reward: 157.823 | Epsilon: 0.621\n", + "Resetting the environment...\n", + "Step: 113059 | Episode Reward: 151.426 | Epsilon: 0.618\n", + "Resetting the environment...\n", + "Step: 114143 | Episode Reward: 153.774 | Epsilon: 0.615\n", + "Resetting the environment...\n", + "Step: 115263 | Episode Reward: 159.760 | Epsilon: 0.612\n", + "Resetting the environment...\n", + "Step: 116315 | Episode Reward: 160.141 | Epsilon: 0.609\n", + "Resetting the environment...\n", + "Step: 117375 | Episode Reward: 145.956 | Epsilon: 0.606\n", + "Resetting the environment...\n", + "Step: 118520 | Episode Reward: 152.579 | Epsilon: 0.603\n", + "Resetting the environment...\n", + "Step: 119615 | Episode Reward: 162.900 | Epsilon: 0.600\n", + "Resetting the environment...\n", + "Step: 120710 | Episode Reward: 159.072 | Epsilon: 0.597\n", + "Resetting the environment...\n", + "Step: 122248 | Episode Reward: 114.978 | Epsilon: 0.594\n", + "Resetting the environment...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n", + " warnings.warn(self.ENgeterror(ierr))\n", + "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: System may be hydraulically unstable.\n", + " warnings.warn(self.ENgeterror(ierr))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step: 123742 | Episode Reward: 105.305 | Epsilon: 0.591\n", + "Resetting the environment...\n", + "Step: 124838 | Episode Reward: 153.693 | Epsilon: 0.588\n", + "Resetting the environment...\n", + "Step: 125927 | Episode Reward: 156.106 | Epsilon: 0.585\n", + "Resetting the environment...\n", + "Step: 126972 | Episode Reward: 163.293 | Epsilon: 0.582\n", + "Resetting the environment...\n", + "Step: 128030 | Episode Reward: 158.597 | Epsilon: 0.579\n", + "Resetting the environment...\n", + "Step: 129064 | Episode Reward: 163.821 | Epsilon: 0.576\n", + "Resetting the environment...\n", + "Step: 130153 | Episode Reward: 161.117 | Epsilon: 0.573\n", + "Resetting the environment...\n", + "Step: 131246 | Episode Reward: 154.042 | Epsilon: 0.570\n", + "Resetting the environment...\n", + "Step: 132334 | Episode Reward: 157.167 | Epsilon: 0.568\n", + "Resetting the environment...\n", + "Step: 133400 | Episode Reward: 152.942 | Epsilon: 0.565\n", + "Resetting the environment...\n", + "Step: 134676 | Episode Reward: 144.907 | Epsilon: 0.562\n", + "Resetting the environment...\n", + "Step: 135752 | Episode Reward: 157.574 | Epsilon: 0.559\n", + "Resetting the environment...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n", + " warnings.warn(self.ENgeterror(ierr))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step: 136918 | Episode Reward: 143.511 | Epsilon: 0.556\n", + "Resetting the environment...\n", + "Step: 137993 | Episode Reward: 158.570 | Epsilon: 0.554\n", + "Resetting the environment...\n", + "Step: 139027 | Episode Reward: 164.542 | Epsilon: 0.551\n", + "Resetting the environment...\n", + "Step: 140077 | Episode Reward: 161.509 | Epsilon: 0.548\n", + "Resetting the environment...\n", + "Step: 141478 | Episode Reward: 119.271 | Epsilon: 0.545\n", + "Resetting the environment...\n", + "Step: 142565 | Episode Reward: 165.614 | Epsilon: 0.543\n", + "Resetting the environment...\n", + "Step: 143643 | Episode Reward: 150.953 | Epsilon: 0.540\n", + "Resetting the environment...\n", + "Step: 144710 | Episode Reward: 160.797 | Epsilon: 0.537\n", + "Resetting the environment...\n", + "Step: 145783 | Episode Reward: 154.920 | Epsilon: 0.534\n", + "Resetting the environment...\n", + "Step: 146812 | Episode Reward: 166.769 | Epsilon: 0.532\n", + "Resetting the environment...\n", + "Step: 147836 | Episode Reward: 161.131 | Epsilon: 0.529\n", + "Resetting the environment...\n", + "Step: 149294 | Episode Reward: 119.741 | Epsilon: 0.526\n", + "Resetting the environment...\n", + "Step: 150353 | Episode Reward: 145.694 | Epsilon: 0.524\n", + "Resetting the environment...\n", + "Step: 151429 | Episode Reward: 150.523 | Epsilon: 0.521\n", + "Resetting the environment...\n", + "Step: 152482 | Episode Reward: 161.591 | Epsilon: 0.519\n", + "Resetting the environment...\n", + "Step: 153548 | Episode Reward: 156.874 | Epsilon: 0.516\n", + "Resetting the environment...\n", + "Step: 154621 | Episode Reward: 154.044 | Epsilon: 0.513\n", + "Resetting the environment...\n", + "Step: 155681 | Episode Reward: 164.463 | Epsilon: 0.511\n", + "Resetting the environment...\n", + "Step: 156725 | Episode Reward: 164.537 | Epsilon: 0.508\n", + "Resetting the environment...\n", + "Step: 157748 | Episode Reward: 164.193 | Epsilon: 0.506\n", + "Resetting the environment...\n", + "Step: 158773 | Episode Reward: 166.099 | Epsilon: 0.503\n", + "Resetting the environment...\n", + "Step: 159859 | Episode Reward: 161.974 | Epsilon: 0.501\n", + "Resetting the environment...\n", + "Step: 160914 | Episode Reward: 163.825 | Epsilon: 0.498\n", + "Resetting the environment...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n", + " warnings.warn(self.ENgeterror(ierr))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step: 162276 | Episode Reward: 112.850 | Epsilon: 0.496\n", + "Resetting the environment...\n", + "Step: 163324 | Episode Reward: 158.721 | Epsilon: 0.493\n", + "Resetting the environment...\n", + "Step: 164401 | Episode Reward: 162.109 | Epsilon: 0.491\n", + "Resetting the environment...\n", + "Step: 165435 | Episode Reward: 155.681 | Epsilon: 0.488\n", + "Resetting the environment...\n", + "Step: 166822 | Episode Reward: 119.148 | Epsilon: 0.486\n", + "Resetting the environment...\n", + "Step: 167884 | Episode Reward: 153.597 | Epsilon: 0.483\n", + "Resetting the environment...\n", + "Step: 168911 | Episode Reward: 158.481 | Epsilon: 0.481\n", + "Resetting the environment...\n", + "Step: 169952 | Episode Reward: 162.755 | Epsilon: 0.479\n", + "Resetting the environment...\n", + "Step: 171002 | Episode Reward: 157.843 | Epsilon: 0.476\n", + "Resetting the environment...\n", + "Step: 172050 | Episode Reward: 162.917 | Epsilon: 0.474\n", + "Resetting the environment...\n", + "Step: 173112 | Episode Reward: 159.995 | Epsilon: 0.471\n", + "Resetting the environment...\n", + "Step: 174359 | Episode Reward: 144.803 | Epsilon: 0.469\n", + "Resetting the environment...\n", + "Step: 175443 | Episode Reward: 153.168 | Epsilon: 0.467\n", + "Resetting the environment...\n", + "Step: 176495 | Episode Reward: 150.665 | Epsilon: 0.464\n", + "Resetting the environment...\n", + "Step: 177547 | Episode Reward: 156.770 | Epsilon: 0.462\n", + "Resetting the environment...\n", + "Step: 178588 | Episode Reward: 165.277 | Epsilon: 0.460\n", + "Resetting the environment...\n", + "Step: 179612 | Episode Reward: 163.793 | Epsilon: 0.458\n", + "Resetting the environment...\n", + "Step: 180638 | Episode Reward: 163.216 | Epsilon: 0.455\n", + "Resetting the environment...\n", + "Step: 181695 | Episode Reward: 154.532 | Epsilon: 0.453\n", + "Resetting the environment...\n", + "Step: 182951 | Episode Reward: 143.096 | Epsilon: 0.451\n", + "Resetting the environment...\n", + "Step: 184021 | Episode Reward: 153.395 | Epsilon: 0.448\n", + "Resetting the environment...\n", + "Step: 185066 | Episode Reward: 164.928 | Epsilon: 0.446\n", + "Resetting the environment...\n", + "Step: 186137 | Episode Reward: 163.071 | Epsilon: 0.444\n", + "Resetting the environment...\n", + "Step: 187301 | Episode Reward: 152.901 | Epsilon: 0.442\n", + "Resetting the environment...\n", + "Step: 188349 | Episode Reward: 158.262 | Epsilon: 0.440\n", + "Resetting the environment...\n", + "Step: 189421 | Episode Reward: 162.564 | Epsilon: 0.437\n", + "Resetting the environment...\n", + "Step: 190614 | Episode Reward: 146.785 | Epsilon: 0.435\n", + "Resetting the environment...\n", + "Step: 191643 | Episode Reward: 163.195 | Epsilon: 0.433\n", + "Resetting the environment...\n", + "Step: 192686 | Episode Reward: 158.665 | Epsilon: 0.431\n", + "Resetting the environment...\n", + "Step: 193749 | Episode Reward: 151.001 | Epsilon: 0.429\n", + "Resetting the environment...\n", + "Step: 194765 | Episode Reward: 164.780 | Epsilon: 0.427\n", + "Resetting the environment...\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/kamalrajanisrani/Documents/MSc Advanced Machine Learning/Semester 1/Reinforcement Learning/gym4ReaL/gym4real/envs/wds/simulator/epynet/epanet2.py:683: UserWarning: WARNING: Pumps cannot deliver enough flow or head.\n", + " warnings.warn(self.ENgeterror(ierr))\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Step: 195802 | Episode Reward: 154.002 | Epsilon: 0.424\n", + "Resetting the environment...\n", + "Step: 196843 | Episode Reward: 156.423 | Epsilon: 0.422\n", + "Resetting the environment...\n", + "Step: 197891 | Episode Reward: 152.905 | Epsilon: 0.420\n", + "Resetting the environment...\n", + "Step: 198920 | Episode Reward: 160.201 | Epsilon: 0.418\n", + "Resetting the environment...\n", + "Training finished\n", + "Saving model to distributional-dqn-normalisation-sma.zip\n", + "Model saved\n" + ] + } + ], + "source": [ + "dist_sma_model.learn(total_timesteps=200000)\n", + "dist_sma_model.save(\"distributional-dqn-normalisation-sma\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "010c0bfc-d2e7-4fd1-bbab-f5d3cc18325e", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Normalise.py b/Normalise.py new file mode 100644 index 0000000..768e057 --- /dev/null +++ b/Normalise.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + +# Credits: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm +# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm +# +# Inspiration: https://github.com/DLR-RM/stable-baselines3/blob/master/stable_baselines3/common/vec_env/vec_normalize.py + +# Core idea is: +# Keep running statistics of observations. Before feeding the observations to the network. + +# xhat = (x - μ) . (σ^2 + ε)^(-1/2) +# ^, is just z score norm with online statistics + +import numpy as np +import gymnasium as gym + +''' + +Draft before I do things: + +m_a = self.var * self.count # sum of squared deviations (old) +m_b = batch_var * batch_count # sum of squared deviations (new) +M2 = m_a + m_b + delta**2 * self.count * batch_count / total_count +self.var = M2 / total_count + +''' + +class RunningMeanStd: + + ''' + Goal: Track running mean and variance using Welford's parallel online algorithm + ''' + + def __init__(self, shape=()): + self.mean = np.zeros(shape, dtype=np.float64) + self.var = np.ones(shape, dtype=np.float64) + self.count = 1e-4 # Div by 0 bad, so initially setting it to this + + # Update the stats with a batch of samples + def update(self, x): + x = np.asarray(x) + if x.ndim == 1: + x = x.reshape(1, -1) # As its a single obs, itd be a batch of 1 + + batch_mean = x.mean(axis=0) + batch_var = x.var(axis=0) + batch_count = x.shape[0] + + self.update_from_moments(batch_mean, batch_var, batch_count) + + # Chan's parallel algorithm for combining statistics. + def update_from_moments(self, batch_mean, batch_var, batch_count): + delta = batch_mean - self.mean + total_count = self.count + batch_count + + self.mean = self.mean + delta * batch_count / total_count # New mea being the weighted combination + + # New var being the parallel algorithm + m_a = self.var * self.count + m_b = batch_var * batch_count + M2 = m_a + m_b + (delta ** 2) * self.count * batch_count / total_count # Sum of squared deviations from mean! + self.var = M2 / total_count + + self.count = total_count + + def normalise(self, x): + return (x - self.mean) / np.sqrt(self.var + 1e-8) # z-score norm + + +class NormaliseObservation(gym.Wrapper): + ''' + Gym wrapper for obs norm using running statistics. + + Aim: + During training: updates stats and normalises + During evaluation: normalises only (set training=False) + ''' + + def __init__(self, env): + super().__init__(env) + self.rms = RunningMeanStd(shape=env.observation_space.shape) + self.training = True + + def step(self, action): + obs, reward, terminated, truncated, info = self.env.step(action) + if self.training: + self.rms.update(obs) + return self.rms.normalise(obs), reward, terminated, truncated, info + + def reset(self, **kwargs): + obs, info = self.env.reset(**kwargs) + if self.training: + self.rms.update(obs) + return self.rms.normalise(obs), info \ No newline at end of file diff --git a/distributional-dqn-normalisation-ema.zip b/distributional-dqn-normalisation-ema.zip new file mode 100644 index 0000000..d95354b Binary files /dev/null and b/distributional-dqn-normalisation-ema.zip differ diff --git a/distributional-dqn-normalisation-sma.zip b/distributional-dqn-normalisation-sma.zip new file mode 100644 index 0000000..3a26381 Binary files /dev/null and b/distributional-dqn-normalisation-sma.zip differ diff --git a/dueling-dqn-normalisation-ema.zip b/dueling-dqn-normalisation-ema.zip new file mode 100644 index 0000000..efa43da Binary files /dev/null and b/dueling-dqn-normalisation-ema.zip differ diff --git a/dueling-dqn-normalisation-sma.zip b/dueling-dqn-normalisation-sma.zip new file mode 100644 index 0000000..5e1ca8c Binary files /dev/null and b/dueling-dqn-normalisation-sma.zip differ diff --git a/dueling-dqn-sma.zip b/dueling-dqn-sma.zip new file mode 100644 index 0000000..9039d67 Binary files /dev/null and b/dueling-dqn-sma.zip differ diff --git a/dueling_dqn.py b/dueling_dqn.py new file mode 100644 index 0000000..d3e6364 --- /dev/null +++ b/dueling_dqn.py @@ -0,0 +1,662 @@ +import torch +import torch.nn as nn +import torch.optim as optim +import random +from collections import deque +import numpy as np +import os +import io +import zipfile +from torch.utils.tensorboard import SummaryWriter + +# Following code implements a DQN and DDQN agent from scratch +# DDQN Class uses inheritance from original DQN Class +# Uses epsilon decay, more random and exploratory behaviour initially and eventually follows experience + +class Neural_Network(nn.Module): + + # Create Q-Network which takes in states and outputs state-action pairs + def __init__(self, state_dimension, action_dimension, hidden=128): + super(Neural_Network, self).__init__() + self.net = nn.Sequential( + nn.Linear(state_dimension, hidden), + nn.ReLU(), + nn.Linear(hidden, hidden), + nn.ReLU(), + nn.Linear(hidden, action_dimension)) + + # Calculate Forward pass prediction by network + def forward(self, x): + return self.net(x) + +import torch +import torch.nn as nn + +class Dueling_Neural_Network(nn.Module): + """ + Dueling architecture: + Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a)) + """ + def __init__(self, state_dimension, action_dimension, hidden=128): + super(Dueling_Neural_Network, self).__init__() + + # shared feature extractor + self.feature = nn.Sequential( + nn.Linear(state_dimension, hidden), + nn.ReLU(), + nn.Linear(hidden, hidden), + nn.ReLU(), + ) + + # value stream outputs scalar V(s) + self.value_stream = nn.Sequential( + nn.Linear(hidden, hidden), + nn.ReLU(), + nn.Linear(hidden, 1), + ) + + # advantage stream outputs A(s,a) for each action + self.adv_stream = nn.Sequential( + nn.Linear(hidden, hidden), + nn.ReLU(), + nn.Linear(hidden, action_dimension), + ) + + def forward(self, x): + features = self.feature(x) + values = self.value_stream(features) # shape: (B, 1) + advantages = self.adv_stream(features) # shape: (B, A) + + # subtract mean advantage for identifiability + advantages = advantages - advantages.mean(dim=1, keepdim=True) + + q_values = values + advantages # broadcast (B,1) + (B,A) + return q_values + + +class Replay_Buffer: + + # Initialise buffer to have max size N. Any new experiences will replace old experiences if N > max_size + def __init__(self, N): + self.buffer = deque(maxlen=N) + + # Add experience to replay buffer + def store_experience(self, state, action, reward, next_state, done): + self.buffer.append((state, action, reward, next_state, done)) + + # Create random sample batch of experience from replay buffer (size = bath_size) + def mini_batch(self, batch_size): + batch = random.sample(self.buffer, batch_size) + state, action, reward, next_state, done = zip(*batch) + return (np.array(state), np.array(action), np.array(reward), np.array(next_state), np.array(done)) + + def __len__(self): + return len(self.buffer) + +class DQN_Implementation: + + def __init__(self, env, learning_rate=3e-4, buffer_size=50000, batch_size=64, gamma=0.99, tensorboard_log=None): + # Initialise parameters env, lr, batch_size, gamma and tensorboard_logs for metrics + # Buffer Size of 50000 allows for more episodes to be stored, helps with learning unexpected events + self.env = env + self.learning_rate = learning_rate + self.buffer_size= buffer_size + self.batch_size = batch_size + self.gamma = gamma + self.tensorboard_log = tensorboard_log + + # Hyperparameters, using epsilon decay to improve learning over episodes + # Initially agent acts completely random, as training increases follows optimal policy as experience increases + self.epsilon = 1.0 + self.end_epsilon = 0.05 + self.decay_rate = 0.005 + self.update_frequency = 1000 + self.steps_completed = 0 + + self.experience_memory = Replay_Buffer(buffer_size) # Initialise Replay Buffer + + self.state_dimension = env.observation_space.shape[0] # State Space Dimension + self.action_dimension = env.action_space.n # Action Space Dimension + + self.policy_network = Neural_Network(self.state_dimension, self.action_dimension) # Initialise policy action-value network + self.target_network = Neural_Network(self.state_dimension, self.action_dimension) # Initialise target action-value network + + self.target_network.load_state_dict(self.policy_network.state_dict()) # target network has same initial parmaeters as policy network + self.target_network.eval() + + self.optim = optim.Adam(self.policy_network.parameters(), lr=self.learning_rate) # Initialise Adam Optimizer for LR + + # Initialise Logging for Tensorboard + self.writer = None + if self.tensorboard_log: + os.makedirs(self.tensorboard_log, exist_ok=True) + self.writer = SummaryWriter(log_dir=self.tensorboard_log) + + def action(self, state): + if random.random() < self.epsilon: # Randomly select action if prob < epsilon + return random.randrange(self.action_dimension) + else: # otherwise choose action to maximise q (epsilon greedy) + with torch.no_grad(): # To improve speed and memory for PyTorch + q_values = self.policy_network(torch.FloatTensor(state).unsqueeze(0)) + return q_values.argmax().item() + + def update_network(self): + if len(self.experience_memory) < self.batch_size: + return None + + states, actions, rewards, next_states, dones = self.experience_memory.mini_batch(self.batch_size) + + states = torch.FloatTensor(states) + actions = torch.LongTensor(actions).unsqueeze(1) + rewards = torch.FloatTensor(rewards).unsqueeze(1) + next_states = torch.FloatTensor(next_states) + dones = torch.FloatTensor(dones).unsqueeze(1) + + # Calculate Policy Network Q-Values Q(s, a) + policy_q = self.policy_network(states).gather(1, actions) + + with torch.no_grad(): + # Calculate Target Network Q-Values using Bellman Equation; if terminal state then target_q = rewards + target_q = rewards + (self.gamma * self.target_network(next_states).max(1)[0].unsqueeze(1) * (1 - dones)) + + huber_loss = nn.HuberLoss(delta=1.0)(policy_q, target_q) # We normalise rewards with an upper bound of 1 hence delta of 1.0 + + # Weights only updated for Policy Network using gradient descent, not for Target Network + self.optim.zero_grad() + huber_loss.backward() + self.optim.step() + + return huber_loss.item() + + def learn(self, total_timesteps): + print(f"Train for {total_timesteps} steps") + state, _ = self.env.reset() + episode_reward, episode_count = 0, 0 + + for step in range(1, total_timesteps + 1): + self.steps_completed += 1 + + action = self.action(state) + + next_state, reward, terminated, truncated, _ = self.env.step(action) + episode_reward += reward + + flag = terminated + # Store experience in Replay Buffer + self.experience_memory.store_experience(state, action, reward, next_state, flag) + + state = next_state + + loss_val = self.update_network() + # If episode finishes or terminates after special event (overflow) + if terminated or truncated: + if self.writer: + self.writer.add_scalar("rollout/episode_reward_mean", episode_reward, self.steps_completed) + self.writer.add_scalar("train/epsilon", self.epsilon, self.steps_completed) + + print(f"Step: {self.steps_completed} | Episode Reward: {episode_reward:.3f} | Epsilon: {self.epsilon:.3f}") + + state, _ = self.env.reset() + episode_reward = 0 + episode_count += 1 + + # Epsilon Decay formula + self.epsilon = max(self.end_epsilon, self.epsilon * (1-self.decay_rate)) + + # Every 100 steps add the train/loss value to the log + if loss_val is not None and step % 100 == 0 and self.writer: + self.writer.add_scalar("train/loss", loss_val, self.steps_completed) + + # Every 1000 steps update the target network + if self.steps_completed % self.update_frequency == 0: + self.target_network.load_state_dict(self.policy_network.state_dict()) + + print("Training finished") + if self.writer: + self.writer.flush() + self.writer.close() + + # Save model and .zip file + def save(self, path): + if not path.endswith(".zip"): + path += ".zip" + + print(f"Saving model to {path}") + + buffer = io.BytesIO() + torch.save({ + 'model_state_dictionary': self.policy_network.state_dict(), + 'optimizer_state_dictionary': self.optim.state_dict(), + 'epsilon': self.epsilon}, buffer) + buffer.seek(0) + + with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED) as zf: + zf.writestr("policy.pth", buffer.read()) + + print("Model saved") + +class Double_DQN_Implementation(DQN_Implementation): + + # Only change required is update_network method when calculating target_q from Bellman Equation + def update_network(self): + if len(self.experience_memory) < self.batch_size: + return None + + states, actions, rewards, next_states, dones = self.experience_memory.mini_batch(self.batch_size) + + states = torch.FloatTensor(states) + actions = torch.LongTensor(actions).unsqueeze(1) + rewards = torch.FloatTensor(rewards).unsqueeze(1) + next_states = torch.FloatTensor(next_states) + dones = torch.FloatTensor(dones).unsqueeze(1) + + # Calculate Policy Network Q-Values Q(s, a) + policy_q = self.policy_network(states).gather(1, actions) + + with torch.no_grad(): + # Difference between DQN and DDQN + # Select best actions from policy network, calculate target_q values with these action from the target network + best_next_actions = self.policy_network(next_states).argmax(1).unsqueeze(1) + target_q = rewards + (self.gamma * self.target_network(next_states).gather(1, best_next_actions) * (1 - dones)) + + huber_loss = nn.HuberLoss(delta=1.0)(policy_q, target_q) # We normalise rewards with an upper bound of 1 hence delta of 1.0 + + # Weights only updated for Policy Network using gradient descent, not for Target Network + self.optim.zero_grad() + huber_loss.backward() + self.optim.step() + + return huber_loss.item() + + +class Dueling_DQN_Implementation: + + def __init__(self, env, learning_rate=3e-4, buffer_size=50000, batch_size=64, + gamma=0.99, tensorboard_log=None, hidden=128): + + self.env = env + self.learning_rate = learning_rate + self.buffer_size = buffer_size + self.batch_size = batch_size + self.gamma = gamma + self.tensorboard_log = tensorboard_log + + + self.epsilon = 1.0 + self.end_epsilon = 0.05 + self.decay_rate = 0.005 + self.update_frequency = 1000 + self.steps_completed = 0 + + + self.experience_memory = Replay_Buffer(buffer_size) + + self.state_dimension = env.observation_space.shape[0] + self.action_dimension = env.action_space.n + + + self.policy_network = Dueling_Neural_Network(self.state_dimension, self.action_dimension, hidden=hidden) + self.target_network = Dueling_Neural_Network(self.state_dimension, self.action_dimension, hidden=hidden) + + self.target_network.load_state_dict(self.policy_network.state_dict()) + self.target_network.eval() + + self.optim = optim.Adam(self.policy_network.parameters(), lr=self.learning_rate) + + + self.writer = None + if self.tensorboard_log: + os.makedirs(self.tensorboard_log, exist_ok=True) + self.writer = SummaryWriter(log_dir=self.tensorboard_log) + + def action(self, state): + if random.random() < self.epsilon: + return random.randrange(self.action_dimension) + else: + with torch.no_grad(): + q_values = self.policy_network(torch.FloatTensor(state).unsqueeze(0)) + return q_values.argmax().item() + + def update_network(self): + if len(self.experience_memory) < self.batch_size: + return None + + states, actions, rewards, next_states, dones = self.experience_memory.mini_batch(self.batch_size) + + states = torch.FloatTensor(states) + actions = torch.LongTensor(actions).unsqueeze(1) + rewards = torch.FloatTensor(rewards).unsqueeze(1) + next_states = torch.FloatTensor(next_states) + dones = torch.FloatTensor(dones).unsqueeze(1) + + + policy_q = self.policy_network(states).gather(1, actions) + + with torch.no_grad(): + # vanilla DQN target (max over target net) + next_q_max = self.target_network(next_states).max(1)[0].unsqueeze(1) + target_q = rewards + (self.gamma * next_q_max * (1 - dones)) + + huber_loss = nn.HuberLoss(delta=1.0)(policy_q, target_q) + + self.optim.zero_grad() + huber_loss.backward() + self.optim.step() + + return huber_loss.item() + + def learn(self, total_timesteps): + print(f"Train for {total_timesteps} steps") + state, _ = self.env.reset() + episode_reward = 0 + + for step in range(1, total_timesteps + 1): + self.steps_completed += 1 + + act = self.action(state) + next_state, reward, terminated, truncated, _ = self.env.step(act) + + episode_reward += reward + + done_flag = terminated + self.experience_memory.store_experience(state, act, reward, next_state, done_flag) + + state = next_state + + loss_val = self.update_network() + + if terminated or truncated: + if self.writer: + self.writer.add_scalar("rollout/episode_reward_mean", episode_reward, self.steps_completed) + self.writer.add_scalar("train/epsilon", self.epsilon, self.steps_completed) + + print(f"Step: {self.steps_completed} | Episode Reward: {episode_reward:.3f} | Epsilon: {self.epsilon:.3f}") + + state, _ = self.env.reset() + episode_reward = 0 + + self.epsilon = max(self.end_epsilon, self.epsilon * (1 - self.decay_rate)) + + if loss_val is not None and step % 100 == 0 and self.writer: + self.writer.add_scalar("train/loss", loss_val, self.steps_completed) + + if self.steps_completed % self.update_frequency == 0: + self.target_network.load_state_dict(self.policy_network.state_dict()) + + print("Training finished") + if self.writer: + self.writer.flush() + self.writer.close() + + def save(self, path): + if not path.endswith(".zip"): + path += ".zip" + + print(f"Saving model to {path}") + + buffer = io.BytesIO() + torch.save({ + 'model_state_dictionary': self.policy_network.state_dict(), + 'optimizer_state_dictionary': self.optim.state_dict(), + 'epsilon': self.epsilon + }, buffer) + buffer.seek(0) + + with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED) as zf: + zf.writestr("policy.pth", buffer.read()) + + print("Model saved") + +######################################################################################################################################## +# Distributional DQN + +class Distributional_Neural_Network(nn.Module): + def __init__(self, state_dimension, action_dimension, n_atoms=51, hidden=128): + super().__init__() + self.action_dimension = action_dimension + self.n_atoms = n_atoms + + self.net = nn.Sequential( + nn.Linear(state_dimension, hidden), + nn.ReLU(), + nn.Linear(hidden, hidden), + nn.ReLU(), + nn.Linear(hidden, action_dimension * n_atoms) + ) + + def forward(self, x): + logits = self.net(x) # (B, A*N) + logits = logits.view(-1, self.action_dimension, self.n_atoms) # (B, A, N) + return logits + +class Distributional_DQN_Implementation: + def __init__( + self, + env, + learning_rate=3e-4, + buffer_size=50000, + batch_size=64, + gamma=0.99, + tensorboard_log=None, + n_atoms=51, + v_min=-10.0, + v_max=10.0, + hidden=128, + update_frequency=1000, + decay_rate=0.005, + end_epsilon=0.05, + device=None + ): + self.env = env + self.learning_rate = learning_rate + self.buffer_size = buffer_size + self.batch_size = batch_size + self.gamma = gamma + self.tensorboard_log = tensorboard_log + + # Epsilon-greedy params + self.epsilon = 1.0 + self.end_epsilon = end_epsilon + self.decay_rate = decay_rate + + self.update_frequency = update_frequency + self.steps_completed = 0 + + self.experience_memory = Replay_Buffer(buffer_size) + + self.state_dimension = env.observation_space.shape[0] + self.action_dimension = env.action_space.n + + # Distribution support (atoms) + self.n_atoms = n_atoms + self.v_min = float(v_min) + self.v_max = float(v_max) + self.delta_z = (self.v_max - self.v_min) / (self.n_atoms - 1) + + z = torch.linspace(self.v_min, self.v_max, self.n_atoms) + self.device = device or ("cuda" if torch.cuda.is_available() else "cpu") + self.z = z.to(self.device) # (N,) + + self.policy_network = Distributional_Neural_Network( + self.state_dimension, self.action_dimension, n_atoms=self.n_atoms, hidden=hidden + ).to(self.device) + + self.target_network = Distributional_Neural_Network( + self.state_dimension, self.action_dimension, n_atoms=self.n_atoms, hidden=hidden + ).to(self.device) + + self.target_network.load_state_dict(self.policy_network.state_dict()) + self.target_network.eval() + + self.optim = optim.Adam(self.policy_network.parameters(), lr=self.learning_rate) + + # TensorBoard + self.writer = None + if self.tensorboard_log: + os.makedirs(self.tensorboard_log, exist_ok=True) + self.writer = SummaryWriter(log_dir=self.tensorboard_log) + + def _dist(self, logits): + # Convert logits -> probability distribution over atoms + return torch.softmax(logits, dim=-1) # (..., N) + + def _expected_q(self, logits): + # E[Z] for each action: sum p(z_i) * z_i + probs = self._dist(logits) # (B, A, N) + q = torch.sum(probs * self.z.view(1, 1, -1), dim=-1) # (B, A) + return q + + def action(self, state): + if random.random() < self.epsilon: + return random.randrange(self.action_dimension) + else: + with torch.no_grad(): + s = torch.FloatTensor(state).unsqueeze(0).to(self.device) + logits = self.policy_network(s) # (1, A, N) + q_vals = self._expected_q(logits) # (1, A) + return q_vals.argmax(dim=1).item() + + @torch.no_grad() + def _project_distribution(self, rewards, dones, next_logits): + """ + C51 projection: + Tz = r + gamma*(1-done)*z + Project onto fixed support [v_min, v_max] + """ + # next_logits: (B, A, N) + next_probs = self._dist(next_logits) # (B, A, N) + + # Greedy action by expected value under target distributions + next_q = torch.sum(next_probs * self.z.view(1, 1, -1), dim=-1) # (B, A) + next_actions = next_q.argmax(dim=1, keepdim=True) # (B, 1) + + # Pick distribution for best action: (B, N) + next_probs_a = next_probs.gather(1, next_actions.unsqueeze(-1).expand(-1, -1, self.n_atoms)).squeeze(1) + + # Bellman update on atoms + # rewards, dones: (B, 1) + Tz = rewards + self.gamma * (1.0 - dones) * self.z.view(1, -1) # (B, N) + Tz = torch.clamp(Tz, self.v_min, self.v_max) + + # Compute projection locations + b = (Tz - self.v_min) / self.delta_z # (B, N) + l = torch.floor(b).long() + u = torch.ceil(b).long() + + # Fix possible numerical issues where l==u + l = torch.clamp(l, 0, self.n_atoms - 1) + u = torch.clamp(u, 0, self.n_atoms - 1) + + m = torch.zeros_like(next_probs_a) # (B, N) + + # Distribute probability mass + # m[l] += p * (u - b) + # m[u] += p * (b - l) + offset = torch.arange(next_probs_a.size(0), device=self.device).unsqueeze(1) # (B,1) + + m.view(-1).index_add_( + 0, + (l + offset * self.n_atoms).view(-1), + (next_probs_a * (u.float() - b)).view(-1) + ) + m.view(-1).index_add_( + 0, + (u + offset * self.n_atoms).view(-1), + (next_probs_a * (b - l.float())).view(-1) + ) + + # m is the projected target distribution (B, N) + return m + + def update_network(self): + if len(self.experience_memory) < self.batch_size: + return None + + states, actions, rewards, next_states, dones = self.experience_memory.mini_batch(self.batch_size) + + states = torch.FloatTensor(states).to(self.device) + actions = torch.LongTensor(actions).unsqueeze(1).to(self.device) # (B,1) + rewards = torch.FloatTensor(rewards).unsqueeze(1).to(self.device) # (B,1) + next_states = torch.FloatTensor(next_states).to(self.device) + dones = torch.FloatTensor(dones).unsqueeze(1).to(self.device) # (B,1) + + # Current logits for all actions + logits = self.policy_network(states) # (B, A, N) + # Select logits for taken actions: (B, N) + logits_a = logits.gather(1, actions.unsqueeze(-1).expand(-1, -1, self.n_atoms)).squeeze(1) + + # Target projected distribution m: (B, N) + with torch.no_grad(): + next_logits = self.target_network(next_states) # (B, A, N) + target_dist = self._project_distribution(rewards, dones, next_logits) # (B, N) + + # Cross-entropy loss: - sum m * log p + log_probs = torch.log_softmax(logits_a, dim=-1) # (B, N) + loss = -(target_dist * log_probs).sum(dim=-1).mean() + + self.optim.zero_grad() + loss.backward() + self.optim.step() + + return loss.item() + def save(self, path): + if not path.endswith(".zip"): + path += ".zip" + + print(f"Saving model to {path}") + + buffer = io.BytesIO() + torch.save({ + 'model_state_dictionary': self.policy_network.state_dict(), + 'optimizer_state_dictionary': self.optim.state_dict(), + 'epsilon': self.epsilon + }, buffer) + buffer.seek(0) + + with zipfile.ZipFile(path, 'w', zipfile.ZIP_DEFLATED) as zf: + zf.writestr("policy.pth", buffer.read()) + + print("Model saved") + + + def learn(self, total_timesteps): + print(f"Train for {total_timesteps} steps | device={self.device}") + state, _ = self.env.reset() + episode_reward = 0.0 + + for step in range(1, total_timesteps + 1): + self.steps_completed += 1 + + act = self.action(state) + next_state, reward, terminated, truncated, _ = self.env.step(act) + + episode_reward += reward + + done_flag = terminated # follow your convention + self.experience_memory.store_experience(state, act, reward, next_state, done_flag) + state = next_state + + loss_val = self.update_network() + + if terminated or truncated: + if self.writer: + self.writer.add_scalar("rollout/episode_reward_mean", episode_reward, self.steps_completed) + self.writer.add_scalar("train/epsilon", self.epsilon, self.steps_completed) + + print(f"Step: {self.steps_completed} | Episode Reward: {episode_reward:.3f} | Epsilon: {self.epsilon:.3f}") + + state, _ = self.env.reset() + episode_reward = 0.0 + + self.epsilon = max(self.end_epsilon, self.epsilon * (1 - self.decay_rate)) + + if loss_val is not None and step % 100 == 0 and self.writer: + self.writer.add_scalar("train/loss", loss_val, self.steps_completed) + + if self.steps_completed % self.update_frequency == 0: + self.target_network.load_state_dict(self.policy_network.state_dict()) + + print("Training finished") + if self.writer: + self.writer.flush() + self.writer.close() \ No newline at end of file diff --git a/wds_dist_logs.zip b/wds_dist_logs.zip new file mode 100644 index 0000000..d775da8 Binary files /dev/null and b/wds_dist_logs.zip differ diff --git a/wds_dueling_logs.zip b/wds_dueling_logs.zip new file mode 100644 index 0000000..4934e08 Binary files /dev/null and b/wds_dueling_logs.zip differ