From 392c74e2026dd227202414a8667b4c67f19eb7e6 Mon Sep 17 00:00:00 2001 From: pseudo-rnd-thoughts Date: Thu, 21 Aug 2025 22:32:55 +0100 Subject: [PATCH] Upgrade to Gymnasium rather than Gym --- README.md | 2 +- distributed/rpc/batch/reinforce.py | 11 +++++------ distributed/rpc/batch/requirements.txt | 2 +- docs/source/index.rst | 6 +++--- reinforcement_learning/actor_critic.py | 19 ++++++++----------- reinforcement_learning/reinforce.py | 15 +++++++-------- reinforcement_learning/requirements.txt | 5 ++--- 7 files changed, 27 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 03de115b22..ae16f304ed 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ https://pytorch.org/examples/ - [Variational Auto-Encoders](./vae/README.md) - [Superresolution using an efficient sub-pixel convolutional neural network](./super_resolution/README.md) - [Hogwild training of shared ConvNets across multiple processes on MNIST](mnist_hogwild) -- [Training a CartPole to balance in OpenAI Gym with actor-critic](./reinforcement_learning/README.md) +- [Training a CartPole to balance with actor-critic](./reinforcement_learning/README.md) - [Natural Language Inference (SNLI) with GloVe vectors, LSTMs, and torchtext](snli) - [Time sequence prediction - use an LSTM to learn Sine waves](./time_sequence_prediction/README.md) - [Implement the Neural Style Transfer algorithm on images](./fast_neural_style/README.md) diff --git a/distributed/rpc/batch/reinforce.py b/distributed/rpc/batch/reinforce.py index 13a06315de..93661229d5 100644 --- a/distributed/rpc/batch/reinforce.py +++ b/distributed/rpc/batch/reinforce.py @@ -1,5 +1,5 @@ import argparse -import gym +import gymnasium as gym import os import threading import time @@ -68,7 +68,7 @@ class Observer: def __init__(self, batch=True): self.id = rpc.get_worker_info().id - 1 self.env = gym.make('CartPole-v1') - self.env.seed(args.seed) + self.env.reset(seed=args.seed) self.select_action = Agent.select_action_batch if batch else Agent.select_action def run_episode(self, agent_rref, n_steps): @@ -92,10 +92,10 @@ def run_episode(self, agent_rref, n_steps): ) # apply the action to the environment, and get the reward - state, reward, done, _ = self.env.step(action) + state, reward, terminated, truncated, _ = self.env.step(action) rewards[step] = reward - if done or step + 1 >= n_steps: + if terminated or truncated or step + 1 >= n_steps: curr_rewards = rewards[start_step:(step + 1)] R = 0 for i in range(curr_rewards.numel() -1, -1, -1): @@ -226,8 +226,7 @@ def run_worker(rank, world_size, n_episode, batch, print_log=True): last_reward, running_reward = agent.run_episode(n_steps=NUM_STEPS) if print_log: - print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format( - i_episode, last_reward, running_reward)) + print(f'Episode {i_episode}\tLast reward: {last_reward:.2f}\tAverage reward: {running_reward:.2f}') else: # other ranks are the observer rpc.init_rpc(OBSERVER_NAME.format(rank), rank=rank, world_size=world_size) diff --git a/distributed/rpc/batch/requirements.txt b/distributed/rpc/batch/requirements.txt index 2e4bc8c273..597fddb1bb 100644 --- a/distributed/rpc/batch/requirements.txt +++ b/distributed/rpc/batch/requirements.txt @@ -1,4 +1,4 @@ torch==2.2.0 torchvision==0.7.0 numpy -gym +gymnasium diff --git a/docs/source/index.rst b/docs/source/index.rst index 1436b85d1a..819dcc248d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -88,12 +88,12 @@ experiment with PyTorch. `GO TO EXAMPLE `__ :opticon:`link-external` --- - Training a CartPole to balance in OpenAI Gym with actor-critic - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + Training a CartPole to balance with actor-critic + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This reinforcement learning tutorial demonstrates how to train a CartPole to balance - in the `OpenAI Gym `__ toolkit by using the + in the `Gymnasium `__ toolkit by using the `Actor-Critic `__ method. `GO TO EXAMPLE `__ :opticon:`link-external` diff --git a/reinforcement_learning/actor_critic.py b/reinforcement_learning/actor_critic.py index c5a3ee6d79..c9cf3b9c61 100644 --- a/reinforcement_learning/actor_critic.py +++ b/reinforcement_learning/actor_critic.py @@ -1,5 +1,5 @@ import argparse -import gym +import gymnasium as gym import numpy as np from itertools import count from collections import namedtuple @@ -24,7 +24,8 @@ args = parser.parse_args() -env = gym.make('CartPole-v1') +render_mode = "human" if args.render else None +env = gym.make('CartPole-v1', render_mode=render_mode) env.reset(seed=args.seed) torch.manual_seed(args.seed) @@ -152,14 +153,11 @@ def main(): action = select_action(state) # take the action - state, reward, done, _, _ = env.step(action) - - if args.render: - env.render() + state, reward, terminated, truncated, _ = env.step(action) model.rewards.append(reward) ep_reward += reward - if done: + if terminated or truncated: break # update cumulative reward @@ -170,13 +168,12 @@ def main(): # log results if i_episode % args.log_interval == 0: - print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format( - i_episode, ep_reward, running_reward)) + print(f'Episode {i_episode}\tLast reward: {ep_reward:.2f}\tAverage reward: {running_reward:.2f}') # check if we have "solved" the cart pole problem if running_reward > env.spec.reward_threshold: - print("Solved! Running reward is now {} and " - "the last episode runs to {} time steps!".format(running_reward, t)) + print(f"Solved! Running reward is now {running_reward} and " + f"the last episode runs to {t} time steps!") break diff --git a/reinforcement_learning/reinforce.py b/reinforcement_learning/reinforce.py index 961598174c..048ea99f37 100644 --- a/reinforcement_learning/reinforce.py +++ b/reinforcement_learning/reinforce.py @@ -1,5 +1,5 @@ import argparse -import gym +import gymnasium as gym import numpy as np from itertools import count from collections import deque @@ -22,7 +22,8 @@ args = parser.parse_args() -env = gym.make('CartPole-v1') +render_mode = "human" if args.render else None +env = gym.make('CartPole-v1', render_mode=render_mode) env.reset(seed=args.seed) torch.manual_seed(args.seed) @@ -85,22 +86,20 @@ def main(): ep_reward = 0 for t in range(1, 10000): # Don't infinite loop while learning action = select_action(state) - state, reward, done, _, _ = env.step(action) + state, reward, terminated, truncated, _ = env.step(action) if args.render: env.render() policy.rewards.append(reward) ep_reward += reward - if done: + if terminated or truncated: break running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward finish_episode() if i_episode % args.log_interval == 0: - print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format( - i_episode, ep_reward, running_reward)) + print(f'Episode {i_episode}\tLast reward: {ep_reward:.2f}\tAverage reward: {running_reward:.2f}') if running_reward > env.spec.reward_threshold: - print("Solved! Running reward is now {} and " - "the last episode runs to {} time steps!".format(running_reward, t)) + print(f"Solved! Running reward is now {running_reward} and the last episode runs to {t} time steps!") break diff --git a/reinforcement_learning/requirements.txt b/reinforcement_learning/requirements.txt index bd262b2733..e21bf14fd9 100644 --- a/reinforcement_learning/requirements.txt +++ b/reinforcement_learning/requirements.txt @@ -1,4 +1,3 @@ torch -numpy<2 -gym -pygame +numpy +gymnasium[classic-control]