# DQN.py
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam
import numpy as np
import random, time, keras, json, os
from collections import deque
class DQN:
    def __init__(self, sim, name="no_name", verbose=True):
        # Constants and such
        self.sim = sim
        self.name = name
        self.METADATA = sim.METADATA
        self.action_size = self.sim.n_actions
        self.DEBUG = sim.DEBUG
        self.verbose = verbose
        # DQN memory
        self.memory = deque(maxlen=self.METADATA['memory_size'])
        # Information to save to file
        self.logs = {
            'best_reward'   : -10000,
            'total_rewards' : list(),
            'agent_pos'     : list(),
            'agent_deaths'  : list(),
            'maps'          : list(),
            'init_memories' : 0,
            'total_time'    : 0,
            'n_episodes'    : 0,
        }
        # DQN parameters
        self.max_eps = self.METADATA['max_eps']
        self.min_eps = self.METADATA['min_eps']
        self.eps_decay_rate = self.METADATA['eps_decay_rate']
        self.eps = self.max_eps
        self.gamma = self.METADATA['gamma']
        self.alpha = self.METADATA['alpha']
        self.target_update_freq = self.METADATA['target_update']
        # Network and target network
        self.model = self.make_network()
        self.target = keras.models.clone_model(self.model)
        self.target.set_weights(self.model.get_weights())
        # Print constants
        if self.verbose:
            width, height = self.METADATA['width'], self.METADATA['height']
            print("\n\t[Parameters]")
            print("[decay]", self.METADATA['eps_decay_rate'])
            print("[alpha]", self.METADATA['alpha'])
            print("[gamma]", self.METADATA['gamma'])
            print("[batch]", self.METADATA['batch_size'])
            print("[size]", f"{width}x{height}")
            print("[wind speed]", self.METADATA['wind'][0])
            print("[target upd]", self.METADATA['target_update'], "\n")
    '''
    Main methods related to learning
    '''
    # The learning algorithm
    def learn(self, n_episodes=1000):
        # Time the entire run
        start_time = time.time()
        # Save the total number of episodes
        self.logs['n_episodes'] = n_episodes
        # Initialize the counter used to update the target network at fixed intervals
        target_update_counter = self.target_update_freq
        # Start the main learning loop
        for episode in range(n_episodes):
            # Initialize the done flag, the reward accumulator, the timer and the rewards list
            done = False
            total_reward = 0
            t0 = time.time()
            rewards = list()
            # Initialize the state, and reshape because Keras expects the
            # first dimension to be the batch size
            state = self.sim.reset()
            state = np.reshape(state, [1] + list(state.shape))
            # Keep track of the agent starting positions
            if self.DEBUG > 0:
                agent_x, agent_y = self.sim.W.agents[0].x, self.sim.W.agents[0].y
                self.logs['agent_pos'].append((agent_x, agent_y))
            # Start the simulation episode
            while not done:
                # Execute an action following the e-greedy policy
                action = self.choose_action(state)
                sprime, reward, done, _ = self.sim.step(action)
                sprime = np.reshape(sprime, [1] + list(sprime.shape))
                # Store the observed experience in memory
                self.remember(state, action, reward, sprime, done)
                # If we have collected enough experiences, learn from memory
                if len(self.memory) > self.METADATA['batch_size']:
                    self.replay()
                # Every set number of iterations, update the target network
                target_update_counter -= 1
                if target_update_counter == 0:
                    target_update_counter = self.target_update_freq
                    self.target.set_weights(self.model.get_weights())
                # Set the state S to be the next state S', for the next iteration
                state = sprime
                # Keep track of the rewards and the total accumulated reward
                total_reward += reward
                rewards.append(reward)
            # Keep track of agent deaths
            if self.DEBUG > 0:
                if len(self.sim.W.agents) == 0:
                    self.logs['agent_deaths'].append(True)
                else:
                    self.logs['agent_deaths'].append(False)
            # If the episode was reasonably successful, render its final state
            if total_reward >= 0.9 * self.logs['best_reward'] or total_reward > 300:
                map_string = self.sim.render()
                if total_reward > self.logs['best_reward']:
                    self.logs['best_reward'] = total_reward
                # Save the state of the map
                if self.DEBUG > 0:
                    self.logs['maps'].append([episode, map_string])
            # Print some information about the episode
            print(f"[Episode {episode + 1}]\tTime: {round(time.time() - t0, 3)}")
            print(f"\t\tEpsilon: {round(self.eps, 3)}")
            print(f"\t\tAgent dead: {len(self.sim.W.agents) == 0}")
            print(f"\t\tReward: {round(total_reward, 0)}\n")
            # Decay the epsilon value for the next episode
            self.decay_epsilon(episode)
            # Log the rewards over time
            self.logs['total_rewards'].append(total_reward)
        # Save the total time taken for this run
        self.logs['total_time'] = round(time.time() - start_time, 3)
        # Write logs and model to file
        self.write_data()
    # Fit the model with a random sample taken from the memory
    def replay(self):
        states_batch = list()
        predicted_batch = list()
        # Sample a random batch of experiences from memory
        batch = random.sample(self.memory, self.METADATA['batch_size'])
        for state, action, reward, sprime, done in batch:
            # Get the prediction for the state S
            prediction = self.target.predict(state)[0]
            # If S was a terminal state, the cumulative reward from that
            # state forward is simply the reward received for that state
            if done:
                prediction[action] = reward
            # Otherwise, estimate the cumulative reward from that state
            # forward by using the maximum Q-value of the state S' as a
            # proxy (= bootstrapping via TD methods)
            else:
                predQ = np.amax(self.target.predict(sprime)[0])
                prediction[action] = reward + self.gamma * predQ
            # Store the states and their updated predictions for this batch
            states_batch.append(state[0])
            predicted_batch.append(prediction)
        # Convert the batch into numpy arrays for Keras and fit the model
        states = np.array(states_batch)
        predictions = np.array(predicted_batch)
        self.model.fit(states, predictions, epochs=1, verbose=0)
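    # Worked example of the target computed in replay() above, with purely
    # illustrative numbers: for gamma = 0.99, an observed reward of 5 and
    # max_a' Q_target(s', a') = 10, the regression target for the chosen
    # action is 5 + 0.99 * 10 = 14.9; for a terminal transition the target
    # is simply the observed reward.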
    # Choose an action A based on state S following the e-greedy policy
    def choose_action(self, state, eps=None):
        # Epsilon is either taken from current value, or passed as argument
        eps_threshold = self.eps if eps is None else eps
        # Either choose action with highest Q-value or a random action
        if random.uniform(0, 1) > eps_threshold:
            return np.argmax(self.model.predict(state)[0])
        else:
            return np.random.choice(self.METADATA['n_actions'])
    # Decay the epsilon value exponentially as a function of the episode number
    def decay_epsilon(self, episode_num=None):
        self.eps = self.min_eps \
                   + (self.max_eps - self.min_eps) \
                   * np.exp(-self.eps_decay_rate * episode_num)
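    # Example of the schedule in decay_epsilon() with hypothetical parameter
    # values (the real ones come from METADATA): for max_eps = 1.0,
    # min_eps = 0.01 and eps_decay_rate = 0.01, epsilon is 1.0 at episode 0,
    # roughly 0.01 + 0.99 * exp(-1) ~= 0.37 at episode 100, and roughly
    # 0.01 + 0.99 * exp(-5) ~= 0.017 at episode 500.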
    # Store an experience in memory
    def remember(self, state, action, reward, sprime, done):
        self.memory.append((state, action, reward, sprime, done))
    # Create the neural network
    def make_network(self):
        input_shape = (self.sim.W.WIDTH, self.sim.W.HEIGHT, self.sim.W.DEPTH)
        layers = [
            Flatten(input_shape=input_shape),
            # One hidden layer with 50 neurons and a sigmoid activation function
            Dense(units=50,
                  activation='sigmoid'),
            # Output layer has a linear activation function
            Dense(units=self.action_size,
                  activation='linear'),
        ]
        '''
        TODO: Consider using an initializer for the layers:
              bias_initializer='random_uniform',
              kernel_initializer='random_uniform'
        '''
        model = Sequential(layers)
        # Compile the model with a mean squared error loss
        model.compile(loss='mse',
                      # And an Adam optimizer with gradient clipping
                      optimizer=Adam(lr=self.alpha,
                                     clipvalue=1))
        if self.verbose:
            model.summary()
        return model
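    # Shape sketch for make_network(), assuming (purely as an example) a
    # 10x10 grid with a depth of 4 feature planes: Flatten maps the
    # (10, 10, 4) observation to a 400-vector, the hidden layer maps that to
    # 50 sigmoid units, and the linear output layer emits one Q-value per
    # action. The actual sizes depend on the simulation's WIDTH/HEIGHT/DEPTH.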
    '''
    Miscellaneous helper methods
    '''
    # Play the simulation by following the optimal policy
    def play_optimal(self, eps=0):
        done = False
        total_reward = 0
        state = self.sim.reset()
        while not done:
            self.sim.render()
            state = np.reshape(state, [1] + list(state.shape))
            self.show_info(state)
            action = self.choose_action(state, eps=eps)
            state, reward, done, _ = self.sim.step(action)
            total_reward += reward
            time.sleep(0.1)
        self.sim.render()
        print(f"Total reward: {total_reward}")
    # Show the Q-values for each action in the state, the best action, and wind info
    def show_info(self, state):
        w_speed, w_vector = self.sim.W.wind_speed, self.sim.W.wind_vector
        print(f"Wind Speed: {w_speed}")
        print(f"Wind direction: {w_vector}")
        # Predict the Q-values via the network
        QVals = self.model.predict(state)[0]
        # Print the Q-values and their maximum
        key_map = {0:'N', 1:'S', 2:'E', 3:'W', 4:'D', 5:' '}
        print("| ", end="")
        for idx, val in enumerate(QVals):
            val = round(val, 2)
            direction = key_map[idx]
            extra_space = " " if val > 0 else ""
            print(f"{direction} : {extra_space}{val:.2f} | ", end="")
            if idx == 1:
                print("\n| ", end="")
        print(f"\nBest Action: {key_map[np.argmax(QVals)]}\n")
    '''
    Collects memories as follows:
    Make the agent walk somewhat randomly, but always such that it walks clockwise
    around the fire. So if the agent is below and to the right of the fire, it will
    choose an action to go either left or downwards until it is below the fire, and
    then it will go either up or left. When the fire is contained, the simulation is
    reset. Only the memories that lead to a successful containment of the fire are
    collected.
    '''
    def collect_memories(self, num_of_episodes=100, perform_baseline=False):
        if not num_of_episodes:
            return
        # Wipe internal memory
        self.memory = deque()
        success_count = 0
        episode = 0
        # While memory is not filled up
        while True:
            total_reward = 0
            memories = list()
            done = False
            state = self.sim.reset()
            state = np.reshape(state, [1] + list(state.shape))
            while not done:
                # Choose an action
                action = self.choose_randomwalk_action()
                # Observe sprime and reward
                sprime, reward, done, _ = self.sim.step(action)
                sprime = np.reshape(sprime, [1] + list(sprime.shape))
                # Collect memories
                memories.append((state, action, reward, sprime, done))
                state = sprime
                total_reward += reward
                # Only if we contained the fire do we collect the memories
                if not perform_baseline and reward == self.METADATA['contained_bonus']:
                    success_count += 1
                    # Store the successful experience in memory
                    for state, action, reward, sprime, done in memories:
                        self.remember(state, action, reward, sprime, done)
                    # Set done to true
                    done = True
                    # Collect logging info and return
                    if success_count == num_of_episodes:
                        self.logs['init_memories'] = len(self.memory)
                        return
            # Collect logging data and print status if we are doing a baseline run
            if perform_baseline:
                self.logs['total_rewards'].append(total_reward)
                if episode % 100 == 0:
                    print(f"Episode {episode}/{num_of_episodes}")
                if len(self.sim.W.agents) == 0:
                    self.logs['agent_deaths'].append(True)
                else:
                    self.logs['agent_deaths'].append(False)
                # Stop if we are done
                if episode == num_of_episodes - 1:
                    self.logs['n_episodes'] = num_of_episodes
                    break
            episode += 1
        # Write a (sparse) log to file (only reached when doing a baseline run)
        self.write_data()
    # Choose an action depending on the agent's position relative to the fire.
    # The action should be safe (avoiding the fire), if possible
    def choose_randomwalk_action(self, avoid_fire=True):
        # It can happen (e.g. with SARSA-style updates) that an action is requested
        # after the agent has died. That action is never used, so it is irrelevant
        if not self.sim.W.agents:
            return 0
        key_map = {'N':0, 'S':1, 'E':2, 'W':3}
        width, height = self.sim.W.WIDTH, self.sim.W.HEIGHT
        agent_x, agent_y = self.sim.W.agents[0].x, self.sim.W.agents[0].y
        mid_x, mid_y = (int(width / 2), int(height / 2))
        # Loop to try to avoid choosing actions that lead to death
        count = 0
        while True:
            # The chosen action should always make the agent go around the fire
            if agent_x >= mid_x and agent_y > mid_y:
                possible_actions = ["S", "W"]
            if agent_x > mid_x and agent_y <= mid_y:
                possible_actions = ["S", "E"]
            if agent_x <= mid_x and agent_y < mid_y:
                possible_actions = ["N", "E"]
            if agent_x < mid_x and agent_y >= mid_y:
                possible_actions = ["N", "W"]
            # Choose randomly from the valid actions
            action = key_map[np.random.choice(possible_actions)]
            if not avoid_fire:
                break
            # Break when it is a safe move or when we have tried too often
            fire_at_loc = self.sim.W.agents[0].fire_in_direction(action)
            if not fire_at_loc or count > 10:
                break
            count += 1
        return action
    # Writes the logs and the metadata to a file with an appropriate name
    def write_data(self):
        # Also save metadata
        self.logs['metadata'] = self.METADATA
        # Create filename
        n_episodes = self.logs['n_episodes']
        if n_episodes >= 1000:
            n_episodes /= 1000
        else:
            n_episodes = 0
        memories = self.logs['init_memories']
        name = self.sim.get_name(self.sim.W.WIDTH,
                                 int(n_episodes), memories, self.name)
        # Append an increasing counter to the name until it no longer collides
        # with an existing log or model file
        base_name = name
        counter = 0
        while os.path.isfile("Logs/" + name) or os.path.isfile("Models/" + name):
            name = base_name + str(counter)
            counter += 1
        # If the folders don't exist, create them
        if not os.path.exists("Logs/"):
            os.makedirs("Logs/")
        if not os.path.exists("Models/"):
            os.makedirs("Models/")
        # Write model
        self.save_model(name)
        # Write logs
        with open("Logs/" + name, 'w') as file:
            json.dump(self.logs, file)
    # Loads the weights of the model from file
    def load_model(self):
        all_names = os.listdir("Models")
        print("\nChoose a model from the list below to load:")
        for idx, name in enumerate(all_names):
            print(f"\t[{idx}] {name}")
        try:
            selection = int(input(f"Select one [0-{len(all_names) - 1}]: \n"))
            name = os.path.join("Models", all_names[selection])
            self.model.load_weights(name)
            print("Model loaded!")
        except (ValueError, IndexError):
            print("Invalid Selection")
    # Saves the weights of the model to file
    def save_model(self, name):
        name = "Models/" + name
        self.model.save_weights(name)
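# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module): DQN only needs
# an object exposing METADATA, n_actions, DEBUG, a world `W`, and the
# reset()/step()/render()/get_name() methods used above. The `Simulation`
# import below is an assumption about the surrounding project and may need to
# be adapted to the actual module name.
#
# if __name__ == "__main__":
#     from Simulation import Simulation   # hypothetical module/class name
#     sim = Simulation()                   # assumed to provide METADATA, W, etc.
#     agent = DQN(sim, name="demo")
#     agent.collect_memories(num_of_episodes=10)   # optional warm-up memories
#     agent.learn(n_episodes=500)                  # train, then write logs/model
#     agent.play_optimal()                         # greedy rollout with eps=0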