# DQN.py
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam
import numpy as np
import random, time, keras, json, os
from collections import deque
class DQN:
    def __init__(self, sim, name="no_name", verbose=True):
        # Constants and such
        self.sim = sim
        self.name = name
        self.METADATA = sim.METADATA
        self.action_size = self.sim.n_actions
        self.DEBUG = sim.DEBUG
        self.verbose = verbose
        # DQN memory
        self.memory = deque(maxlen=self.METADATA['memory_size'])
        # Information to save to file
        self.logs = {
            'best_reward'   : -10000,
            'total_rewards' : list(),
            'agent_pos'     : list(),
            'agent_deaths'  : list(),
            'maps'          : list(),
            'init_memories' : 0,
            'total_time'    : 0,
            'n_episodes'    : 0,
        }
        # DQN parameters
        self.max_eps = self.METADATA['max_eps']
        self.min_eps = self.METADATA['min_eps']
        self.eps_decay_rate = self.METADATA['eps_decay_rate']
        self.eps = self.max_eps
        self.gamma = self.METADATA['gamma']
        self.alpha = self.METADATA['alpha']
        self.target_update_freq = self.METADATA['target_update']
        # Network and target network
        self.model = self.make_network()
        self.target = keras.models.clone_model(self.model)
        self.target.set_weights(self.model.get_weights())
        # Print constants
        if self.verbose:
            width, height = self.METADATA['width'], self.METADATA['height']
            print("\n\t[Parameters]")
            print("[decay]", self.METADATA['eps_decay_rate'])
            print("[alpha]", self.METADATA['alpha'])
            print("[gamma]", self.METADATA['gamma'])
            print("[batch]", self.METADATA['batch_size'])
            print("[size]", f"{width}x{height}")
            print("[wind speed]", self.METADATA['wind'][0])
            print("[target upd]", self.METADATA['target_update'], "\n")
    '''
    Main methods related to learning
    '''
    # The learning algorithm
    def learn(self, n_episodes=1000):
        # Time the entire run
        start_time = time.time()
        # Save the total number of episodes
        self.logs['n_episodes'] = n_episodes
        # Initialize the counter used to update the target network at fixed intervals
        target_update_counter = self.target_update_freq
        # Start the main learning loop
        for episode in range(n_episodes):
            # Initialize the done flag, the reward accumulator, the timer and the rewards list
            done = False
            total_reward = 0
            t0 = time.time()
            rewards = list()
            # Initialize the state, and reshape because Keras expects the
            # first dimension to be the batch size
            state = self.sim.reset()
            state = np.reshape(state, [1] + list(state.shape))
            # Keep track of the agent starting positions
            if self.DEBUG > 0:
                agent_x, agent_y = self.sim.W.agents[0].x, self.sim.W.agents[0].y
                self.logs['agent_pos'].append((agent_x, agent_y))
            # Start the simulation episode
            while not done:
                # Execute an action following the e-greedy policy
                action = self.choose_action(state)
                sprime, reward, done, _ = self.sim.step(action)
                sprime = np.reshape(sprime, [1] + list(sprime.shape))
                # Store the observed experience in memory
                self.remember(state, action, reward, sprime, done)
                # If we have collected enough experiences, learn from memory
                if len(self.memory) > self.METADATA['batch_size']:
                    self.replay()
                # Every set number of iterations, update the target network
                target_update_counter -= 1
                if target_update_counter == 0:
                    target_update_counter = self.target_update_freq
                    self.target.set_weights(self.model.get_weights())
                # Set the state S to be the next state S', for the next iteration
                state = sprime
                # Keep track of the rewards and the total accumulated reward
                total_reward += reward
                rewards.append(reward)
            # Keep track of agent deaths
            if self.DEBUG > 0:
                if len(self.sim.W.agents) == 0:
                    self.logs['agent_deaths'].append(True)
                else:
                    self.logs['agent_deaths'].append(False)
            # If the episode was reasonably successful, render its final state
            if total_reward >= 0.9 * self.logs['best_reward'] or total_reward > 300:
                map_string = self.sim.render()
                if total_reward > self.logs['best_reward']:
                    self.logs['best_reward'] = total_reward
                # Save the state of the map
                if self.DEBUG > 0:
                    self.logs['maps'].append([episode, map_string])
            # Print some information about the episode
            print(f"[Episode {episode + 1}]\tTime: {round(time.time() - t0, 3)}")
            print(f"\t\tEpsilon: {round(self.eps, 3)}")
            print(f"\t\tAgent dead: {len(self.sim.W.agents) == 0}")
            print(f"\t\tReward: {round(total_reward, 0)}\n")
            # Decay the epsilon value for the next episode
            self.decay_epsilon(episode)
            # Log the rewards over time
            self.logs['total_rewards'].append(total_reward)
        # Save the total time taken for this run
        self.logs['total_time'] = round(time.time() - start_time, 3)
        # Write logs and model to file
        self.write_data()
    # Fit the model with a random sample taken from the memory
    def replay(self):
        states_batch = list()
        predicted_batch = list()
        # Sample a random batch of experiences from memory
        batch = random.sample(self.memory, self.METADATA['batch_size'])
        for state, action, reward, sprime, done in batch:
            # Get the prediction for the state S
            prediction = self.target.predict(state)[0]
            # If S was a terminal state, the cumulative reward from that
            # state forward is simply the reward received for that state
            if done:
                prediction[action] = reward
            # Otherwise, estimate the cumulative reward from that state
            # forward by using the maximum Q-value of the state S' as a
            # proxy (= bootstrapping via TD methods)
            else:
                predQ = np.amax(self.target.predict(sprime)[0])
                prediction[action] = reward + self.gamma * predQ
            # Store the states and their updated predictions for this batch
            states_batch.append(state[0])
            predicted_batch.append(prediction)
        # Convert the batch into numpy arrays for Keras and fit the model
        states = np.array(states_batch)
        predictions = np.array(predicted_batch)
        self.model.fit(states, predictions, epochs=1, verbose=0)
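    # Worked example of the target computed in replay() above, with purely
    # illustrative numbers: for gamma = 0.99, an observed reward of 5 and
    # max_a' Q_target(s', a') = 10, the regression target for the chosen
    # action is 5 + 0.99 * 10 = 14.9; for a terminal transition the target
    # is simply the observed reward.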
    # Choose an action A based on state S following the e-greedy policy
    def choose_action(self, state, eps=None):
        # Epsilon is either taken from current value, or passed as argument
        eps_threshold = self.eps if eps is None else eps
        # Either choose action with highest Q-value or a random action
        if random.uniform(0, 1) > eps_threshold:
            return np.argmax(self.model.predict(state)[0])
        else:
            return np.random.choice(self.METADATA['n_actions'])
    # Decay the epsilon value exponentially as a function of the episode number
    def decay_epsilon(self, episode_num=None):
        self.eps = self.min_eps \
                   + (self.max_eps - self.min_eps) \
                   * np.exp(-self.eps_decay_rate * episode_num)
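    # Example of the schedule in decay_epsilon() with hypothetical parameter
    # values (the real ones come from METADATA): for max_eps = 1.0,
    # min_eps = 0.01 and eps_decay_rate = 0.01, epsilon is 1.0 at episode 0,
    # roughly 0.01 + 0.99 * exp(-1) ~= 0.37 at episode 100, and roughly
    # 0.01 + 0.99 * exp(-5) ~= 0.017 at episode 500.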
    # Store an experience in memory
    def remember(self, state, action, reward, sprime, done):
        self.memory.append((state, action, reward, sprime, done))
    # Create the neural network
    def make_network(self):
        input_shape = (self.sim.W.WIDTH, self.sim.W.HEIGHT, self.sim.W.DEPTH)
        layers = [
            Flatten(input_shape=input_shape),
            # One hidden layer with 50 neurons and a sigmoid activation function
            Dense(units=50,
                  activation='sigmoid'),
            # Output layer has a linear activation function
            Dense(units=self.action_size,
                  activation='linear'),
        ]
        '''
        TODO: Consider using an initializer for the layers:
              bias_initializer='random_uniform',
              kernel_initializer='random_uniform'
        '''
        model = Sequential(layers)
        # Compile the model with a mean squared error loss
        model.compile(loss='mse',
                      # And an Adam optimizer with gradient clipping
                      optimizer=Adam(lr=self.alpha,
                                     clipvalue=1))
        if self.verbose:
            model.summary()
        return model
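    # Shape sketch for make_network(), assuming (purely as an example) a
    # 10x10 grid with a depth of 4 feature planes: Flatten maps the
    # (10, 10, 4) observation to a 400-vector, the hidden layer maps that to
    # 50 sigmoid units, and the linear output layer emits one Q-value per
    # action. The actual sizes depend on the simulation's WIDTH/HEIGHT/DEPTH.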
    '''
    Miscellaneous helper methods
    '''
    # Play the simulation by following the optimal policy
    def play_optimal(self, eps=0):
        done = False
        total_reward = 0
        state = self.sim.reset()
        while not done:
            self.sim.render()
            state = np.reshape(state, [1] + list(state.shape))
            self.show_info(state)
            action = self.choose_action(state, eps=eps)
            state, reward, done, _ = self.sim.step(action)
            total_reward += reward
            time.sleep(0.1)
        self.sim.render()
        print(f"Total reward: {total_reward}")
    # Show the Q-values for each action in the state, the best action, and wind info
    def show_info(self, state):
        w_speed, w_vector = self.sim.W.wind_speed, self.sim.W.wind_vector
        print(f"Wind Speed: {w_speed}")
        print(f"Wind direction: {w_vector}")
        # Predict the Q-values via the network
        QVals = self.model.predict(state)[0]
        # Print the Q-values and their maximum
        key_map = {0:'N', 1:'S', 2:'E', 3:'W', 4:'D', 5:' '}
        print("| ", end="")
        for idx, val in enumerate(QVals):
            val = round(val, 2)
            direction = key_map[idx]
            extra_space = " " if val > 0 else ""
            print(f"{direction} : {extra_space}{val:.2f} | ", end="")
            if idx == 1:
                print("\n| ", end="")
        print(f"\nBest Action: {key_map[np.argmax(QVals)]}\n")
    '''
    Collects memories as follows:
    Make the agent walk somewhat randomly, but always such that it walks clockwise
    around the fire. So if the agent is below and to the right of the fire, it will
    choose an action to go either left or downwards until it is below the fire, and
    then it will go either up or left. When the fire is contained, the simulation is
    reset. Only the memories that lead to a successful containment of the fire are
    collected.
    '''
    def collect_memories(self, num_of_episodes=100, perform_baseline=False):
        if not num_of_episodes:
            return
        # Wipe internal memory
        self.memory = deque()
        success_count = 0
        episode = 0
        # While memory is not filled up
        while True:
            total_reward = 0
            memories = list()
            done = False
            state = self.sim.reset()
            state = np.reshape(state, [1] + list(state.shape))
            while not done:
                # Choose an action
                action = self.choose_randomwalk_action()
                # Observe sprime and reward
                sprime, reward, done, _ = self.sim.step(action)
                sprime = np.reshape(sprime, [1] + list(sprime.shape))
                # Collect memories
                memories.append((state, action, reward, sprime, done))
                state = sprime
                total_reward += reward
                # Only if we contained the fire do we collect the memories
                if not perform_baseline and reward == self.METADATA['contained_bonus']:
                    success_count += 1
                    # Store the successful experience in memory
                    for state, action, reward, sprime, done in memories:
                        self.remember(state, action, reward, sprime, done)
                    # Set done to true
                    done = True
                    # Collect logging info and return
                    if success_count == num_of_episodes:
                        self.logs['init_memories'] = len(self.memory)
                        return
            # Collect logging data and print status if we are doing a baseline run
            if perform_baseline:
                self.logs['total_rewards'].append(total_reward)
                if episode % 100 == 0:
                    print(f"Episode {episode}/{num_of_episodes}")
                if len(self.sim.W.agents) == 0:
                    self.logs['agent_deaths'].append(True)
                else:
                    self.logs['agent_deaths'].append(False)
                # Stop if we are done
                if episode == num_of_episodes - 1:
                    self.logs['n_episodes'] = num_of_episodes
                    break
            episode += 1
        # Write a (sparse) log to file (only reached when doing a baseline run)
        self.write_data()
    # Choose an action depending on the agent's position relative to the fire.
    # The action should be safe (avoiding the fire), if possible
    def choose_randomwalk_action(self, avoid_fire=True):
        # It can happen (e.g. with SARSA-style updates) that an action is requested
        # after the agent has died. That action is never used, so it is irrelevant
        if not self.sim.W.agents:
            return 0
        key_map = {'N':0, 'S':1, 'E':2, 'W':3}
        width, height = self.sim.W.WIDTH, self.sim.W.HEIGHT
        agent_x, agent_y = self.sim.W.agents[0].x, self.sim.W.agents[0].y
        mid_x, mid_y = (int(width / 2), int(height / 2))
        # Loop to try to avoid choosing actions that lead to death
        count = 0
        while True:
            # The chosen action should always make the agent go around the fire
            if agent_x >= mid_x and agent_y > mid_y:
                possible_actions = ["S", "W"]
            if agent_x > mid_x and agent_y <= mid_y:
                possible_actions = ["S", "E"]
            if agent_x <= mid_x and agent_y < mid_y:
                possible_actions = ["N", "E"]
            if agent_x < mid_x and agent_y >= mid_y:
                possible_actions = ["N", "W"]
            # Choose randomly from the valid actions
            action = key_map[np.random.choice(possible_actions)]
            if not avoid_fire:
                break
            # Break when it is a safe move or when we have tried too often
            fire_at_loc = self.sim.W.agents[0].fire_in_direction(action)
            if not fire_at_loc or count > 10:
                break
            count += 1
        return action
    # Writes the logs and the metadata to a file with an appropriate name
    def write_data(self):
        # Also save metadata
        self.logs['metadata'] = self.METADATA
        # Create filename
        n_episodes = self.logs['n_episodes']
        if n_episodes >= 1000:
            n_episodes /= 1000
        else:
            n_episodes = 0
        memories = self.logs['init_memories']
        name = self.sim.get_name(self.sim.W.WIDTH,
                                 int(n_episodes), memories, self.name)
        # Append an increasing counter to the name until it no longer collides
        # with an existing log or model file
        base_name = name
        counter = 0
        while os.path.isfile("Logs/" + name) or os.path.isfile("Models/" + name):
            name = base_name + str(counter)
            counter += 1
        # If the folders don't exist, create them
        if not os.path.exists("Logs/"):
            os.makedirs("Logs/")
        if not os.path.exists("Models/"):
            os.makedirs("Models/")
        # Write model
        self.save_model(name)
        # Write logs
        with open("Logs/" + name, 'w') as file:
            json.dump(self.logs, file)
    # Loads the weights of the model from file
    def load_model(self):
        all_names = os.listdir("Models")
        print("\nChoose a model from the list below to load:")
        for idx, name in enumerate(all_names):
            print(f"\t[{idx}] {name}")
        try:
            selection = int(input(f"Select one [0-{len(all_names) - 1}]: \n"))
            name = os.path.join("Models", all_names[selection])
            self.model.load_weights(name)
            print("Model loaded!")
        except (ValueError, IndexError):
            print("Invalid Selection")
    # Saves the weights of the model to file
    def save_model(self, name):
        name = "Models/" + name
        self.model.save_weights(name)
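# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module): DQN only needs
# an object exposing METADATA, n_actions, DEBUG, a world `W`, and the
# reset()/step()/render()/get_name() methods used above. The `Simulation`
# import below is an assumption about the surrounding project and may need to
# be adapted to the actual module name.
#
# if __name__ == "__main__":
#     from Simulation import Simulation   # hypothetical module/class name
#     sim = Simulation()                   # assumed to provide METADATA, W, etc.
#     agent = DQN(sim, name="demo")
#     agent.collect_memories(num_of_episodes=10)   # optional warm-up memories
#     agent.learn(n_episodes=500)                  # train, then write logs/model
#     agent.play_optimal()                         # greedy rollout with eps=0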