-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path_mdp_centralized_agent.py
More file actions
189 lines (143 loc) · 7.49 KB
/
_mdp_centralized_agent.py
File metadata and controls
189 lines (143 loc) · 7.49 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
import numpy as np
import math
import random
import copy
from typing import Optional
import gymnasium
from gymnasium import spaces
from gymnasium.spaces import Box, Tuple
from gymnasium.spaces import MultiDiscrete
class CentralizedSystem(gymnasium.Env):
    """Centralized (joint) MDP over a set of local agents.

    The joint observation interleaves each agent's Age-of-Information (AoI)
    and energy level: [aoi_0, e_0, aoi_1, e_1, ...], with AoI bounded in
    [1, M_i] and energy in [-B_i, B_i] per agent.  The joint action assigns
    one value in {0, 1, 2} to every agent (MultiDiscrete).
    """

    def __init__(self, list_agents):
        """Build the joint observation/action spaces from the local agents.

        Parameters
        ----------
        list_agents : list
            Local agent objects exposing ``M``, ``B``,
            ``compute_state_index(aoi, energy)``, ``step(state, action,
            training=...)``, ``congestion_penalty_parameters`` and
            ``max_episodes_steps``.
        """
        super().__init__()
        self.agents = list_agents
        self.n_agents = len(list_agents)
        print(self.n_agents)  # NOTE(review): debug print kept from original
        self.t = 0  # step counter within the current episode
        # Bounds of the interleaved observation vector: even positions hold
        # AoI (in [1, M]), odd positions hold energy (in [-B, B]).
        low, high = [], []
        for agent in self.agents:
            low.extend([1, -agent.B])
            high.extend([agent.M, agent.B])
        self.observation_space = Box(low=np.array(low), high=np.array(high), dtype=np.int64)
        # Implicit global state: every agent starts with AoI = 1, energy = 0.
        self._agents_aoi = [1 for _ in range(self.n_agents)]
        self._agents_energy = [0 for _ in range(self.n_agents)]
        # One discrete action in {0, 1, 2} per agent.
        self.action_space = spaces.multi_discrete.MultiDiscrete([3] * self.n_agents)

    def _get_obs(self):
        """Return the flat observation list [aoi_0, e_0, aoi_1, e_1, ...]."""
        observation = []
        for aoi, energy in zip(self._agents_aoi, self._agents_energy):
            observation.append(aoi)
            observation.append(energy)
        return observation

    def _local_state(self, i):
        """Local state dict {'x', 'e', 'index'} of agent i from the implicit global state."""
        aoi = self._agents_aoi[i]
        energy = self._agents_energy[i]
        return dict({'x': aoi, 'e': energy,
                     'index': self.agents[i].compute_state_index(aoi, energy)})

    def make_state_dictionary(self, state=None):
        """Return the list of per-agent local state dicts.

        The ``state`` argument is ignored (kept for interface compatibility:
        the original immediately overwrote it).
        """
        return [self._local_state(i) for i in range(self.n_agents)]

    def global_reward(self, global_action, state=None, vector_rewards=True, LP=False):
        """Compute the global reward of ``global_action``.

        If ``state`` is None it is derived from the implicit global state in
        ``self._agents_aoi`` / ``self._agents_energy`` (keys 'Agent 0', ...).
        When ``LP`` is True the supplied state dictionary is assumed to be
        1-indexed ('Agent 1', ...).

        Returns ``(total, per_agent_components)`` when ``vector_rewards`` is
        True, otherwise just ``total``.
        """
        if state is None:
            state = {'Agent {}'.format(i): self._local_state(i)
                     for i in range(self.n_agents)}
        total = 0
        components = np.zeros((self.n_agents,))
        for i in range(self.n_agents):
            agent_state = state['Agent {}'.format(i + 1 if LP else i)]
            if agent_state['e'] < 0:
                if global_action[i] == 0:
                    # Penalty equals |energy deficit|: how many energy units
                    # are missing to make the pending processing feasible.
                    r = agent_state['x'] - agent_state['e']
                else:
                    # Any non-idle action is infeasible with negative energy.
                    r = 1000
            elif global_action[i] == 0 and (agent_state['x'] >= self.agents[i].M
                                            or agent_state['e'] >= self.agents[i].B):
                # Idling while AoI or energy is already saturated is penalized.
                r = agent_state['x'] + 1000
            elif global_action[i] <= 1 and agent_state['e'] >= 0:
                r = agent_state['x']
            elif global_action[i] == 2:
                # Congestion cost grows with the number of agents that choose
                # the shared action; the (x - 1) excludes agent i itself from
                # the count passed in (np.count_nonzero includes agent i).
                params = self.agents[i].congestion_penalty_parameters
                cost_other_agents = lambda x: params['multiplier'] * (x - 1) ** params['exponent']
                n_congested = np.count_nonzero(global_action == 2)
                r = agent_state['x'] + cost_other_agents(n_congested)
            else:
                # BUG FIX: original printed the undefined name `action`,
                # raising NameError instead of the intended diagnostic.
                print(agent_state, global_action[i])
                raise ValueError("Error in reward function")
            total += r
            components[i] = r
        if vector_rewards:
            return total, components
        return total

    def cost_function(self, state, action):
        """Unit cost for choosing the shared resource (action 2), else 0."""
        return 1 if action == 2 else 0

    def step(self, action, vector_rewards=False):
        """Advance the joint system by one time step.

        Each agent transitions through its own ``step()``; the global reward
        is computed on the PRE-transition state, and only afterwards is the
        implicit global state updated.

        Returns ``(observation, reward, terminated, truncated, info)`` where
        ``info`` carries the per-agent reward components (or None when
        ``vector_rewards`` is False).
        """
        self.t += 1
        next_states = []
        for i, agent in enumerate(self.agents):
            # Local transition only; the reward is handled globally below.
            next_local, _, _, _ = agent.step(self._local_state(i), action[i], training=False)
            next_states.append(next_local)
        reward = self.global_reward(action, vector_rewards=vector_rewards)
        if vector_rewards:
            reward, components = reward[0], reward[1]
        else:
            components = None
        # Commit the transition to the implicit global state.
        for i, s in enumerate(next_states):
            self._agents_aoi[i] = s['x']
            self._agents_energy[i] = s['e']
        # Episode length is taken from the first agent's horizon.
        episode_ended = self.t >= self.agents[0].max_episodes_steps
        observation = self._get_obs()
        return observation, reward, episode_ended, False, {'global_reward_components': components}

    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None):
        """Reset time and the implicit global state (AoI=1, energy=0 for all)."""
        super().reset(seed=seed)
        self.t = 0
        self._agents_aoi = [1 for _ in range(self.n_agents)]
        self._agents_energy = [0 for _ in range(self.n_agents)]
        return self._get_obs(), {}