-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathLP_verify_gradient_objective_function.py
More file actions
343 lines (262 loc) · 15.6 KB
/
LP_verify_gradient_objective_function.py
File metadata and controls
343 lines (262 loc) · 15.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["VECLIB_MAXIMUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
import cvxpy as cp
import numpy as np
import math
import pickle as pk
import random
import multiprocessing as mp
from _value_iteration_helper import *
from _mdp_single_device import SingleDevice
from _mdp_centralized_agent import CentralizedSystem
def find_optimal_policy(agent, local_noise = 0, other_noise = 0, return_LM = False, verbose = False, only_final_discounted_reward = False, print_policy = False):
    """Solve the agent's constrained MDP as a linear program over occupation measures.

    Decision variables are the discounted state-action occupation measures
    x(s, a). The LP minimizes the flattened reward vector dotted with x,
    subject to flow-conservation constraints, one average-cost constraint
    (bound ``d``), non-negativity, and feasibility (actions 1 and 2 are
    forbidden in states with negative energy).

    Parameters
    ----------
    agent : SingleDevice
        MDP description: M, B, min_energy, gamma, reward/cost functions,
        state indexing helpers and the average-cost bound.
    local_noise : float
        Perturbation added to the agent's constraint bound ``d``.
    other_noise : float
        Perturbation applied to the reward of action 2 only (models the
        effect of the other agents).
    return_LM : bool
        If True, also return the Lagrange multiplier (dual value) of the
        cost constraint.
    verbose : bool
        Print solver status and diagnostic information.
    only_final_discounted_reward : bool
        Print only the one-line discounted-reward summary.
    print_policy : bool
        Print the resulting policy state by state.

    Returns
    -------
    tuple
        ``(policy, total_discounted_reward, constraint_cost)`` or, when
        ``return_LM`` is True,
        ``(policy, total_discounted_reward, constraint_cost, LM)``.
    """
    num_states = agent.M*(-agent.min_energy + agent.B+1)
    num_actions = 3
    # Reward matrix r[s, a] and constraint-cost matrix c[s, a], built in a
    # single pass over the state space (the original code looped twice).
    r = np.zeros((num_states, num_actions))
    c = np.zeros((num_states, num_actions))
    for x in range(1, agent.M+1):
        for e in range(agent.min_energy, agent.B+1):
            index = agent.compute_state_index(x, e)
            state = dict({'x': x, 'e': e, 'index': index})
            r[index, 0] = agent.reward_function(state = state, action= 0, training=False)
            r[index, 1] = agent.reward_function(state = state, action= 1, training=False)
            # only for action = 2 we also need to take into consideration the
            # noise on the other elements (if necessary)
            r[index, 2] = agent.reward_function(state = state, action= 2, other_noise = other_noise, training=False)
            c[index, 0] = agent.cost_function(state = state, action= 0)
            c[index, 1] = agent.cost_function(state = state, action= 1)
            c[index, 2] = agent.cost_function(state = state, action= 2)
    # The constraint bound has to consider the local noise.
    d = agent.average_constraint + local_noise
    # Transition probability tensor P[s, a, s'].
    P = np.zeros((num_states, num_actions, num_states))
    P[:, 0, :] = compute_P0(agent)  # transition probabilities for action 0
    P[:, 1, :] = compute_P1(agent)  # transition probabilities for action 1
    P[:, 2, :] = compute_P2(agent)  # transition probabilities for action 2
    # Initial distribution: start deterministically from state (x=1, e=0).
    mu = np.zeros(num_states)
    s0 = agent.compute_state_index(1, 0)
    mu[s0] = 1
    # Decision variables: x(s, a), stored row-major (state-major) flattened.
    x_var = cp.Variable(num_states * num_actions)
    # Objective: minimize total expected discounted cost (reward is encoded
    # so that minimizing r_flat @ x is the intended objective).
    r_flat = r.flatten()
    objective = cp.Minimize(r_flat @ x_var)
    # Flow conservation: outflow of each state equals (1-gamma)*mu plus the
    # gamma-discounted inflow from every (state, action) pair.
    flow_constraints = []
    for s in range(num_states):
        lhs = cp.sum(x_var[s*num_actions:(s+1)*num_actions])
        rhs = (1 - agent.gamma) * mu[s]
        for sp in range(num_states):
            for a in range(num_actions):
                rhs += agent.gamma * P[sp, a, s] * x_var[sp*num_actions + a]
        flow_constraints.append(lhs == rhs)
    # Average-cost constraint.
    c_flat = c.flatten()
    cost_constraint = cp.sum(cp.multiply(c_flat, x_var)) <= d
    # Non-negativity of the occupation measure.
    nonneg = [x_var >= 0]
    # Forbid actions 1 and 2 in states with negative energy.
    non_allowed_actions = []
    for s in range(num_states):
        _, e = agent.compute_state_coordinates(s)
        if e < 0:
            non_allowed_actions.append(x_var[s * num_actions + 1] == 0)
            non_allowed_actions.append(x_var[s * num_actions + 2] == 0)
    # Solve the LP.
    constraints = flow_constraints + [cost_constraint] + nonneg + non_allowed_actions
    prob = cp.Problem(objective, constraints)
    prob.solve(solver=cp.ECOS, warm_start=True)  # if unavailable, try solver="SCS"
    if verbose:
        print("Solver status:", prob.status)
        print(x_var.value.shape)
    # Retrieve the optimal occupation measure and the dual of the cost constraint.
    x_opt = x_var.value.reshape(num_states, num_actions)
    LM = cost_constraint.dual_value  # Lagrange multiplier for the cost constraint
    if verbose:
        print("Lagrange multiplier for the cost constraint (d):", LM)
    # Compute the stochastic policy: pi(a|s) = x(s,a) / sum_a' x(s,a').
    policy = np.zeros((num_states, num_actions))
    for s in range(num_states):
        denom = x_opt[s].sum()
        if denom > 1e-8:
            policy[s] = x_opt[s] / denom
        else:
            policy[s] = np.ones(num_actions) / num_actions  # fallback uniform policy if denominator is near-zero
    if verbose:
        print("Optimal policy shape:", policy.shape)
        print("Example policy at state 0:", policy[0])
    # Expected reward / cost under the optimal occupation measure.
    total_reward = np.sum(x_opt * r)
    constraint_cost = np.sum(x_opt * c)
    if verbose or only_final_discounted_reward:
        if local_noise != 0:
            print('Agent {}. Local noise applied: {}. Discounted reward = {}. Average cost = {} '.format(agent.index, local_noise, total_reward/ (1 - agent.gamma), constraint_cost))
        elif other_noise != 0:
            print('Agent {}. Other noise applied: {}. Discounted reward = {}. Average cost = {} '.format(agent.index, other_noise, total_reward/(1 - agent.gamma), constraint_cost))
        else:
            print('Agent {}. No noise applied. Discounted reward = {}. Average cost = {} '.format(agent.index, total_reward/(1 - agent.gamma), constraint_cost))
    total_discounted_reward = total_reward / (1 - agent.gamma)
    if verbose:
        print("Total expected average constraint cost:", constraint_cost)
        print("Total expected discounted constraint cost:", constraint_cost/(1 - agent.gamma))
        print("Constraint upper bound (d):", d)
    # Count the states where the policy randomizes over exactly two actions
    # with non-negligible probability.
    if verbose:
        count = 0
        for s in range(num_states):
            if np.sum(policy[s] > 0.1) == 2:
                count += 1
        # FIX: the original message said "at least one action probability > 0.1"
        # but the condition counts states with exactly two such actions.
        print("Number of states with exactly two action probabilities > 0.1:", count)
    # Print the policy in a more readable, per-state format.
    # NOTE(review): this loop starts at e = 0, so states with negative energy
    # (when agent.min_energy < 0) are intentionally not printed.
    if print_policy or verbose:
        print("Policy in a more readable format:")
        for x in range(1, agent.M + 1):
            for e in range(0, agent.B + 1):
                index = agent.compute_state_index(x, e)
                if math.fabs(np.median(policy[index]) - 1/3) > 1e-5:  # we only print the states where the policy is not uniform
                    if np.median(policy[index]) > 0.0001:
                        # State with a stochastic policy: print the actions
                        # having probability greater than 0.0001.
                        policy_actions = np.where(policy[index] > 0.0001)[0]
                        print("State: x={}, e={}. Optimal actions: {}. Policy: {}".format(x, e, policy_actions, policy[index]))
                    else:
                        # FIX: the original format call passed a fourth argument
                        # (policy[index]) with no matching placeholder.
                        print("State: x={}, e={}. Optimal action: {}".format(x, e, np.argmax(policy[index])))
    # Return the policy, the total discounted reward and the average cost.
    if return_LM:
        return policy, total_discounted_reward, constraint_cost, LM
    else:
        return policy, total_discounted_reward, constraint_cost
def wrapper_single_experiment(list_environments):
    """Run one gradient-verification experiment for a single agent.

    Solves the agent's LP without noise to get the baseline average reward
    and the Lagrange multiplier, then re-solves with small positive/negative
    and random perturbations of the local constraint bound and of the
    "other agents" noise, estimating finite-difference gradients of the
    average reward with respect to each perturbation.

    Parameters
    ----------
    list_environments : SingleDevice
        The agent to evaluate (starmap unpacks the 1-tuple, so this is the
        agent itself despite the name).

    Returns
    -------
    list
        [optimal_LM, gradient_local_component_positive,
         gradient_local_component_negative, local_noise,
         gradient_local_component, other_noise, gradient_other_component,
         gradient_other_component_positive, gradient_other_component_negative,
         agent.sum_average_constraints, average_cost, agent.index]
    """
    verbose = False
    agent = list_environments
    # Baseline solve (no noise). FIX: capture the baseline average cost here —
    # the original discarded it, so the verbose branch below raised NameError
    # on `average_cost` (it was only assigned much later).
    _, total_discounted_reward, average_cost, optimal_LM = find_optimal_policy(agent, local_noise=0, other_noise=0, return_LM = True, verbose = False)
    # Convert discounted reward over a finite horizon into an average reward.
    average_reward = total_discounted_reward * ((1 - agent.gamma) / (1 - agent.gamma**(agent.max_episodes_steps+1)))
    if verbose:
        print("Total discounted reward:", total_discounted_reward)
        print("Average reward:", average_reward)
        print("Average cost:", average_cost)
    # Two-sided finite differences on the local constraint bound.
    positive_local_noise = 0.00001
    negative_local_noise = -0.00001
    _, total_discounted_reward_local_positive_noise, _ = find_optimal_policy(agent, local_noise=positive_local_noise, other_noise=0, verbose = False)
    average_reward_local_positive_noise = total_discounted_reward_local_positive_noise * ((1 - agent.gamma) / (1 - agent.gamma**(agent.max_episodes_steps+1)))
    gradient_local_component_positive = (average_reward_local_positive_noise - average_reward) / positive_local_noise
    _, total_discounted_reward_local_negative_noise, _ = find_optimal_policy(agent, local_noise=negative_local_noise, other_noise=0, verbose = False)
    average_reward_local_negative_noise = total_discounted_reward_local_negative_noise * ((1 - agent.gamma) / (1 - agent.gamma**(agent.max_episodes_steps+1)))
    gradient_local_component_negative = (average_reward_local_negative_noise - average_reward) / negative_local_noise
    # Random local perturbation, clamped so that the perturbed constraint
    # bound (local_noise + agent.average_constraint) stays inside (0, 1).
    local_noise = 0.05 * random.random() -.025  # values between -0.025 and 0.025
    if local_noise + agent.average_constraint < 0.01:
        local_noise = -agent.average_constraint + 0.01
    elif local_noise + agent.average_constraint > 0.99:
        local_noise = 1 - agent.average_constraint - 0.01
    _, total_discounted_reward_local_noise, _ = find_optimal_policy(agent, local_noise=local_noise, other_noise=0, verbose = False)
    average_reward_local_noise = total_discounted_reward_local_noise * ((1 - agent.gamma) / (1 - agent.gamma**(agent.max_episodes_steps+1)))
    gradient_local_component = (average_reward_local_noise - average_reward) / local_noise
    if verbose:
        print("Average reward with local noise:", average_reward_local_noise)
        print("Gradient local component: ", gradient_local_component)
    # Random perturbation of the "other agents" component.
    other_noise = 0.5 * random.random() -.25  # values between -0.25 and 0.25
    _, total_discounted_reward_other_noise, _ = find_optimal_policy(agent, local_noise=0, other_noise=other_noise, verbose = False)
    average_reward_other_noise = total_discounted_reward_other_noise * ((1 - agent.gamma) / (1 - agent.gamma**(agent.max_episodes_steps+1)))
    gradient_other_component = (average_reward_other_noise - average_reward) / other_noise
    if verbose:
        print("Average reward with other noise:", average_reward_other_noise)
        print("Gradient other component: ", gradient_other_component)
    # Two-sided finite differences on the "other agents" noise; the returned
    # average_cost is (as in the original) the one from the positive-noise solve.
    positive_other_noise = 0.00001
    negative_other_noise = -0.00001
    _, total_discounted_reward_other_positive_noise, average_cost = find_optimal_policy(agent, local_noise=0, other_noise=positive_other_noise, verbose = False)
    average_reward_other_positive_noise = total_discounted_reward_other_positive_noise * ((1 - agent.gamma) / (1 - agent.gamma**(agent.max_episodes_steps+1)))
    gradient_other_component_positive = (average_reward_other_positive_noise - average_reward) / positive_other_noise
    _, total_discounted_reward_other_negative_noise, _ = find_optimal_policy(agent, local_noise=0, other_noise=negative_other_noise, verbose = False)
    average_reward_other_negative_noise = total_discounted_reward_other_negative_noise * ((1 - agent.gamma) / (1 - agent.gamma**(agent.max_episodes_steps+1)))
    gradient_other_component_negative = (average_reward_other_negative_noise - average_reward) / negative_other_noise
    print("Agent {} completed".format(agent.index))
    return [optimal_LM, gradient_local_component_positive, gradient_local_component_negative, local_noise, gradient_local_component, other_noise, gradient_other_component, gradient_other_component_positive, gradient_other_component_negative, agent.sum_average_constraints, average_cost, agent.index]
def _build_agent(index, n_agents, congestion_penalty_exponent):
    """Create one SingleDevice environment with randomized parameters.

    Randomizes the queue size M, battery size B, the (uniform) harvesting and
    energy-cost distributions, and the average-cost constraint bound.
    """
    M = random.choice([5, 6, 7, 8, 9, 10])
    B = random.choice([5, 6, 7, 8, 9, 10])
    agent = SingleDevice(M = M, B=B, total_players=n_agents, congestion_penalty_multiplier = 1, congestion_penalty_exponent = congestion_penalty_exponent)
    agent.gamma = .95
    # Uniform harvesting distribution over [min_value_harvesting, max_value_harvesting].
    harvesting_distribution = np.zeros((agent.B + 1, ))
    min_value_harvesting = random.choice([0, 1])
    max_value_harvesting = random.choice([1])
    harvesting_distribution[min_value_harvesting:max_value_harvesting + 1] = 1
    agent.p_H = harvesting_distribution / np.sum(harvesting_distribution)
    # Uniform energy-cost distribution over [min_value_cost, max_value_cost].
    p_C = np.zeros((agent.B + 1, ))  # energy cost probabilities (0, 1, ..., B)
    min_value_cost = random.choice([1])
    max_value_cost = random.choice([3, 4, 5])
    p_C[min_value_cost:max_value_cost + 1] = 1
    agent.p_C = p_C / np.sum(p_C)
    agent.average_constraint = random.uniform(0.01, 0.25)
    agent.discounted_constraint = agent.average_constraint * (1 - agent.gamma**agent.max_episodes_steps)/(1 - agent.gamma)
    # Simulated sum of the average constraints of all agents in the system.
    agent.sum_average_constraints = np.sum(np.random.uniform(0.01, 0.25, size=n_agents))
    agent.index = index
    return agent


def main():
    """Build randomized environments, solve them in parallel, report and save results."""
    n_experiments = 15
    n_agents = 15  # simulated system size; only affects the congestion model
    congestion_penalty_exponent = 2
    # np.random.seed(0); random.seed(0)  # uncomment for reproducibility
    # One 1-tuple per experiment, matching pool.starmap's unpacking.
    list_environments = [(_build_agent(experiment + 1, n_agents, congestion_penalty_exponent), )
                         for experiment in range(n_experiments)]
    # FIX: the original never closed/joined the pool; the context manager does.
    with mp.Pool(15) as pool:
        results = pool.starmap(wrapper_single_experiment, list_environments)
    print(results)
    error_estimate_gradient = []
    for experiment, result_experiment in enumerate(results, start=1):
        print("Experiment {}:".format(experiment))
        print("Lagrange multiplier:", result_experiment[0])
        print("Gradient local component: {}, {}".format(result_experiment[1], result_experiment[2]))
        print("Derivative local component: ", result_experiment[4])
        print("Gradient other component:", result_experiment[6])
        # Relative error between the negated finite-difference gradient and the
        # Lagrange multiplier returned by the LP dual.
        error_estimate_gradient.append((-result_experiment[1] - result_experiment[0]) / result_experiment[0])
        print("Error gradient:", error_estimate_gradient[-1])
        print("-"*50)
    print(r"Error estimate gradient: {} $\pm$ {}".format(np.mean(error_estimate_gradient), np.std(error_estimate_gradient)))
    # Persist the raw results. FIX: create the output folder if missing —
    # the original open() failed when results/gradient_estimate_LP did not exist.
    folder = os.getcwd() + "/results/gradient_estimate_LP/"
    os.makedirs(folder, exist_ok=True)
    with open(folder + "results_gradient_objective_function_alpha_{}.pkl".format(congestion_penalty_exponent), "wb") as f:
        pk.dump(results, f)


# FIX: guard the driver so that worker processes importing this module (on
# spawn-based platforms) do not recursively re-launch the experiments.
if __name__ == "__main__":
    main()