import tensorflow as tf
import numpy as np
import gym
import os
import shutil
import sys
import time
np.random.seed(1)
tf.set_random_seed(1)
fold = int(sys.argv[1])  # run index from the command line, used to tag the output files
st = time.process_time()
MAX_EPISODES = 1500
LR_A = 0.0005 # learning rate for actor
LR_C = 0.0005 # learning rate for critic
GAMMA = 0.999 # reward discount
REPLACE_ITER_A = 1700
REPLACE_ITER_C = 1500
MEMORY_CAPACITY = 200000
BATCH_SIZE = 32
DISPLAY_THRESHOLD = 100 # display until the running reward > 100
DATA_PATH = './data'
LOAD_MODEL = False
SAVE_MODEL_ITER = 100000
RENDER = False
OUTPUT_GRAPH = False
ENV_NAME = 'BipedalWalker-v2'
GLOBAL_STEP = tf.Variable(0, trainable=False)
INCREASE_GS = GLOBAL_STEP.assign(tf.add(GLOBAL_STEP, 1))
LR_A = tf.train.exponential_decay(LR_A, GLOBAL_STEP, 10000, .97, staircase=True)  # decay the actor LR by 3% every 10k steps
LR_C = tf.train.exponential_decay(LR_C, GLOBAL_STEP, 10000, .97, staircase=True)  # decay the critic LR by 3% every 10k steps
END_POINT = (200 - 10) * (14/30)  # x-coordinate of the terrain's far end, taken from the game's geometry
env = gym.make(ENV_NAME)
env.seed(1)
STATE_DIM = env.observation_space.shape[0] # 24
ACTION_DIM = env.action_space.shape[0] # 4
ACTION_BOUND = env.action_space.high # [1, 1, 1, 1]
# shared placeholders for state, reward, and next state
with tf.name_scope('S'):
S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R'):
R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')
############################### Actor ####################################
class Actor(object):
def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter):
self.sess = sess
self.a_dim = action_dim
self.action_bound = action_bound
self.lr = learning_rate
self.t_replace_iter = t_replace_iter
self.t_replace_counter = 0
with tf.variable_scope('Actor'):
# input s, output a
self.a = self._build_net(S, scope='eval_net', trainable=True)
# input s_, output a, get a_ for critic
self.a_ = self._build_net(S_, scope='target_net', trainable=False)
self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')
def _build_net(self, s, scope, trainable):
with tf.variable_scope(scope):
init_w = tf.random_normal_initializer(0., 0.01)
init_b = tf.constant_initializer(0.01)
two = tf.constant(2.0, dtype=tf.float32)
            def UAF(x):
                # Learnable per-layer activation coefficients (stddev=0.0 makes these constant initializers).
                UAF_A = tf.Variable(tf.random_normal([1], mean=1.1, stddev=0.0, dtype=tf.float32), dtype=tf.float32)
                UAF_B = tf.Variable(tf.random_normal([1], mean=-0.01, stddev=0.0, dtype=tf.float32), dtype=tf.float32)
                UAF_C = tf.Variable(tf.random_normal([1], mean=-0.1, stddev=0.0, dtype=tf.float32), dtype=tf.float32)
                UAF_D = tf.Variable(tf.random_normal([1], mean=-0.9, stddev=0.0, dtype=tf.float32), dtype=tf.float32)
                UAF_E = tf.Variable(tf.random_normal([1], mean=0.01, stddev=0.0, dtype=tf.float32), dtype=tf.float32)
                P1 = UAF_A * (x + UAF_B) + UAF_C * tf.pow(x, two)
                P2 = UAF_D * (x - UAF_B)
                P3 = tf.nn.relu(P1) + tf.math.log1p(tf.exp(-tf.abs(P1)))  # numerically stable softplus(P1)
                P4 = tf.nn.relu(P2) + tf.math.log1p(tf.exp(-tf.abs(P2)))  # numerically stable softplus(P2)
                return P3 - P4 + UAF_E
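            # The function above appears to be the Universal Activation Function (UAF),
            #   f(x) = softplus(A*(x+B) + C*x^2) - softplus(D*(x-B)) + E,
            # with a separate learnable (A, B, C, D, E) set per layer, since UAF
            # is invoked once by each tf.layers.dense call below.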
net = tf.layers.dense(s, 500, activation=UAF,
kernel_initializer=init_w, bias_initializer=init_b, name='l1', trainable=trainable)
net = tf.layers.dense(net, 200, activation=UAF,
kernel_initializer=init_w, bias_initializer=init_b, name='l2', trainable=trainable)
with tf.variable_scope('a'):
actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh, kernel_initializer=init_w,
bias_initializer=init_b, name='a', trainable=trainable)
                scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a')  # scale the tanh output to [-action_bound, action_bound]
return scaled_a
def learn(self, s): # batch update
self.sess.run(self.train_op, feed_dict={S: s})
        if self.t_replace_counter % self.t_replace_iter == 0:
            # hard copy of the eval net's weights into the target net (no soft/Polyak averaging)
            self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
        self.t_replace_counter += 1
def choose_action(self, s):
s = s[np.newaxis, :] # single state
return self.sess.run(self.a, feed_dict={S: s})[0] # single action
def add_grad_to_graph(self, a_grads):
with tf.variable_scope('policy_grads'):
            # ys = the policy's action output;
            # xs = the policy's parameters;
            # a_grads = dQ/da supplied by the critic, i.e. the direction that increases Q.
            # tf.gradients computes dys/dxs seeded with grad_ys, so this yields dQ/da * da/dparams = dQ/dparams.
self.policy_grads_and_vars = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)
with tf.variable_scope('A_train'):
opt = tf.train.RMSPropOptimizer(-self.lr) # (- learning rate) for ascent policy
self.train_op = opt.apply_gradients(zip(self.policy_grads_and_vars, self.e_params), global_step=GLOBAL_STEP)
############################### Critic ####################################
class Critic(object):
def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a, a_):
self.sess = sess
self.s_dim = state_dim
self.a_dim = action_dim
self.lr = learning_rate
self.gamma = gamma
self.t_replace_iter = t_replace_iter
self.t_replace_counter = 0
with tf.variable_scope('Critic'):
# Input (s, a), output q
self.a = a
self.q = self._build_net(S, self.a, 'eval_net', trainable=True)
# Input (s_, a_), output q_ for q_target
self.q_ = self._build_net(S_, a_, 'target_net', trainable=False) # target_q is based on a_ from Actor's target_net
self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')
        with tf.variable_scope('target_q'):
            self.target_q = R + self.gamma * self.q_  # note: no terminal mask, so bootstrapping also occurs on episode-ending steps
with tf.variable_scope('abs_TD'):
self.abs_td = tf.abs(self.target_q - self.q)
self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
with tf.variable_scope('TD_error'):
self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.target_q, self.q))
with tf.variable_scope('C_train'):
self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, global_step=GLOBAL_STEP)
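        # Note: the critic descends its weighted TD loss with Adam while the
        # actor (above) ascends Q with RMSProp; mixing optimizers like this is a
        # choice of this implementation, not a requirement of DDPG.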
with tf.variable_scope('a_grad'):
self.a_grads = tf.gradients(self.q, a)[0] # tensor of gradients of each sample (None, a_dim)
def _build_net(self, s, a, scope, trainable):
with tf.variable_scope(scope):
init_w = tf.random_normal_initializer(0., 0.01)
init_b = tf.constant_initializer(0.01)
two = tf.constant(2.0, dtype=tf.float32)
            def UAF2(x):
                # despite the name, the critic's activation is plain ReLU
                return tf.nn.relu(x)
with tf.variable_scope('l1'):
n_l1 = 700
                # merge the state and action inputs in the first hidden layer
w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
net = UAF2(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
with tf.variable_scope('l2'):
net = tf.layers.dense(net, 20, activation=UAF2, kernel_initializer=init_w,
bias_initializer=init_b, name='l2', trainable=trainable)
with tf.variable_scope('q'):
q = tf.layers.dense(net, 1, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable) # Q(s,a)
return q
def learn(self, s, a, r, s_, ISW):
_, abs_td = self.sess.run([self.train_op, self.abs_td], feed_dict={S: s, self.a: a, R: r, S_: s_, self.ISWeights: ISW})
        if self.t_replace_counter % self.t_replace_iter == 0:
            # same hard target-net replacement scheme as the actor
            self.sess.run([tf.assign(t, e) for t, e in zip(self.t_params, self.e_params)])
        self.t_replace_counter += 1
return abs_td
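########################### Prioritized Replay ##############################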
class SumTree(object):
"""
This SumTree code is modified version and the original code is from:
https://github.com/jaara/AI-blog/blob/master/SumTree.py
Story the data with it priority in tree and data frameworks.
"""
data_pointer = 0
    def __init__(self, capacity):
        self.capacity = capacity  # number of leaves: one priority slot per stored transition
        self.tree = np.zeros(2 * capacity - 1) + 1e-5  # the small epsilon keeps every priority (and the root sum) nonzero
        # [--------------Parent nodes-------------][-------leaves to record priority-------]
        #             size: capacity - 1                       size: capacity
self.data = np.zeros(capacity, dtype=object) # for all transitions
# [--------------data frame-------------]
# size: capacity
def add_new_priority(self, p, data):
leaf_idx = self.data_pointer + self.capacity - 1
self.data[self.data_pointer] = data # update data_frame
self.update(leaf_idx, p) # update tree_frame
self.data_pointer += 1
        if self.data_pointer >= self.capacity:  # wrap around: overwrite the oldest transitions once full
            self.data_pointer = 0
def update(self, tree_idx, p):
change = p - self.tree[tree_idx]
self.tree[tree_idx] = p
self._propagate_change(tree_idx, change)
def _propagate_change(self, tree_idx, change):
"""change the sum of priority value in all parent nodes"""
parent_idx = (tree_idx - 1) // 2
self.tree[parent_idx] += change
if parent_idx != 0:
self._propagate_change(parent_idx, change)
    def get_leaf(self, lower_bound):
        leaf_idx = self._retrieve(lower_bound)  # walk down to the leaf whose cumulative priority covers lower_bound
        data_idx = leaf_idx - self.capacity + 1  # map the tree leaf index back into the data array
        return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]]
def _retrieve(self, lower_bound, parent_idx=0):
"""
Tree structure and array storage:
Tree index:
0 -> storing priority sum
/ \
1 2
/ \ / \
3 4 5 6 -> storing priority for transitions
Array type for storing:
[0,1,2,3,4,5,6]
"""
left_child_idx = 2 * parent_idx + 1
right_child_idx = left_child_idx + 1
if left_child_idx >= len(self.tree): # end search when no more child
return parent_idx
        if self.tree[left_child_idx] == self.tree[right_child_idx]:  # break ties between equal-mass subtrees at random
            return self._retrieve(lower_bound, np.random.choice([left_child_idx, right_child_idx]))
if lower_bound <= self.tree[left_child_idx]: # downward search, always search for a higher priority node
return self._retrieve(lower_bound, left_child_idx)
else:
return self._retrieve(lower_bound - self.tree[left_child_idx], right_child_idx)
@property
def root_priority(self):
return self.tree[0] # the root
class Memory(object): # stored as ( s, a, r, s_ ) in SumTree
"""
This SumTree code is modified version and the original code is from:
https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py
"""
    epsilon = 0.001  # small amount added to |TD error| so no transition has zero priority
    alpha = 0.6  # in [0, 1]: how strongly TD error is converted into priority (0 = uniform sampling)
    beta = 0.4  # importance-sampling exponent, annealed from this initial value up to 1
    beta_increment_per_sampling = 1e-5  # annealing rate for the bias correction
    abs_err_upper = 1  # clip priorities at this value for stability (see the PER paper)
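    # Following Schaul et al. (2015), "Prioritized Experience Replay":
    #   priority     p_i  = (|TD error| + epsilon) ** alpha
    #   sampling     P(i) = p_i / sum_k p_k             (realized by the sum tree)
    #   IS weight    w_i  = (N * P(i)) ** -beta, normalized by the maximum weight
    # so high-error transitions are replayed more often while the IS weights
    # correct the induced bias in the critic's loss.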
def __init__(self, capacity):
self.tree = SumTree(capacity)
def store(self, error, transition):
p = self._get_priority(error)
self.tree.add_new_priority(p, transition)
def prio_sample(self, n):
batch_idx, batch_memory, ISWeights = [], [], []
segment = self.tree.root_priority / n
self.beta = np.min([1, self.beta + self.beta_increment_per_sampling]) # max = 1
min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority
maxiwi = np.power(self.tree.capacity * min_prob, -self.beta) # for later normalizing ISWeights
for i in range(n):
a = segment * i
b = segment * (i + 1)
lower_bound = np.random.uniform(a, b)
            while True:
                idx, p, data = self.tree.get_leaf(lower_bound)
                if type(data) is int:
                    # np.zeros(..., dtype=object) fills the data array with the
                    # int 0, so an int here means the leaf is still empty:
                    # redraw from the previous segment and try again.
                    i -= 1
                    lower_bound = np.random.uniform(segment * i, segment * (i+1))
                else:
                    break
prob = p / self.tree.root_priority
ISWeights.append(self.tree.capacity * prob)
batch_idx.append(idx)
batch_memory.append(data)
ISWeights = np.vstack(ISWeights)
ISWeights = np.power(ISWeights, -self.beta) / maxiwi # normalize
return batch_idx, np.vstack(batch_memory), ISWeights
def random_sample(self, n):
        idx = np.random.randint(0, self.tree.capacity, size=n, dtype=int)
return np.vstack(self.tree.data[idx])
def update(self, idx, error):
p = self._get_priority(error)
self.tree.update(idx, p)
def _get_priority(self, error):
error += self.epsilon # avoid 0
clipped_error = np.clip(error, 0, self.abs_err_upper)
return np.power(clipped_error, self.alpha)
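############################### Training ####################################
# Prioritized-replay cycle used in the loop below (sketch):
#   M.store(max_p, transition)                     -> insert with the current max priority
#   tree_idx, b_M, ISWeights = M.prio_sample(n)    -> stratified sampling by priority
#   abs_td = critic.learn(b_s, b_a, b_r, b_s_, w)  -> returns per-sample |TD error|
#   M.update(idx, abs_td[i])                       -> re-prioritize the sampled leaves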
sess = tf.Session()
# Create actor and critic.
actor = Actor(sess, ACTION_DIM, ACTION_BOUND, LR_A, REPLACE_ITER_A)
critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)
M = Memory(MEMORY_CAPACITY)
saver = tf.train.Saver(max_to_keep=100)
if LOAD_MODEL:
    all_ckpt = tf.train.get_checkpoint_state(DATA_PATH, 'checkpoint').all_model_checkpoint_paths
saver.restore(sess, all_ckpt[-1])
else:
if os.path.isdir(DATA_PATH): shutil.rmtree(DATA_PATH)
os.mkdir(DATA_PATH)
sess.run(tf.global_variables_initializer())
if OUTPUT_GRAPH:
tf.summary.FileWriter('logs', graph=sess.graph)
var = 3 # control exploration
var_min = 0.01
dataarr = []
for i_episode in range(MAX_EPISODES):
# s = (hull angle speed, angular velocity, horizontal speed, vertical speed, position of joints and joints angular speed, legs contact with ground, and 10 lidar rangefinder measurements.)
s = env.reset()
ep_r = 0
while True:
if RENDER:
env.render()
a = actor.choose_action(s)
        a = np.clip(np.random.normal(a, var), -1, 1)  # Gaussian exploration noise, clipped to the action bounds
        s_, r, done, _ = env.step(a)  # r totals 300+ points for reaching the far end; a fall gives -100
        if r == -100: r = -2  # soften the fall penalty so single failures do not dominate the TD targets
ep_r += r
        transition = np.hstack((s, a, [r], s_))  # layout: [ s (24) | a (4) | r (1) | s_ (24) ]
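        # New transitions are stored with the current maximum leaf priority so
        # each one is sampled at least once before its true TD error is known.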
max_p = np.max(M.tree.tree[-M.tree.capacity:])
M.store(max_p, transition)
        if GLOBAL_STEP.eval(sess) > MEMORY_CAPACITY/20:  # start learning once 10,000 steps (capacity/20) have elapsed
            var = max([var*0.9999, var_min])    # decay the action randomness
tree_idx, b_M, ISWeights = M.prio_sample(BATCH_SIZE) # for critic update
            b_s = b_M[:, :STATE_DIM]                          # states
            b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]   # actions
            b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]          # rewards (kept 2-D, one column)
            b_s_ = b_M[:, -STATE_DIM:]                        # next states
abs_td = critic.learn(b_s, b_a, b_r, b_s_, ISWeights)
actor.learn(b_s)
            for idx, td in zip(tree_idx, abs_td):  # refresh priorities of the sampled leaves with the new |TD errors|
                M.update(idx, td)
if GLOBAL_STEP.eval(sess) % SAVE_MODEL_ITER == 0:
ckpt_path = os.path.join(DATA_PATH, 'DDPG.ckpt')
save_path = saver.save(sess, ckpt_path, global_step=GLOBAL_STEP, write_meta_graph=False)
print("\nSave Model %s\n" % save_path)
if done:
if "running_r" not in globals():
running_r = ep_r
else:
running_r = 0.95*running_r + 0.05*ep_r
#if running_r > DISPLAY_THRESHOLD: RENDER = True
#else: RENDER = False
stats = np.array([running_r, env.unwrapped.hull.position[0] ])
dataarr.append(stats)
            result = '| Achieve ' if env.unwrapped.hull.position[0] >= END_POINT else '| -----'
            print('Episode:', i_episode,
                  result,
                  '| Running_r: %i' % int(running_r),
                  '| Epi_r: %.2f' % ep_r,
                  '| Exploration: %.3f' % var,
                  '| Pos: %i' % int(env.unwrapped.hull.position[0]),
                  '| LR_A: %.6f' % sess.run(LR_A),
                  '| LR_C: %.6f' % sess.run(LR_C),
                  )
break
s = s_
sess.run(INCREASE_GS)
elapsed_time = time.process_time() - st
np.save("time_" + str(fold) + ".npy", np.array([elapsed_time]))  # CPU time for this run
dataarr = np.asarray(dataarr)
np.save("stats_" + str(fold) + ".npy", dataarr)  # per-episode (running reward, final x-position)