"""DDPG with prioritized experience replay on gym's BipedalWalker (TF1 API).

Usage: ``python <script> <fold>`` where ``<fold>`` is an integer used only to
name the output files ``time_<fold>.npy`` and ``stats_<fold>.npy``.

NOTE(review): this file was recovered from a copy in which line breaks were
lost and several expressions were truncated (``self.lr``, ``sess.run(...)``,
``self.data``, ``np.save(...)``, ``M.store(...)``, ``saver.save(...)``).
Those spots were reconstructed from the surrounding code and are marked
"reconstructed" below -- please confirm against the upstream version.
"""
import tensorflow as tf
import numpy as np
import gym
import os
import shutil
import sys
import time

np.random.seed(1)
tf.set_random_seed(1)

fold = int(sys.argv[1])     # run index, only used in the output file names
st = time.process_time()

MAX_EPISODES = 1500
LR_A = 0.0005               # learning rate for actor
LR_C = 0.0005               # learning rate for critic
GAMMA = 0.999               # reward discount
REPLACE_ITER_A = 1700
REPLACE_ITER_C = 1500
MEMORY_CAPACITY = 200000
BATCH_SIZE = 32
DISPLAY_THRESHOLD = 100     # display until the running reward > 100
DATA_PATH = './data'
LOAD_MODEL = False
SAVE_MODEL_ITER = 100000
RENDER = False
OUTPUT_GRAPH = False
ENV_NAME = 'BipedalWalker-v2'

GLOBAL_STEP = tf.Variable(0, trainable=False)
INCREASE_GS = GLOBAL_STEP.assign(tf.add(GLOBAL_STEP, 1))
# From here on LR_A / LR_C are tensors: the initial rate decayed by 0.97
# every 10000 global steps.
LR_A = tf.train.exponential_decay(LR_A, GLOBAL_STEP, 10000, .97, staircase=True)
LR_C = tf.train.exponential_decay(LR_C, GLOBAL_STEP, 10000, .97, staircase=True)
END_POINT = (200 - 10) * (14/30)    # from game

env = gym.make(ENV_NAME)
env.seed(1)

STATE_DIM = env.observation_space.shape[0]      # 24
ACTION_DIM = env.action_space.shape[0]          # 4
ACTION_BOUND = env.action_space.high            # [1, 1, 1, 1]

# all placeholder for tf
with tf.name_scope('S'):
    S = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s')
with tf.name_scope('R'):
    R = tf.placeholder(tf.float32, [None, 1], name='r')
with tf.name_scope('S_'):
    S_ = tf.placeholder(tf.float32, shape=[None, STATE_DIM], name='s_')


###############################  Actor  ####################################

class Actor(object):
    """Policy network: an eval net fed by ``S`` and a target net fed by ``S_``.

    Args:
        sess: ``tf.Session`` used for every ``run`` call.
        action_dim: size of the action vector.
        action_bound: factor the tanh output is multiplied by.
        learning_rate: learning rate (scalar or tensor) for the policy update.
        t_replace_iter: the target net is overwritten with the eval net
            every ``t_replace_iter`` calls to :meth:`learn`.
    """

    def __init__(self, sess, action_dim, action_bound, learning_rate, t_replace_iter):
        self.sess = sess
        self.a_dim = action_dim
        self.action_bound = action_bound
        self.lr = learning_rate     # reconstructed: attribute name was lost
        self.t_replace_iter = t_replace_iter
        self.t_replace_counter = 0
        # Assign ops copying eval -> target; built on first use (see learn).
        self._replace_op = None

        with tf.variable_scope('Actor'):
            # input s, output a
            self.a = self._build_net(S, scope='eval_net', trainable=True)

            # input s_, output a, get a_ for critic
            self.a_ = self._build_net(S_, scope='target_net', trainable=False)

        self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/eval_net')
        self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/target_net')

    def _build_net(self, s, scope, trainable):
        """Build a 500-200-a_dim network on ``s``; return the scaled action tensor."""
        with tf.variable_scope(scope):
            init_w = tf.random_normal_initializer(0., 0.01)
            init_b = tf.constant_initializer(0.01)
            two = tf.constant(2.0, dtype=tf.float32)

            def _uaf_param(mean):
                # stddev=0.0, so every parameter starts exactly at `mean`.
                # The variable honours `trainable`; previously it was always
                # trainable, which left the target net's activation
                # parameters in TRAINABLE_VARIABLES where the critic's
                # `minimize` could update them.
                return tf.Variable(
                    tf.random_normal([1], mean=mean, stddev=0.0, dtype=tf.float32),
                    dtype=tf.float32, trainable=trainable)

            def UAF(x):
                """Parametric activation: softplus(P1) - softplus(P2) + E."""
                UAF_A = _uaf_param(1.1)
                UAF_B = _uaf_param(-0.01)
                UAF_C = _uaf_param(-0.1)
                UAF_D = _uaf_param(-0.9)
                UAF_E = _uaf_param(0.01)

                P1 = tf.multiply(UAF_A, (x + UAF_B)) + tf.multiply(UAF_C, tf.pow(x, two))
                P2 = tf.multiply(UAF_D, (x - UAF_B))

                # relu(p) + log1p(exp(-|p|)) is softplus written so that exp
                # never receives a large positive argument.
                P3 = tf.nn.relu(P1) + tf.math.log1p(tf.exp(-tf.abs(P1)))
                P4 = tf.nn.relu(P2) + tf.math.log1p(tf.exp(-tf.abs(P2)))
                return P3 - P4 + UAF_E

            net = tf.layers.dense(s, 500, activation=UAF,
                                  kernel_initializer=init_w, bias_initializer=init_b,
                                  name='l1', trainable=trainable)
            net = tf.layers.dense(net, 200, activation=UAF,
                                  kernel_initializer=init_w, bias_initializer=init_b,
                                  name='l2', trainable=trainable)

            with tf.variable_scope('a'):
                actions = tf.layers.dense(net, self.a_dim, activation=tf.nn.tanh,
                                          kernel_initializer=init_w, bias_initializer=init_b,
                                          name='a', trainable=trainable)
                # Scale output to -action_bound to action_bound
                scaled_a = tf.multiply(actions, self.action_bound, name='scaled_a')
        return scaled_a

    def learn(self, s):
        """Run one policy update on the state batch ``s``; sync target if due."""
        # reconstructed: the `self.sess.run(self.train_op` prefix was lost
        self.sess.run(self.train_op, feed_dict={S: s})
        if self.t_replace_counter % self.t_replace_iter == 0:
            if self._replace_op is None:
                # Build the assign ops once; rebuilding them on every
                # replacement keeps adding nodes to the graph.
                self._replace_op = [tf.assign(t, e)
                                    for t, e in zip(self.t_params, self.e_params)]
            self.sess.run(self._replace_op)
        self.t_replace_counter += 1

    def choose_action(self, s):
        """Return the eval net's action for the single state ``s``."""
        s = s[np.newaxis, :]    # single state
        # reconstructed: the `self.sess.run(self.a` prefix was lost
        return self.sess.run(self.a, feed_dict={S: s})[0]   # single action

    def add_grad_to_graph(self, a_grads):
        """Create ``self.train_op`` from the critic's dq/da tensor ``a_grads``."""
        with tf.variable_scope('policy_grads'):
            # ys = policy;
            # xs = policy's parameters;
            # self.a_grads = the gradients of the policy to get more Q
            # tf.gradients will calculate dys/dxs with a initial gradients for ys,
            # so this is dq/da * da/dparams
            self.policy_grads_and_vars = tf.gradients(ys=self.a, xs=self.e_params, grad_ys=a_grads)

        with tf.variable_scope('A_train'):
            # (- learning rate) for ascent policy
            # reconstructed: the optimizer argument was lost; `-self.lr`
            # follows from the comment above.
            opt = tf.train.RMSPropOptimizer(-self.lr)
            self.train_op = opt.apply_gradients(zip(self.policy_grads_and_vars, self.e_params),
                                                global_step=GLOBAL_STEP)


###############################  Critic  ####################################

class Critic(object):
    """Q network: eval net on ``(S, a)`` and target net on ``(S_, a_)``.

    Args:
        sess: ``tf.Session`` used for every ``run`` call.
        state_dim: size of the state vector.
        action_dim: size of the action vector.
        learning_rate: learning rate (scalar or tensor) for the TD update.
        gamma: discount applied to the target Q value.
        t_replace_iter: the target net is overwritten with the eval net
            every ``t_replace_iter`` calls to :meth:`learn`.
        a: action tensor fed to the eval net (also used as a feed key).
        a_: action tensor fed to the target net.
    """

    def __init__(self, sess, state_dim, action_dim, learning_rate, gamma, t_replace_iter, a, a_):
        self.sess = sess
        self.s_dim = state_dim
        self.a_dim = action_dim
        self.lr = learning_rate     # reconstructed: attribute name was lost
        self.gamma = gamma
        self.t_replace_iter = t_replace_iter
        self.t_replace_counter = 0
        # Assign ops copying eval -> target; built on first use (see learn).
        self._replace_op = None

        with tf.variable_scope('Critic'):
            # Input (s, a), output q
            self.a = a
            self.q = self._build_net(S, self.a, 'eval_net', trainable=True)

            # Input (s_, a_), output q_ for q_target
            # target_q is based on a_ from Actor's target_net
            self.q_ = self._build_net(S_, a_, 'target_net', trainable=False)

            self.e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/eval_net')
            self.t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/target_net')

        with tf.variable_scope('target_q'):
            self.target_q = R + self.gamma * self.q_

        with tf.variable_scope('abs_TD'):
            self.abs_td = tf.abs(self.target_q - self.q)
        self.ISWeights = tf.placeholder(tf.float32, [None, 1], name='IS_weights')
        with tf.variable_scope('TD_error'):
            self.loss = tf.reduce_mean(self.ISWeights * tf.squared_difference(self.target_q, self.q))

        with tf.variable_scope('C_train'):
            # reconstructed: `self.lr).minimize(self.loss` was lost.
            # NOTE(review): no var_list is passed, so every trainable
            # variable the loss depends on is updated -- confirm intended.
            self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss, global_step=GLOBAL_STEP)

        with tf.variable_scope('a_grad'):
            # tensor of gradients of each sample (None, a_dim)
            self.a_grads = tf.gradients(self.q, a)[0]

    def _build_net(self, s, a, scope, trainable):
        """Build a 700-20-1 ReLU network on ``(s, a)``; return the Q tensor."""
        with tf.variable_scope(scope):
            init_w = tf.random_normal_initializer(0., 0.01)
            init_b = tf.constant_initializer(0.01)
            # Unused in this network; kept so graph construction (and any
            # seed-dependent initialisation) stays exactly as before.
            two = tf.constant(2.0, dtype=tf.float32)

            def UAF2(x):
                return tf.nn.relu(x)

            with tf.variable_scope('l1'):
                n_l1 = 700
                # combine the action and states together in this way
                w1_s = tf.get_variable('w1_s', [self.s_dim, n_l1], initializer=init_w, trainable=trainable)
                w1_a = tf.get_variable('w1_a', [self.a_dim, n_l1], initializer=init_w, trainable=trainable)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=init_b, trainable=trainable)
                net = UAF2(tf.matmul(s, w1_s) + tf.matmul(a, w1_a) + b1)
            with tf.variable_scope('l2'):
                net = tf.layers.dense(net, 20, activation=UAF2,
                                      kernel_initializer=init_w, bias_initializer=init_b,
                                      name='l2', trainable=trainable)
            with tf.variable_scope('q'):
                # Q(s,a)
                q = tf.layers.dense(net, 1, kernel_initializer=init_w,
                                    bias_initializer=init_b, trainable=trainable)
        return q

    def learn(self, s, a, r, s_, ISW):
        """Run one TD update on a batch; return the per-sample ``|TD error|``."""
        # reconstructed: the `self.sess.run(` prefix was lost
        _, abs_td = self.sess.run(
            [self.train_op, self.abs_td],
            feed_dict={S: s, self.a: a, R: r, S_: s_, self.ISWeights: ISW})
        if self.t_replace_counter % self.t_replace_iter == 0:
            if self._replace_op is None:
                # Build the assign ops once; rebuilding them on every
                # replacement keeps adding nodes to the graph.
                self._replace_op = [tf.assign(t, e)
                                    for t, e in zip(self.t_params, self.e_params)]
            self.sess.run(self._replace_op)
        self.t_replace_counter += 1
        return abs_td


class SumTree(object):
    """Array-backed binary tree of priorities plus a ring buffer of data.

    This SumTree code is a modified version of third-party code (the source
    URL is missing from this copy of the file).
    Stores the data with its priority in tree and data frames.
    """
    data_pointer = 0

    def __init__(self, capacity):
        self.capacity = capacity    # for all priority values
        # [--------------Parent nodes-------------][-------leaves to record priority-------]
        #             size: capacity - 1                       size: capacity
        # Every node (parents included) starts at 1e-5.
        self.tree = np.zeros(2 * capacity - 1) + 1e-5
        # reconstructed: attribute name `self.data` was lost
        # [--------------data frame-------------]
        #             size: capacity
        self.data = np.zeros(capacity, dtype=object)    # for all transitions

    def add_new_priority(self, p, data):
        """Write ``data`` at the ring-buffer cursor with priority ``p``."""
        leaf_idx = self.data_pointer + self.capacity - 1

        self.data[self.data_pointer] = data     # update data_frame
        self.update(leaf_idx, p)                # update tree_frame

        self.data_pointer += 1
        if self.data_pointer >= self.capacity:  # replace when exceed the capacity
            self.data_pointer = 0

    def update(self, tree_idx, p):
        """Set node ``tree_idx`` to ``p`` and add the delta to its ancestors."""
        change = p - self.tree[tree_idx]

        self.tree[tree_idx] = p
        self._propagate_change(tree_idx, change)

    def _propagate_change(self, tree_idx, change):
        """change the sum of priority value in all parent nodes"""
        # Iterative walk to the root; a call on the root itself is a no-op
        # (the recursive form never terminated for tree_idx == 0).
        while tree_idx != 0:
            tree_idx = (tree_idx - 1) // 2
            self.tree[tree_idx] += change

    def get_leaf(self, lower_bound):
        """Return ``[leaf_idx, priority, data]`` for the leaf found by ``_retrieve``."""
        leaf_idx = self._retrieve(lower_bound)  # search the max leaf priority based on the lower_bound
        data_idx = leaf_idx - self.capacity + 1
        return [leaf_idx, self.tree[leaf_idx], self.data[data_idx]]

    def _retrieve(self, lower_bound, parent_idx=0):
        r"""Descend from ``parent_idx`` to a leaf and return its tree index.

        Tree structure and array storage:

        Tree index:
             0         -> storing priority sum
            / \
          1     2
         / \   / \
        3   4 5   6    -> storing priority for transitions

        Array type for storing:
        [0,1,2,3,4,5,6]
        """
        while True:
            left_child_idx = 2 * parent_idx + 1
            right_child_idx = left_child_idx + 1

            if left_child_idx >= len(self.tree):    # end search when no more child
                return parent_idx

            left_value = self.tree[left_child_idx]
            if left_value == self.tree[right_child_idx]:
                # Tie: pick a side at random, lower_bound is left untouched.
                parent_idx = np.random.choice([left_child_idx, right_child_idx])
            elif lower_bound <= left_value:
                # downward search, always search for a higher priority node
                parent_idx = left_child_idx
            else:
                lower_bound = lower_bound - left_value
                parent_idx = right_child_idx

    @property
    def root_priority(self):
        return self.tree[0]     # the root


class Memory(object):   # stored as ( s, a, r, s_ ) in SumTree
    """Prioritized replay buffer backed by a :class:`SumTree`.

    This code is a modified version of third-party code (the source URL is
    missing from this copy of the file).
    """
    epsilon = 0.001     # small amount to avoid zero priority
    alpha = 0.6         # [0~1] convert the importance of TD error to priority
    beta = 0.4          # importance-sampling, from initial value increasing to 1
    beta_increment_per_sampling = 1e-5  # annealing the bias
    abs_err_upper = 1   # for stability refer to paper

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def store(self, error, transition):
        """Add ``transition`` with the priority derived from ``error``."""
        p = self._get_priority(error)
        self.tree.add_new_priority(p, transition)

    def prio_sample(self, n):
        """Draw ``n`` transitions, one per equal-width priority segment.

        Returns:
            ``(batch_idx, batch_memory, ISWeights)``: tree indices of the
            sampled leaves, the stacked transitions, and the normalised
            importance-sampling weights as an ``(n, 1)`` array.
        """
        batch_idx, batch_memory, ISWeights = [], [], []
        segment = self.tree.root_priority / n
        self.beta = min(1., self.beta + self.beta_increment_per_sampling)   # max = 1

        min_prob = np.min(self.tree.tree[-self.tree.capacity:]) / self.tree.root_priority
        maxiwi = np.power(self.tree.capacity * min_prob, -self.beta)    # for later normalizing ISWeights
        for i in range(n):
            lower_bound = np.random.uniform(segment * i, segment * (i + 1))
            while True:
                idx, p, data = self.tree.get_leaf(lower_bound)
                if not isinstance(data, int):
                    break
                # An empty slot still holds the integer 0 it was created
                # with: step back one segment and draw again.
                i -= 1
                lower_bound = np.random.uniform(segment * i, segment * (i + 1))
            prob = p / self.tree.root_priority
            ISWeights.append(self.tree.capacity * prob)
            batch_idx.append(idx)
            batch_memory.append(data)

        ISWeights = np.vstack(ISWeights)
        ISWeights = np.power(ISWeights, -self.beta) / maxiwi    # normalize
        return batch_idx, np.vstack(batch_memory), ISWeights

    def random_sample(self, n):
        """Return ``n`` uniformly drawn slots of the data frame, stacked."""
        # reconstructed: the end of the randint call and `self.tree.data`
        # were lost. NOTE(review): slots that were never written still hold 0.
        idx = np.random.randint(0, self.tree.capacity, size=n, dtype=int)
        return np.vstack(self.tree.data[idx])

    def update(self, idx, error):
        """Re-prioritise the leaf at tree index ``idx`` from ``error``."""
        p = self._get_priority(error)
        self.tree.update(idx, p)

    def _get_priority(self, error):
        """Map ``error`` to ``clip(error + epsilon, 0, abs_err_upper) ** alpha``."""
        # Not `+=`: that would write into the caller's array in place.
        error = error + self.epsilon    # avoid 0
        clipped_error = np.clip(error, 0, self.abs_err_upper)
        return np.power(clipped_error, self.alpha)


sess = tf.Session()

# Create actor and critic.
actor = Actor(sess, ACTION_DIM, ACTION_BOUND, LR_A, REPLACE_ITER_A)
critic = Critic(sess, STATE_DIM, ACTION_DIM, LR_C, GAMMA, REPLACE_ITER_C, actor.a, actor.a_)
actor.add_grad_to_graph(critic.a_grads)

M = Memory(MEMORY_CAPACITY)

saver = tf.train.Saver(max_to_keep=100)

if LOAD_MODEL:
    all_ckpt = tf.train.get_checkpoint_state('./data', 'checkpoint').all_model_checkpoint_paths
    saver.restore(sess, all_ckpt[-1])
else:
    if os.path.isdir(DATA_PATH):
        shutil.rmtree(DATA_PATH)
    os.mkdir(DATA_PATH)
    # reconstructed: no initializer call survived in this copy, but the
    # variables must be initialised before the first sess.run on the nets.
    sess.run(tf.global_variables_initializer())

if OUTPUT_GRAPH:
    tf.summary.FileWriter('logs', graph=sess.graph)

var = 3         # control exploration
var_min = 0.01

dataarr = []
running_r = None    # running average of the episode reward; set after episode 0
for i_episode in range(MAX_EPISODES):
    # s = (hull angle speed, angular velocity, horizontal speed, vertical speed,
    #      position of joints and joints angular speed, legs contact with ground,
    #      and 10 lidar rangefinder measurements.)
    s = env.reset()
    ep_r = 0
    while True:
        if RENDER:
            env.render()
        a = actor.choose_action(s)
        a = np.clip(np.random.normal(a, var), -1, 1)    # add randomness to action selection for exploration
        s_, r, done, _ = env.step(a)    # r = total 300+ points up to the far end. If the robot falls, it gets -100.
        if r == -100:
            r = -2
        ep_r += r

        transition = np.hstack((s, a, [r], s_))
        # New transitions enter with the current maximum leaf priority.
        max_p = np.max(M.tree.tree[-M.tree.capacity:])
        M.store(max_p, transition)      # reconstructed: `M.store(max_p` was lost

        if GLOBAL_STEP.eval(sess) > MEMORY_CAPACITY/20:
            var = max([var*0.9999, var_min])    # decay the action randomness
            tree_idx, b_M, ISWeights = M.prio_sample(BATCH_SIZE)    # for critic update
            b_s = b_M[:, :STATE_DIM]
            b_a = b_M[:, STATE_DIM: STATE_DIM + ACTION_DIM]
            b_r = b_M[:, -STATE_DIM - 1: -STATE_DIM]
            b_s_ = b_M[:, -STATE_DIM:]

            abs_td = critic.learn(b_s, b_a, b_r, b_s_, ISWeights)
            actor.learn(b_s)
            for idx, td in zip(tree_idx, abs_td):   # update priority
                M.update(idx, td)

        if GLOBAL_STEP.eval(sess) % SAVE_MODEL_ITER == 0:
            ckpt_path = os.path.join(DATA_PATH, 'DDPG.ckpt')
            # reconstructed: `saver.save(sess` was lost
            save_path = saver.save(sess, ckpt_path, global_step=GLOBAL_STEP, write_meta_graph=False)
            print("\nSave Model %s\n" % save_path)

        if done:
            if running_r is None:
                running_r = ep_r
            else:
                running_r = 0.95*running_r + 0.05*ep_r
            # if running_r > DISPLAY_THRESHOLD: RENDER = True
            # else: RENDER = False

            hull_x = env.unwrapped.hull.position[0]
            stats = np.array([running_r, hull_x])
            dataarr.append(stats)

            status = '| Achieve ' if hull_x >= END_POINT else '| -----'
            # reconstructed: the two `sess.run(LR_*)` arguments were lost
            print('Episode:', i_episode,
                  status,
                  '| Running_r: %i' % int(running_r),
                  '| Epi_r: %.2f' % ep_r,
                  '| Exploration: %.3f' % var,
                  '| Pos: %.i' % int(hull_x),
                  '| LR_A: %.6f' % sess.run(LR_A),
                  '| LR_C: %.6f' % sess.run(LR_C),
                  )
            break

        s = s_
        # reconstructed: the training gate above reads GLOBAL_STEP, and
        # nothing else advances it before training starts; INCREASE_GS is
        # otherwise unused.
        sess.run(INCREASE_GS)

elapsed_time = time.process_time() - st
data_dir = "time_" + str(fold) + ".npy"
np.save(data_dir, np.array([elapsed_time]))     # reconstructed: `np.save(data_dir` was lost

dataarr = np.asarray(dataarr)
data_dir = "stats_" + str(fold) + ".npy"
np.save(data_dir, dataarr)                      # reconstructed: `np.save(data_dir` was lost