Skip to content
Open

RL #2

Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
5d8e174
Move to container edge. Commented out lines are buggy
htqcheng Apr 5, 2020
01a3752
Different States
htqcheng Apr 9, 2020
e2ed4ee
Lifting the red container
htqcheng Apr 12, 2020
f7d2b02
Quaternions?
htqcheng Apr 12, 2020
3247b4c
Quaternion 2
htqcheng Apr 12, 2020
6ff6b25
Grab Red Container but not lifting
htqcheng Apr 14, 2020
653fa30
move container above big container
Apr 14, 2020
d98fc9f
Changed Lateral Movement amount
htqcheng Apr 14, 2020
d1c37c3
Simulated Grasp
htqcheng Apr 14, 2020
03ae5cf
picks up box and flips
Apr 14, 2020
18d525b
added offset
Apr 16, 2020
53d395a
puts box down and release
Apr 16, 2020
8fc77fe
added initial function wrapper
Apr 22, 2020
9aede8b
merged some of bryson's code
Apr 25, 2020
9afbd52
changed to absolute code
brysonjones Apr 25, 2020
59db086
cycles from picking up objects to dumping them
Apr 25, 2020
7915923
tuning and many little bugs
Apr 25, 2020
e5d7c2f
remove unnecessary file and have the arm pick up all 3 objects
brysonjones Apr 26, 2020
be1dbfc
added some of the code to check if the objects are inside the large c…
brysonjones Apr 26, 2020
40f751c
code should work to check if objects are within bound, but need to st…
brysonjones Apr 26, 2020
cb1f07d
a few bugs cleaned up
brysonjones Apr 27, 2020
e1fc93d
have close dimensions for the large container inserted. within bounds…
brysonjones Apr 27, 2020
7f0c662
added code for passing out shapes to be reset
brysonjones Apr 27, 2020
0886153
added loop to pick up extra objects
Apr 27, 2020
717aa84
end state
Apr 27, 2020
525737d
tuned large container size a little bit. appears to consistently work…
brysonjones Apr 27, 2020
de0059a
final tweak to length and width of large container
brysonjones Apr 27, 2020
a6ab45f
fixed end state and changed parameters
Apr 27, 2020
b9b675e
Cleaned up CV2 imports
htqcheng Apr 27, 2020
40f76ad
Works now
htqcheng Apr 27, 2020
916e0e4
working state
Apr 27, 2020
cad60f1
Delete helper.cpython-36.pyc
htqcheng Apr 27, 2020
93d2884
Merge branch 'manipulation' of https://github.com/htqcheng/Robot_Auto…
Apr 27, 2020
cf897ce
extra if statement
Apr 27, 2020
ba69d12
Delete helper.cpython-36.pyc
htqcheng Apr 27, 2020
82f1c2a
fixed detection
Apr 27, 2020
466afc9
Merge branch 'manipulation' of https://github.com/htqcheng/Robot_Auto…
Apr 27, 2020
cdb102f
more likely to pick up box
Apr 27, 2020
08737ca
TensorForce Files Layout
htqcheng Apr 25, 2020
24ba052
Changes
htqcheng Apr 25, 2020
e5af877
cleaned up
htqcheng Apr 25, 2020
83d4673
Continue
htqcheng Apr 26, 2020
1faa67b
Keep going
htqcheng Apr 27, 2020
25efcaa
Wait
htqcheng Apr 27, 2020
728c5de
Keep going
htqcheng Apr 27, 2020
9788473
RL Working
htqcheng Apr 27, 2020
cad1758
Finished RL loop, can keep running to train
htqcheng Apr 27, 2020
81796cc
More Training
htqcheng Apr 27, 2020
745f7b4
Update requirements.txt
htqcheng Apr 27, 2020
80fa30c
Update README.md
htqcheng Apr 27, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ Please use Python 3.6
1. Install [PyRep](https://github.com/stepjam/PyRep)
2. Install [RLBench](https://github.com/stepjam/RLBench)
3. `pip install -r requirements.txt`
4. The `requirements.txt` includes TensorForce and TensorFlow for reinforcement learning

## Example RLBench Usage
Run `python rlbench_example.py` to launch the example script.
Expand All @@ -25,4 +26,4 @@ This script contains example code on how to control the robot, get observations,
## Useful Files
The following files in the `rlbench` folder of the `RLBench` repo may be useful to reference:
* `rlbench/action_modes.py` - Different action modes to control the robot
* `rlbench/backend/observation.py` - All fields available in the observation object
* `rlbench/backend/observation.py` - All fields available in the observation object
29 changes: 29 additions & 0 deletions TensorForceFiles/DQN_class.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
from tensorforce import Agent
import sys
sys.path.append('../')
sys.path.append(sys.path[0] + '/TensorForceFiles')
from TensorForce_class import *
import numpy as np


class TensorForceDQN(TensorForceClass):
    """DQN-flavoured variant of TensorForceClass.

    Only agent construction differs from the parent: a Tensorforce ``dqn``
    agent is built instead of the generic ``tensorforce`` agent. State and
    action dimensions, exploration, and reward shaping come from the base
    class.
    """

    def __init__(self, num_states=6, num_actions=4, load=None):
        # The parent __init__ already stores num_states/num_actions and
        # calls createRLagent(), so no extra bookkeeping is needed here
        # (the original re-assigned both attributes redundantly).
        super().__init__(num_states=num_states, num_actions=num_actions, load=load)

    def createRLagent(self, load=None):
        """Create (and optionally restore) the DQN agent.

        Args:
            load: Optional checkpoint directory to restore the agent from.
                  The original override accepted this argument but silently
                  ignored it, so a saved model was never reloaded; it is now
                  honoured, matching the parent-class behaviour.

        Returns:
            A Tensorforce ``Agent`` instance.
        """
        states_dict = {'type': 'float', 'shape': self.num_states}
        # NOTE(review): Tensorforce's DQN expects a *discrete* action space;
        # a 'float' action spec is likely rejected at agent construction.
        # Kept as-is to preserve the existing interface — confirm against
        # the Tensorforce version pinned in requirements.txt.
        actions_dict = {'type': 'float', 'shape': self.num_actions,
                        'min_value': self.input_low, 'max_value': self.input_high}

        agent = Agent.create(
            agent='dqn',
            states=states_dict,
            actions=actions_dict,
            memory=10000,
            exploration=0.3,
            max_episode_timesteps=self.len_episode,
        )

        # Restore a previously saved checkpoint, mirroring the parent class.
        if load is not None:
            agent.restore(directory=load)

        return agent
118 changes: 118 additions & 0 deletions TensorForceFiles/TensorForce_class.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
from tensorforce import Agent
import numpy as np


class TensorForceClass:
    """Base wrapper around a Tensorforce agent that maps RLBench
    observations to end-effector actions.

    The agent emits actions in [0, 1] which scaleActions() rescales to
    workspace coordinates. calculateReward() shapes a reward from the
    change in gripper-to-target distance recorded around each step.
    """

    def __init__(self, num_states=6, num_actions=4, load=None):
        # Dimensions of the flattened state/action vectors fed to the agent.
        self.num_states = num_states
        self.num_actions = num_actions
        # Raw agent outputs are bounded to [input_low, input_high].
        self.input_high = 1.0
        self.input_low = 0.0

        self.len_episode = 10
        # Probability of replacing the agent's action with a random one.
        self.explore = 0.5

        # Reachable workspace bounds (presumably metres in the robot base
        # frame — TODO confirm against the RLBench scene).
        self.x_r = [-0.025, 0.52]   # X range: -0.025 .. 0.52
        self.y_r = [-0.45, 0.45]    # Y range: -0.45 .. 0.45
        self.z_r = [0.751, 1.75]    # Z range: 0.751 .. 1.75 (maybe a little higher)

        # Gripper-to-target distances measured before/after each environment
        # step; consumed by calculateReward().
        self.dist_before_action = 0
        self.dist_after_action = 0

        self.has_object = False

        self.agent = self.createRLagent(load=load)
        self.target_state = []

    def createRLagent(self, load=None):
        """Build the Tensorforce policy-gradient agent.

        Args:
            load: Optional checkpoint directory to restore the agent from.

        Returns:
            A Tensorforce ``Agent`` instance.
        """
        states_dict = {'type': 'float', 'shape': self.num_states}
        actions_dict = {'type': 'float', 'shape': self.num_actions,
                        'min_value': self.input_low, 'max_value': self.input_high}

        agent = Agent.create(
            agent='tensorforce',
            states=states_dict,
            actions=actions_dict,
            memory=10000,
            update=dict(unit='timesteps', batch_size=64),
            max_episode_timesteps=self.len_episode,
            optimizer=dict(type='adam', learning_rate=3e-4),
            policy=dict(network='auto'),
            objective='policy_gradient',
            reward_estimation=dict(horizon=20)
        )

        # Idiomatic identity check (was `if not load ==None`).
        if load is not None:
            agent.restore(directory=load)

        return agent

    def act(self, obs, obj_poses):
        """Compute an RLBench action for the current observation.

        Args:
            obs:       RLBench observation; only ``gripper_pose`` is read.
            obj_poses: dict mapping object names to poses.

        Returns:
            List [x, y, z, qx, qy, qz, qw, gripper_open] — a fixed
            orientation quaternion with a learned position and grip flag.
        """
        gripper_pose = obs.gripper_pose

        key = 'sugar'
        # ---- prepare the input state vector for the RL agent ----
        if key in obj_poses:
            target_state = list(obj_poses[key])
            target_state[2] += 0.1  # aim slightly above the object
        else:
            # Object no longer detected: assume it has been grasped and
            # steer towards a fixed drop location.
            self.has_object = True
            target_state = [0.2, 0.0, 1.1]

        in_states = list(gripper_pose[:3])
        in_states.extend(list(target_state[:3]))
        # ---------------------------------------------------------

        actions = self.agent.act(states=in_states)
        # Epsilon-greedy style exploration on the continuous actions.
        if self.explore > np.random.uniform():
            actions = np.random.uniform(low=0.25, high=0.75, size=self.num_actions)

        a_in = self.scaleActions(actions)

        # Fixed orientation quaternion [0, 1, 0, 0]; the last entry is the
        # gripper open/close flag derived from the fourth action channel.
        actions2 = list(a_in[:3]) + [0, 1, 0, 0] + list([actions[3] > 0.5])

        # Record the pre-step distance for reward shaping. asarray() keeps
        # the subtraction valid whether target_state is a list or an array.
        self.dist_before_action = np.linalg.norm(
            np.asarray(target_state[:3]) - np.asarray(gripper_pose[:3]))
        return actions2

    def scaleActions(self, actions):
        """Rescale agent outputs in [0, 1] to workspace x/y/z coordinates.

        Mutates *actions* in place and returns it.
        """
        actions[0] = actions[0] * (self.x_r[1] - self.x_r[0]) + self.x_r[0]
        actions[1] = actions[1] * (self.y_r[1] - self.y_r[0]) + self.y_r[0]
        actions[2] = actions[2] * (self.z_r[1] - self.z_r[0]) + self.z_r[0]

        return actions

    def calculateReward(self):
        """Shaped reward based on the distances recorded by act().

        Returns:
            (reward, terminal) — terminal is True once the object is held.

        Distances are guarded with a small epsilon so that a zero distance
        no longer raises ZeroDivisionError (the original divided by the raw
        distances unguarded).
        """
        eps = 1e-6
        terminal = False
        # Mild penalty proportional to how far the gripper started out.
        reward = -self.dist_before_action / 4

        # Large bonus for being close, growing as the gripper closes in.
        if self.dist_after_action < 0.2:
            reward += 20 + 1 / max(self.dist_after_action, eps)

        # Reward proportional progress towards the target; bounded penalty
        # for any move that increased the distance.
        temp = (self.dist_before_action - self.dist_after_action) \
            / max(self.dist_before_action, eps) * 3
        if temp > 0:
            reward += temp
        else:
            reward += min(temp, -0.1)

        if self.has_object:
            reward += 100.0
            terminal = True

        return reward, terminal

Binary file not shown.
Binary file not shown.
Binary file not shown.
127 changes: 127 additions & 0 deletions TensorForceFiles/dqn_grasp_class.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
from tensorforce import Agent
import sys
sys.path.append('../')
sys.path.append(sys.path[0] + '/TensorForceFiles')
from DQN_class import *
import numpy as np


class DQN_grasp(TensorForceDQN):
    """DQN-based grasping policy.

    Learns small positional offsets (and yaw + grip) around a detected
    target object; rewarded for shrinking the gripper-to-target distance
    and for ending up holding the object.
    """

    def __init__(self, num_actions=5, num_states=21, load=None):
        self.num_states = num_states    # gripper pose + object pose + container pose
        self.num_actions = num_actions  # X, Y, Z, yaw, grasp
        super().__init__(num_states=self.num_states, num_actions=self.num_actions, load=load)

        # Action-scaling ranges. X/Y are tiny *offsets* added to the target
        # position in act(), not absolute coordinates.
        self.x_r = [-.001, .001]
        self.y_r = [-.001, .001]
        # NOTE(review): this z range is inverted (high < low), so the scaled
        # z *decreases* as the raw action grows; the inline comment in the
        # original claimed 0.751-1.75 — confirm which is intended.
        self.z_r = [0.752, 0.7]
        self.yaw_r = [0, np.pi]
        self.gripper_open = True
        self.target_start_pose = [0, 0, 0]
        self.ee_pos = [0, 0, 0]
        self.explore = 0.3
        self.target_num = 0
        self.target_name = ''

    def act(self, obs, obj_poses, key='sugar'):
        """Compute an RLBench action aimed at the object named *key*.

        Args:
            obs:       RLBench observation; ``gripper_pose`` is read.
            obj_poses: dict of object name -> pose; must contain
                       'large_container'.
            key:       name of the target object.

        Returns:
            List [x, y, z, qx, qy, qz, qw, gripper_open].

        Raises:
            ValueError: if ``num_actions`` is neither 5 nor 3 (the original
            fell through to a NameError on ``actions2``).
        """
        gripper_pose = obs.gripper_pose
        large_container_state = obj_poses['large_container']
        self.ee_pos = gripper_pose
        # ---- prepare the input state vector for the RL agent ----
        if key in obj_poses:
            target_state = list(obj_poses[key])
            self.has_object = False
        else:
            # Object no longer detected: assume it is in the gripper.
            self.has_object = True
            # BUGFIX: copy the pose — the original aliased the gripper_pose
            # array and the += below mutated obs.gripper_pose in place.
            target_state = list(gripper_pose)
            # NOTE(review): index 3 is a quaternion component of the pose,
            # not a position — confirm this offset is intended (the parent
            # class offsets index 2, the z coordinate).
            target_state[3] += 0.1

        in_states = list(gripper_pose)
        in_states.extend(list(target_state))
        in_states.extend(list(large_container_state))
        # ---------------------------------------------------------

        actions = self.agent.act(states=in_states)

        # Epsilon-greedy style exploration on the raw actions.
        if self.explore > np.random.uniform():
            actions = np.random.uniform(low=0.0, high=1, size=self.num_actions)

        a_in = self.scaleActions(actions)
        self.gripper_open = a_in[-1] > 0.3

        if self.num_actions == 5:
            # x/y actions are small offsets around the target position.
            a_in[:2] += target_state[:2]
            self.ee_pos = a_in[:3]
            actions2 = list(self.ee_pos) \
                + self.calculateQuaternion(a_in[3]) + list([self.gripper_open])
        elif self.num_actions == 3:
            # Only z and yaw are learned; x/y snap to the target.
            self.ee_pos = [target_state[0], target_state[1], a_in[0]]
            actions2 = list(self.ee_pos) \
                + self.calculateQuaternion(a_in[1]) + list([self.gripper_open])
        else:
            raise ValueError(
                "DQN_grasp supports num_actions of 5 or 3, got "
                + str(self.num_actions))

        # Clamp so the progress-based reward never divides by zero.
        self.dist_before_action = max(
            0.05,
            np.linalg.norm(np.asarray(target_state[:3]) - np.asarray(gripper_pose[:3])))
        return actions2

    def scaleActions(self, actions):
        """Rescale raw agent outputs in [0, 1] into workspace units in place.

        With 5 actions the layout is [x offset, y offset, z, yaw, grasp];
        otherwise the first two entries are [z, yaw].
        """
        if self.num_actions == 5:
            actions[0] = actions[0] * (self.x_r[1] - self.x_r[0]) + self.x_r[0]
            actions[1] = actions[1] * (self.y_r[1] - self.y_r[0]) + self.y_r[0]
            actions[2] = actions[2] * (self.z_r[1] - self.z_r[0]) + self.z_r[0]
            actions[3] = actions[3] * (self.yaw_r[1] - self.yaw_r[0]) + self.yaw_r[0]
        else:
            actions[0] = actions[0] * (self.z_r[1] - self.z_r[0]) + self.z_r[0]
            actions[1] = actions[1] * (self.yaw_r[1] - self.yaw_r[0]) + self.yaw_r[0]

        # Once the object is held, force the gripper to stay closed.
        if self.has_object:
            actions[-1] = 0

        return actions

    def calculateReward(self, i):
        """Shaped reward for step index *i* of the episode.

        Args:
            i: step index; subtracted as a per-step time penalty.

        Returns:
            (reward, terminal) — terminal is True once the object is held.
        """
        reward = 0
        terminal = False
        reward -= i  # time penalty: later steps are worth less

        delta_dist = self.dist_before_action - self.dist_after_action
        temp = delta_dist / self.dist_before_action * 3

        if delta_dist > 0:
            # BUGFIX: was `reward = temp`, which silently discarded the time
            # penalty above; accumulate like the other branch (and like the
            # parent class) does.
            reward += temp
        else:
            reward += min(temp, -0.1)

        if self.has_object:
            reward += 100
            print("Reward after grasping: ", reward)
            terminal = True
        print(self.dist_after_action)

        # Penalise closing the gripper while still far from the target.
        if not self.gripper_open and self.dist_before_action > 0.1:
            reward -= 20

        # Reward keeping the gripper open during the approach.
        if self.gripper_open and self.dist_before_action > 0.1:
            reward += 10

        # Small penalty for being closed on nothing.
        if not self.gripper_open and not self.has_object:
            reward -= 3

        return reward, terminal

    def calculateQuaternion(self, angle):
        """Return a quaternion [x, y, z, w] for a top-down grasp rotated by
        *angle* (yaw, radians)."""
        firstElement = np.sin(angle / 2)
        secondElement = -np.cos(angle / 2)
        return [firstElement, secondElement, 0, 0]



Loading