RofuncRL PPO (Proximal Policy Optimization)#
Paper: “Proximal Policy Optimization Algorithms”. John Schulman et al. 2017. https://arxiv.org/abs/1707.06347
1. Algorithm#
The PPO algorithm is a policy gradient method that uses a clipped surrogate objective function to constrain the policy update. The objective function is defined as:
\[L^{CLIP}(\theta) = \hat{\mathbb{E}}_t\left[\min\left(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}\left(r_t(\theta),\,1-\epsilon,\,1+\epsilon\right)\hat{A}_t\right)\right]\]
where the ratio \(r_t(\theta)\) refers to the ratio of the probability of the action under the new and old policies. It is defined as:
\[r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)}\]
and \(\hat{A}_t\) is the advantage estimate: an estimate of how much better or worse an action is compared to the average action at that state. Using generalized advantage estimation (GAE), it is defined as:
\[\hat{A}_t = \sum_{l=0}^{T-t-1} (\gamma\lambda)^l\,\delta_{t+l}^V, \qquad \delta_t^V = r_t + \gamma V(s_{t+1}) - V(s_t)\]
where \(\delta_t^V\) is the TD error and \(V(s_t)\) is the value function. The value function is updated by minimizing the squared error between the predicted value and the return target \(\hat{A}_t + V(s_t)\):
\[L^{V}(\phi) = \hat{\mathbb{E}}_t\left[\left(V_\phi(s_t) - \left(\hat{A}_t + V(s_t)\right)\right)^2\right]\]
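As a concrete illustration (independent of the agent code in Section 5), the clipped surrogate loss can be computed directly from log-probabilities and advantages. The function below is a minimal sketch; the tensor shapes and the clip value of 0.2 are illustrative defaults, not the RofuncRL configuration:

import torch

def ppo_clipped_loss(log_prob_new, log_prob_old, advantages, clip_eps=0.2):
    # probability ratio r_t(theta), computed in log space for numerical stability
    ratio = torch.exp(log_prob_new - log_prob_old)
    # unclipped and clipped surrogate terms
    surrogate = ratio * advantages
    surrogate_clipped = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    # maximizing the surrogate objective is minimizing its negative
    return -torch.min(surrogate, surrogate_clipped).mean()

# toy usage with random data
log_prob_old = torch.randn(64)
log_prob_new = log_prob_old + 0.1 * torch.randn(64)
advantages = torch.randn(64)
loss = ppo_clipped_loss(log_prob_new, log_prob_old, advantages)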
2. Demos#
2.1. CURICabinet#
python examples/learning_rl/example_CURICabinet_RofuncRL.py --inference
3. Performance comparison#
We compare the performance of the PPO algorithm with different tricks against an open-source baseline (SKRL). These experiments were conducted on the CURICabinet, FrankaCabinet, Humanoid, and Ant environments. The results are shown below:
3.1. CURICabinet#
Orange: SKRL PPO
Dark Blue: Rofunc PPO sharing the backbone in the policy network and value network
Red: Rofunc PPO with independent policy network and value network
Light Blue: Rofunc PPO with independent policy network and value network, using network initialization
Pink: Rofunc PPO with independent policy network and value network, using network initialization and entropy
3.2. FrankaCabinet#
Pink: SKRL PPO
Blue: Rofunc PPO
3.3. Humanoid#
Orange: SKRL PPO
Blue: Rofunc PPO
3.4. Ant#
Red: SKRL PPO
Blue: Rofunc PPO
4. Tricks#
4.1. Normalization#
State Normalization
Value Normalization
Advantage Normalization
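All three normalizations can be implemented with the same mechanism: a running estimate of mean and variance that standardizes the quantity being normalized (observations, value targets, or advantages). The class below is a generic sketch of such a running standardizer, not the exact preprocessor shipped with RofuncRL:

import torch

class RunningStandardizer:
    """Standardize inputs with a running mean and variance."""

    def __init__(self, shape, epsilon=1e-8):
        self.mean = torch.zeros(shape)
        self.var = torch.ones(shape)
        self.count = epsilon
        self.epsilon = epsilon

    def update(self, x):
        # parallel-algorithm update of the running mean and variance with a batch
        batch_mean = x.mean(dim=0)
        batch_var = x.var(dim=0, unbiased=False)
        batch_count = x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        self.mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        self.var = (m_a + m_b + delta ** 2 * self.count * batch_count / total) / total
        self.count = total

    def __call__(self, x, train=False):
        if train:
            self.update(x)
        return (x - self.mean) / torch.sqrt(self.var + self.epsilon)

During training the preprocessor is called with train=True so the statistics keep tracking the data distribution; at evaluation time the frozen statistics are reused.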
4.2. Reward Scaling#
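A common form of reward scaling divides every reward by a running standard deviation of the discounted return, which keeps the scale of the value targets roughly constant across tasks. The sketch below shows that standard trick under an assumed discount factor; it is not necessarily the exact RofuncRL implementation:

import math

class RewardScaler:
    """Scale rewards by the running std of the discounted return."""

    def __init__(self, gamma=0.99, epsilon=1e-8):
        self.gamma = gamma
        self.epsilon = epsilon
        self.ret = 0.0    # running discounted return
        self.mean = 0.0   # running mean of the return (Welford)
        self.m2 = 0.0     # running sum of squared deviations (Welford)
        self.count = 0

    def __call__(self, reward):
        # accumulate the discounted return and update its running variance
        self.ret = self.gamma * self.ret + reward
        self.count += 1
        delta = self.ret - self.mean
        self.mean += delta / self.count
        self.m2 += delta * (self.ret - self.mean)
        std = math.sqrt(self.m2 / self.count) if self.count > 1 else 1.0
        return reward / (std + self.epsilon)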
4.3. Network Initialization#
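Network initialization typically means initializing the linear layers of the policy and value networks with an orthogonal scheme: a gain of sqrt(2) for hidden layers and a small gain for the output layer, so that the initial policy stays near-uniform and the initial value estimates stay small. A hedged sketch using PyTorch's built-in initializers (layer sizes, activation, and gains are illustrative; the exact scheme used by RofuncRL may differ):

import torch.nn as nn

def make_mlp(in_dim, hidden_dim, out_dim, out_gain=0.01):
    net = nn.Sequential(
        nn.Linear(in_dim, hidden_dim), nn.ELU(),
        nn.Linear(hidden_dim, hidden_dim), nn.ELU(),
        nn.Linear(hidden_dim, out_dim),
    )
    # orthogonal initialization: sqrt(2) gain for hidden layers, small gain for the output layer
    linear_layers = [m for m in net if isinstance(m, nn.Linear)]
    for layer in linear_layers[:-1]:
        nn.init.orthogonal_(layer.weight, gain=2 ** 0.5)
        nn.init.zeros_(layer.bias)
    nn.init.orthogonal_(linear_layers[-1].weight, gain=out_gain)
    nn.init.zeros_(linear_layers[-1].bias)
    return net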
4.4. Entropy#
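The entropy trick subtracts a scaled entropy bonus from the total loss (entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy().mean() in the update function of Section 5), which discourages the policy from collapsing to a deterministic solution too early. For a diagonal Gaussian policy the entropy has a closed form; a minimal sketch, with an illustrative scale of 0.01:

import math
import torch

def gaussian_entropy(log_std):
    # entropy of a diagonal Gaussian, summed over action dimensions
    return (0.5 * (1.0 + math.log(2.0 * math.pi)) + log_std).sum(dim=-1)

log_std = torch.zeros(4)                           # illustrative per-dimension log std
entropy_loss = -0.01 * gaussian_entropy(log_std)   # added to the policy loss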
4.5. Learning Rate Scheduler#
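The update function in Section 5 optionally uses a KL-adaptive scheduler (KLAdaptiveRL): after each learning epoch the mean approximate KL divergence is passed to the scheduler, which lowers the learning rate when the policy has moved too far from the old one and raises it when it has barely moved. The rule below is a generic sketch of this mechanism; the target KL, factor, and bounds are illustrative, not the RofuncRL defaults:

def kl_adaptive_lr(lr, kl, kl_target=0.008, factor=1.5, lr_min=1e-6, lr_max=1e-2):
    # shrink the learning rate when the policy moved too far, grow it when it barely moved
    if kl > kl_target * 2.0:
        lr = max(lr / factor, lr_min)
    elif kl < kl_target * 0.5:
        lr = min(lr * factor, lr_max)
    return lr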
4.6. Gradient Clipping#
4.7. Activation Function#
4.8. Optimizer#
5. Network update function#
def update_net(self):
    """
    Update the network
    """
    '''Compute Generalized Advantage Estimator (GAE)'''
    values = self.memory.get_tensor_by_name("values")
    with torch.no_grad():
        if self.cfg.Model.use_same_model:
            next_values = self.value.get_value(self._state_preprocessor(self._current_next_states.float()))
        else:
            next_values = self.value(self._state_preprocessor(self._current_next_states.float()))
    next_values = self._value_preprocessor(next_values, inverse=True)

    advantage = 0
    advantages = torch.zeros_like(self.memory.get_tensor_by_name("rewards"))
    not_dones = self.memory.get_tensor_by_name("terminated").logical_not()
    memory_size = self.memory.get_tensor_by_name("rewards").shape[0]

    # advantages computation
    for i in reversed(range(memory_size)):
        next_values = values[i + 1] if i < memory_size - 1 else next_values
        advantage = self.memory.get_tensor_by_name("rewards")[i] - values[i] + self._discount * not_dones[i] * (
                next_values + self._td_lambda * advantage)
        advantages[i] = advantage
    # returns computation
    values_target = advantages + values
    # advantage normalization
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    self.memory.set_tensor_by_name("values", self._value_preprocessor(values, train=True))
    self.memory.set_tensor_by_name("returns", self._value_preprocessor(values_target, train=True))
    self.memory.set_tensor_by_name("advantages", advantages)

    '''Sample mini-batches from memory and update the network'''
    sampled_batches = self.memory.sample_all(names=self._tensors_names, mini_batches=self._mini_batch_size)

    cumulative_policy_loss = 0
    cumulative_entropy_loss = 0
    cumulative_value_loss = 0

    # learning epochs
    for epoch in range(self._learning_epochs):
        kl_divergences = []

        # mini-batches loop
        for i, (sampled_states, sampled_actions, sampled_dones, sampled_log_prob, sampled_values, sampled_returns,
                sampled_advantages) in enumerate(sampled_batches):
            sampled_states = self._state_preprocessor(sampled_states, train=not epoch)
            _, log_prob_now = self.policy(sampled_states, sampled_actions)

            # compute approximate KL divergence
            with torch.no_grad():
                ratio = log_prob_now - sampled_log_prob
                kl_divergence = ((torch.exp(ratio) - 1) - ratio).mean()
                kl_divergences.append(kl_divergence)

            # early stopping with KL divergence
            if self._kl_threshold and kl_divergence > self._kl_threshold:
                break

            # compute entropy loss
            entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy().mean()

            # compute policy loss
            ratio = torch.exp(log_prob_now - sampled_log_prob)
            surrogate = sampled_advantages * ratio
            surrogate_clipped = sampled_advantages * torch.clip(ratio, 1.0 - self._ratio_clip,
                                                                1.0 + self._ratio_clip)
            policy_loss = -torch.min(surrogate, surrogate_clipped).mean()

            # compute value loss
            if self.cfg.Model.use_same_model:
                predicted_values = self.value.get_value(sampled_states)
            else:
                predicted_values = self.value(sampled_states)

            if self._clip_predicted_values:
                predicted_values = sampled_values + torch.clip(predicted_values - sampled_values,
                                                               min=-self._value_clip,
                                                               max=self._value_clip)
            value_loss = self._value_loss_scale * F.mse_loss(sampled_returns, predicted_values)

            if self.policy is self.value:
                # optimization step
                self.optimizer.zero_grad()
                (policy_loss + entropy_loss + value_loss).backward()
                if self._grad_norm_clip > 0:
                    nn.utils.clip_grad_norm_(self.policy.parameters(), self._grad_norm_clip)
                self.optimizer.step()
            else:
                # Update policy network
                self.optimizer_policy.zero_grad()
                (policy_loss + entropy_loss).backward()
                if self._grad_norm_clip > 0:
                    nn.utils.clip_grad_norm_(self.policy.parameters(), self._grad_norm_clip)
                self.optimizer_policy.step()

                # Update value network
                self.optimizer_value.zero_grad()
                value_loss.backward()
                if self._grad_norm_clip > 0:
                    nn.utils.clip_grad_norm_(self.value.parameters(), self._grad_norm_clip)
                self.optimizer_value.step()

            # update cumulative losses
            cumulative_policy_loss += policy_loss.item()
            cumulative_value_loss += value_loss.item()
            if self._entropy_loss_scale:
                cumulative_entropy_loss += entropy_loss.item()

        # update learning rate
        if self._lr_scheduler:
            if self.policy is self.value:
                if isinstance(self.scheduler, KLAdaptiveRL):
                    self.scheduler.step(torch.tensor(kl_divergences).mean())
                else:
                    self.scheduler.step()
            else:
                if isinstance(self.scheduler_policy, KLAdaptiveRL):
                    self.scheduler_policy.step(torch.tensor(kl_divergences).mean())
                else:
                    self.scheduler_policy.step()
                if isinstance(self.scheduler_value, KLAdaptiveRL):
                    self.scheduler_value.step(torch.tensor(kl_divergences).mean())
                else:
                    self.scheduler_value.step()

    # record data
    self.track_data("Loss / Policy loss", cumulative_policy_loss / (self._learning_epochs * self._mini_batch_size))
    self.track_data("Loss / Value loss", cumulative_value_loss / (self._learning_epochs * self._mini_batch_size))
    if self._entropy_loss_scale:
        self.track_data("Loss / Entropy loss",
                        cumulative_entropy_loss / (self._learning_epochs * self._mini_batch_size))
    if self._lr_scheduler:
        if self.policy is self.value:
            self.track_data("Learning / Learning rate", self.scheduler.get_last_lr()[0])
        else:
            self.track_data("Learning / Learning rate (policy)", self.scheduler_policy.get_last_lr()[0])
            self.track_data("Learning / Learning rate (value)", self.scheduler_value.get_last_lr()[0])