RofuncRL A2C (Advantage Actor-Critic)
Paper: “Asynchronous Methods for Deep Reinforcement Learning”. Mnih et al. 2016. https://arxiv.org/abs/1602.01783
1. Algorithm
def update_net(self):
    """
    Update the network
    """
    '''Compute Generalized Advantage Estimator (GAE)'''
    values = self.memory.get_tensor_by_name("values")

    with torch.no_grad():
        self.value.train(False)
        next_values = self.value(self._state_preprocessor(self._current_next_states.float()))
        self.value.train(True)
    next_values = self._value_preprocessor(next_values, inverse=True)

    advantage = 0
    advantages = torch.zeros_like(self.memory.get_tensor_by_name("rewards"))
    not_dones = self.memory.get_tensor_by_name("terminated").logical_not()
    memory_size = self.memory.get_tensor_by_name("rewards").shape[0]
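    # GAE is computed backwards over the rollout:
    #   delta_i = r_i + gamma * (1 - done_i) * V(s_{i+1}) - V(s_i)
    #   A_i     = delta_i + gamma * lambda * (1 - done_i) * A_{i+1}
    # and the value targets ("returns") are recovered as A_i + V(s_i)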
    # advantages computation
    for i in reversed(range(memory_size)):
        next_values = values[i + 1] if i < memory_size - 1 else next_values
        advantage = self.memory.get_tensor_by_name("rewards")[i] - values[i] + self._discount * not_dones[i] * (
                next_values + self._td_lambda * advantage)
        advantages[i] = advantage
    # returns computation
    values_target = advantages + values
    # advantage normalization
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

    self.memory.set_tensor_by_name("values", self._value_preprocessor(values, train=True))
    self.memory.set_tensor_by_name("returns", self._value_preprocessor(values_target, train=True))
    self.memory.set_tensor_by_name("advantages", advantages)
    '''Sample mini-batches from memory and update the network'''
    sampled_batches = self.memory.sample_all(names=self._tensors_names, mini_batches=self._mini_batch_size)

    cumulative_policy_loss = 0
    cumulative_entropy_loss = 0
    cumulative_value_loss = 0

    # learning epochs
    for epoch in range(self._learning_epochs):
        kl_divergences = []

        # mini-batches loop
        for i, (sampled_states, sampled_actions, sampled_dones, sampled_log_prob, sampled_values, sampled_returns,
                sampled_advantages) in enumerate(sampled_batches):
            sampled_states = self._state_preprocessor(sampled_states, train=not epoch)

            _, log_prob_now = self.policy(sampled_states, sampled_actions)
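            # the estimator ((exp(r) - 1) - r), with r the log-probability ratio between the current
            # and the old policy, gives a non-negative approximation of the KL divergence;
            # it is only used to drive the KL-adaptive learning-rate scheduler below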
            # compute approximate KL divergence
            with torch.no_grad():
                ratio = log_prob_now - sampled_log_prob
                kl_divergence = ((torch.exp(ratio) - 1) - ratio).mean()
                kl_divergences.append(kl_divergence)
            # compute entropy loss
            entropy_loss = -self._entropy_loss_scale * self.policy.get_entropy().mean()

            # compute policy loss
            policy_loss = -(sampled_advantages * log_prob_now).mean()
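            # note: A2C uses the plain advantage-weighted log-probability objective;
            # there is no PPO-style importance ratio or surrogate clipping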
            # compute value loss
            predicted_values = self.value(sampled_states)
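            # optionally clip the new value predictions to stay within +/- value_clip of the
            # old value estimates, limiting how far the value function moves per update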
            if self._clip_predicted_values:
                predicted_values = sampled_values + torch.clip(predicted_values - sampled_values,
                                                               min=-self._value_clip,
                                                               max=self._value_clip)
            value_loss = F.mse_loss(sampled_returns, predicted_values)
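            # a single optimizer is used when the policy and value function share one network;
            # otherwise the two networks are updated with separate optimizers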
            if self.policy is self.value:
                # optimization step
                self.optimizer.zero_grad()
                (policy_loss + entropy_loss + value_loss).backward()
                if self._grad_norm_clip > 0:
                    nn.utils.clip_grad_norm_(self.policy.parameters(), self._grad_norm_clip)
                self.optimizer.step()
            else:
                # Update policy network
                self.optimizer_policy.zero_grad()
                (policy_loss + entropy_loss).backward()
                if self._grad_norm_clip > 0:
                    nn.utils.clip_grad_norm_(self.policy.parameters(), self._grad_norm_clip)
                self.optimizer_policy.step()

                # Update value network
                self.optimizer_value.zero_grad()
                value_loss.backward()
                if self._grad_norm_clip > 0:
                    nn.utils.clip_grad_norm_(self.value.parameters(), self._grad_norm_clip)
                self.optimizer_value.step()

            # update cumulative losses
            cumulative_policy_loss += policy_loss.item()
            cumulative_value_loss += value_loss.item()
            if self._entropy_loss_scale:
                cumulative_entropy_loss += entropy_loss.item()
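        # after each epoch, a KL-adaptive scheduler adjusts the learning rate from the
        # mean approximate KL of that epoch; other schedulers simply step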
        # update learning rate
        if self._lr_scheduler:
            if self.policy is self.value:
                if isinstance(self.scheduler, KLAdaptiveRL):
                    self.scheduler.step(torch.tensor(kl_divergences).mean())
                else:
                    self.scheduler.step()
            else:
                if isinstance(self.scheduler_policy, KLAdaptiveRL):
                    self.scheduler_policy.step(torch.tensor(kl_divergences).mean())
                else:
                    self.scheduler_policy.step()
                if isinstance(self.scheduler_value, KLAdaptiveRL):
                    self.scheduler_value.step(torch.tensor(kl_divergences).mean())
                else:
                    self.scheduler_value.step()
    # record data
    self.track_data("Loss / Policy loss", cumulative_policy_loss / (self._learning_epochs * self._mini_batch_size))
    self.track_data("Loss / Value loss", cumulative_value_loss / (self._learning_epochs * self._mini_batch_size))
    if self._entropy_loss_scale:
        self.track_data("Loss / Entropy loss",
                        cumulative_entropy_loss / (self._learning_epochs * self._mini_batch_size))
    if self._lr_scheduler:
        if self.policy is self.value:
            self.track_data("Learning / Learning rate", self.scheduler.get_last_lr()[0])
        else:
            self.track_data("Learning / Learning rate (policy)", self.scheduler_policy.get_last_lr()[0])
            self.track_data("Learning / Learning rate (value)", self.scheduler_value.get_last_lr()[0])
2. Performance comparison
We compare the performance of the RofuncRL A2C implementation, with different tricks, against an open-source baseline (SKRL). These experiments were conducted on the Pendulum-v1 environment. The results are shown below:
2.1. Pendulum
Dark Blue: SKRL A2C
Green: Rofunc A2C with SKRL policy and value network
Pink: Rofunc A2C with self-defined policy and value network