Source code for chainerrl.agents.pcl

from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import absolute_import
from builtins import *  # NOQA
from future import standard_library
standard_library.install_aliases()

import copy
from logging import getLogger

import chainer
from chainer import functions as F

import chainerrl
from chainerrl import agent
from chainerrl.agents import a3c
from chainerrl.misc import async
from chainerrl.misc.batch_states import batch_states
from chainerrl.misc import copy_param
from chainerrl.recurrent import Recurrent
from chainerrl.recurrent import state_kept
from chainerrl.recurrent import state_reset
from chainerrl.replay_buffer import batch_experiences


def asfloat(x):
    if isinstance(x, chainer.Variable):
        return float(x.data)
    else:
        return float(x)


PCLSeparateModel = a3c.A3CSeparateModel
PCLSharedModel = a3c.A3CSharedModel

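# Example model (illustrative sketch): PCL expects ``model`` to be a callable
# that maps a batch of observations to a pair (action distribution, state
# value). The class below shows one way to satisfy that interface; its name,
# layer sizes, and the use of SoftmaxDistribution are illustrative assumptions
# for a small discrete-action task, not requirements of the agent.
# PCLSeparateModel/PCLSharedModel can be used instead when separate policy and
# value links are available.
from chainer import links as L
from chainerrl.distribution import SoftmaxDistribution


class ExamplePCLModel(chainer.Chain):
    """Feed-forward model returning (SoftmaxDistribution, state value)."""

    def __init__(self, obs_size, n_actions, n_hidden=64):
        super().__init__()
        with self.init_scope():
            self.hidden = L.Linear(obs_size, n_hidden)
            self.pi_logits = L.Linear(n_hidden, n_actions)
            self.v = L.Linear(n_hidden, 1)

    def __call__(self, obs):
        h = F.relu(self.hidden(obs))
        # Policy head -> categorical action distribution; value head -> V(s)
        return SoftmaxDistribution(self.pi_logits(h)), self.v(h)
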

class PCL(agent.AttributeSavingMixin, agent.AsyncAgent):
    """PCL (Path Consistency Learning).

    Not only the batch PCL algorithm proposed in the paper but also its
    asynchronous variant is implemented.

    See https://arxiv.org/abs/1702.08892

    Args:
        model (chainer.Link): Model to train. It must be a callable that
            accepts a batch of observations as input and returns two values:

            - action distributions (Distribution)
            - state values (chainer.Variable)

        optimizer (chainer.Optimizer): Optimizer used to train the model
        t_max (int or None): The model is updated after every t_max local
            steps. If set None, the model is updated after every episode.
        gamma (float): Discount factor [0,1]
        tau (float): Weight coefficient for the entropy regularization term.
        phi (callable): Feature extractor function
        pi_loss_coef (float): Weight coefficient for the loss of the policy
        v_loss_coef (float): Weight coefficient for the loss of the value
            function
        rollout_len (int): Number of rollout steps
        batchsize (int): Number of episodes or sub-trajectories used for an
            update. The total number of transitions used will be
            (batchsize x t_max).
        disable_online_update (bool): If set True, disable online on-policy
            update and rely only on experience replay.
        n_times_replay (int): Number of times experience replay is repeated
            per one time of online update.
        replay_start_size (int): Experience replay is disabled if the number
            of transitions in the replay buffer is lower than this value.
        normalize_loss_by_steps (bool): If set True, losses are normalized by
            the number of steps taken to accumulate the losses
        act_deterministically (bool): If set True, choose most probable
            actions in the act method.
        average_loss_decay (float): Decay rate of average loss. Used only to
            record statistics.
        average_entropy_decay (float): Decay rate of average entropy. Used
            only to record statistics.
        average_value_decay (float): Decay rate of average value. Used only
            to record statistics.
        explorer (Explorer or None): If not None, this explorer is used for
            selecting actions.
        logger (None or Logger): Logger to be used
        batch_states (callable): Method which makes a batch of observations.
            Default is `chainerrl.misc.batch_states.batch_states`.
        backprop_future_values (bool): If set True, value gradients are
            computed not only wrt V(s_t) but also V(s_{t+d}).
        train_async (bool): If set True, use a process-local model to compute
            gradients and update the globally shared model.
    """
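
    # For reference: compute_loss below penalizes the squared d-step soft
    # path consistency error, with d = min(rollout_len, remaining steps):
    #
    #   C(s_t, d) = -V(s_t) + gamma**d * V(s_{t+d})
    #               + sum_{i=0}^{d-1} gamma**i * (r_{t+i} - tau * log pi(a_{t+i}|s_{t+i}))
    #
    # The policy loss (C_pi) backpropagates only through log pi and the value
    # loss (C_v) only through V; see the .data stop-gradients in compute_loss.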
""" process_idx = None saved_attributes = ['model', 'optimizer'] shared_attributes = ['shared_model', 'optimizer'] def __init__(self, model, optimizer, replay_buffer=None, t_max=None, gamma=0.99, tau=1e-2, phi=lambda x: x, pi_loss_coef=1.0, v_loss_coef=0.5, rollout_len=10, batchsize=1, disable_online_update=False, n_times_replay=1, replay_start_size=10 ** 2, normalize_loss_by_steps=True, act_deterministically=False, average_loss_decay=0.999, average_entropy_decay=0.999, average_value_decay=0.999, explorer=None, logger=None, batch_states=batch_states, backprop_future_values=True, train_async=False): if train_async: # Globally shared model self.shared_model = model # Thread specific model self.model = copy.deepcopy(self.shared_model) async.assert_params_not_shared(self.shared_model, self.model) else: self.model = model self.xp = self.model.xp self.optimizer = optimizer self.replay_buffer = replay_buffer self.t_max = t_max self.gamma = gamma self.tau = tau self.phi = phi self.pi_loss_coef = pi_loss_coef self.v_loss_coef = v_loss_coef self.rollout_len = rollout_len if not self.xp.isscalar(batchsize): batchsize = self.xp.int32(batchsize) """Fix Chainer Issue #2807 batchsize should (look to) be scalar. """ self.batchsize = batchsize self.normalize_loss_by_steps = normalize_loss_by_steps self.act_deterministically = act_deterministically self.disable_online_update = disable_online_update self.n_times_replay = n_times_replay self.replay_start_size = replay_start_size self.average_loss_decay = average_loss_decay self.average_value_decay = average_value_decay self.average_entropy_decay = average_entropy_decay self.logger = logger if logger else getLogger(__name__) self.batch_states = batch_states self.backprop_future_values = backprop_future_values self.train_async = train_async self.t = 0 self.last_state = None self.last_action = None self.explorer = explorer self.online_batch_losses = [] # Stats self.average_loss = 0 self.average_value = 0 self.average_entropy = 0 self.init_history_data_for_online_update() def init_history_data_for_online_update(self): self.past_actions = {} self.past_rewards = {} self.past_values = {} self.past_action_distrib = {} self.t_start = self.t def sync_parameters(self): copy_param.copy_param(target_link=self.model, source_link=self.shared_model) def compute_loss(self, t_start, t_stop, rewards, values, next_values, log_probs): seq_len = t_stop - t_start assert len(rewards) == seq_len assert len(values) == seq_len assert len(next_values) == seq_len assert len(log_probs) == seq_len pi_losses = [] v_losses = [] for t in range(t_start, t_stop): d = min(t_stop - t, self.rollout_len) # Discounted sum of immediate rewards R_seq = sum(self.gamma ** i * rewards[t + i] for i in range(d)) # Discounted sum of log likelihoods G = chainerrl.functions.weighted_sum_arrays( xs=[log_probs[t + i] for i in range(d)], weights=[self.gamma ** i for i in range(d)]) G = F.expand_dims(G, -1) last_v = next_values[t + d - 1] if not self.backprop_future_values: last_v = chainer.Variable(last_v.data) # C_pi only backprop through pi C_pi = (- values[t].data + self.gamma ** d * last_v.data + R_seq - self.tau * G) # C_v only backprop through v C_v = (- values[t] + self.gamma ** d * last_v + R_seq - self.tau * G.data) pi_losses.append(C_pi ** 2) v_losses.append(C_v ** 2) pi_loss = chainerrl.functions.sum_arrays(pi_losses) / 2 v_loss = chainerrl.functions.sum_arrays(v_losses) / 2 # Re-scale pi loss so that it is independent from tau pi_loss /= self.tau pi_loss *= self.pi_loss_coef v_loss *= 

        if self.normalize_loss_by_steps:
            pi_loss /= seq_len
            v_loss /= seq_len

        if self.process_idx == 0:
            self.logger.debug('pi_loss:%s v_loss:%s',
                              pi_loss.data, v_loss.data)

        return pi_loss + F.reshape(v_loss, pi_loss.data.shape)

    def update(self, loss):

        self.average_loss += (
            (1 - self.average_loss_decay) *
            (asfloat(loss) - self.average_loss))

        # Compute gradients using thread-specific model
        self.model.zerograds()
        loss.backward()
        if self.train_async:
            # Copy the gradients to the globally shared model
            self.shared_model.zerograds()
            copy_param.copy_grad(
                target_link=self.shared_model, source_link=self.model)
            if self.process_idx == 0:
                xp = self.xp
                norm = sum(xp.sum(xp.square(param.grad))
                           for param in self.optimizer.target.params())
                self.logger.debug('grad norm:%s', norm)
        self.optimizer.update()

        if self.train_async:
            self.sync_parameters()
        if isinstance(self.model, Recurrent):
            self.model.unchain_backward()

    def update_from_replay(self):

        if self.replay_buffer is None:
            return

        if len(self.replay_buffer) < self.replay_start_size:
            return

        if self.process_idx == 0:
            self.logger.debug('update_from_replay')

        episodes = self.replay_buffer.sample_episodes(
            self.batchsize, max_len=self.t_max)
        if isinstance(episodes, tuple):
            # Prioritized replay
            episodes, weights = episodes
        else:
            weights = [1] * len(episodes)
        sorted_episodes = list(reversed(sorted(episodes, key=len)))
        max_epi_len = len(sorted_episodes[0])

        with state_reset(self.model):
            # Batch computation of multiple episodes
            rewards = {}
            values = {}
            next_values = {}
            log_probs = {}
            next_action_distrib = None
            next_v = None
            for t in range(max_epi_len):
                transitions = []
                for ep in sorted_episodes:
                    if len(ep) <= t:
                        break
                    transitions.append(ep[t])
                batch = batch_experiences(transitions,
                                          xp=self.xp,
                                          phi=self.phi,
                                          batch_states=self.batch_states)
                batchsize = batch['action'].shape[0]
                if next_action_distrib is not None:
                    action_distrib = next_action_distrib[0:batchsize]
                    v = next_v[0:batchsize]
                else:
                    action_distrib, v = self.model(batch['state'])
                next_action_distrib, next_v = self.model(batch['next_state'])
                values[t] = v
                next_values[t] = next_v * \
                    (1 - batch['is_state_terminal'].reshape(next_v.shape))
                rewards[t] = chainer.cuda.to_cpu(batch['reward'])
                log_probs[t] = action_distrib.log_prob(batch['action'])
            # Losses are computed episode by episode
            losses = []
            for i, ep in enumerate(sorted_episodes):
                e_values = {}
                e_next_values = {}
                e_rewards = {}
                e_log_probs = {}
                for t in range(len(ep)):
                    assert values[t].shape[0] > i
                    assert next_values[t].shape[0] > i
                    assert rewards[t].shape[0] > i
                    assert log_probs[t].shape[0] > i
                    e_values[t] = values[t][i:i + 1]
                    e_next_values[t] = next_values[t][i:i + 1]
                    e_rewards[t] = float(rewards[t][i:i + 1])
                    e_log_probs[t] = log_probs[t][i:i + 1]
                losses.append(self.compute_loss(
                    t_start=0,
                    t_stop=len(ep),
                    rewards=e_rewards,
                    values=e_values,
                    next_values=e_next_values,
                    log_probs=e_log_probs))
            loss = chainerrl.functions.weighted_sum_arrays(
                losses, weights) / self.batchsize
            self.update(loss)

    def update_on_policy(self, statevar):
        assert self.t_start < self.t

        if not self.disable_online_update:
            next_values = {}
            for t in range(self.t_start + 1, self.t):
                next_values[t - 1] = self.past_values[t]
            if statevar is None:
                next_values[self.t - 1] = chainer.Variable(
                    self.xp.zeros_like(self.past_values[self.t - 1].data))
            else:
                with state_kept(self.model):
                    _, v = self.model(statevar)
                next_values[self.t - 1] = v
            log_probs = {t: self.past_action_distrib[t].log_prob(
                self.xp.asarray(self.xp.expand_dims(a, 0)))
                for t, a in self.past_actions.items()}
            self.online_batch_losses.append(self.compute_loss(
                t_start=self.t_start, t_stop=self.t,
                rewards=self.past_rewards,
                values=self.past_values,
                next_values=next_values,
                log_probs=log_probs))
            if len(self.online_batch_losses) == self.batchsize:
                loss = chainerrl.functions.sum_arrays(
                    self.online_batch_losses) / self.batchsize
                self.update(loss)
                self.online_batch_losses = []

        self.init_history_data_for_online_update()

    def act_and_train(self, state, reward):

        statevar = self.batch_states([state], self.xp, self.phi)

        if self.last_state is not None:
            self.past_rewards[self.t - 1] = reward

        if self.t - self.t_start == self.t_max:
            self.update_on_policy(statevar)
            if len(self.online_batch_losses) == 0:
                for _ in range(self.n_times_replay):
                    self.update_from_replay()

        action_distrib, v = self.model(statevar)
        action = chainer.cuda.to_cpu(action_distrib.sample().data)[0]
        if self.explorer is not None:
            action = self.explorer.select_action(self.t, lambda: action)

        # Save values for a later update
        self.past_values[self.t] = v
        self.past_actions[self.t] = action
        self.past_action_distrib[self.t] = action_distrib

        self.t += 1

        if self.process_idx == 0:
            self.logger.debug(
                't:%s r:%s a:%s action_distrib:%s v:%s',
                self.t, reward, action, action_distrib, float(v.data))
        # Update stats
        self.average_value += (
            (1 - self.average_value_decay) *
            (float(v.data[0]) - self.average_value))
        self.average_entropy += (
            (1 - self.average_entropy_decay) *
            (float(action_distrib.entropy.data[0]) - self.average_entropy))

        if self.last_state is not None:
            assert self.last_action is not None
            assert self.last_action_distrib is not None
            # Add a transition to the replay buffer
            self.replay_buffer.append(
                state=self.last_state,
                action=self.last_action,
                reward=reward,
                next_state=state,
                next_action=action,
                is_state_terminal=False,
                mu=self.last_action_distrib,
            )

        self.last_state = state
        self.last_action = action
        self.last_action_distrib = action_distrib.copy()

        return action

    def act(self, obs):
        # Use the process-local model for acting
        with chainer.no_backprop_mode():
            statevar = self.batch_states([obs], self.xp, self.phi)
            action_distrib, _ = self.model(statevar)
            if self.act_deterministically:
                return chainer.cuda.to_cpu(
                    action_distrib.most_probable.data)[0]
            else:
                return chainer.cuda.to_cpu(action_distrib.sample().data)[0]

    def stop_episode_and_train(self, state, reward, done=False):
        assert self.last_state is not None
        assert self.last_action is not None

        self.past_rewards[self.t - 1] = reward
        if done:
            self.update_on_policy(None)
        else:
            statevar = self.batch_states([state], self.xp, self.phi)
            self.update_on_policy(statevar)
        if len(self.online_batch_losses) == 0:
            for _ in range(self.n_times_replay):
                self.update_from_replay()

        if isinstance(self.model, Recurrent):
            self.model.reset_state()

        # Add a transition to the replay buffer
        self.replay_buffer.append(
            state=self.last_state,
            action=self.last_action,
            reward=reward,
            next_state=state,
            next_action=self.last_action,
            is_state_terminal=done,
            mu=self.last_action_distrib)
        self.replay_buffer.stop_current_episode()

        self.last_state = None
        self.last_action = None
        self.last_action_distrib = None

    def stop_episode(self):
        if isinstance(self.model, Recurrent):
            self.model.reset_state()

    def load(self, dirname):
        super().load(dirname)
        if self.train_async:
            copy_param.copy_param(target_link=self.shared_model,
                                  source_link=self.model)

    def get_statistics(self):
        return [
            ('average_loss', self.average_loss),
            ('average_value', self.average_value),
            ('average_entropy', self.average_entropy),
        ]
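
# Illustrative usage sketch: wiring the agent together with the ExamplePCLModel
# defined above. The environment (OpenAI Gym's CartPole-v0), the replay buffer
# capacity, and all hyperparameters are assumptions chosen for a quick demo,
# not values prescribed by the module.
if __name__ == '__main__':
    import gym
    import numpy as np

    env = gym.make('CartPole-v0')
    model = ExamplePCLModel(obs_size=4, n_actions=2)
    opt = chainer.optimizers.Adam(alpha=1e-3)
    opt.setup(model)
    # An episodic buffer provides the sample_episodes interface that
    # update_from_replay relies on.
    rbuf = chainerrl.replay_buffer.EpisodicReplayBuffer(10 ** 4)
    agent = PCL(model, opt, replay_buffer=rbuf,
                gamma=0.99, tau=1e-2, rollout_len=10,
                phi=lambda x: np.asarray(x, dtype=np.float32))

    for episode in range(20):
        obs = env.reset()
        reward = 0.0
        done = False
        while not done:
            # act_and_train records the transition and triggers updates
            action = agent.act_and_train(obs, reward)
            obs, reward, done, _ = env.step(action)
        agent.stop_episode_and_train(obs, reward, done)
        print(agent.get_statistics())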