Source code for chainerrl.agents.reinforce

from logging import getLogger
import warnings

import chainer
import chainer.functions as F
import numpy as np

import chainerrl
from chainerrl import agent
from chainerrl.recurrent import Recurrent

[docs]class REINFORCE(agent.AttributeSavingMixin, agent.Agent): """William's episodic REINFORCE. Args: model (Policy): Model to train. It must be a callable that accepts observations as input and return action distributions (Distribution). optimizer (chainer.Optimizer): optimizer used to train the model beta (float): Weight coefficient for the entropy regularizaiton term. normalize_loss_by_steps (bool): If set true, losses are normalized by the number of steps taken to accumulate the losses act_deterministically (bool): If set true, choose most probable actions in act method. batchsize (int): Number of episodes used for each update backward_separately (bool): If set true, call backward separately for each episode and accumulate only gradients. average_entropy_decay (float): Decay rate of average entropy. Used only to record statistics. batch_states (callable): Method which makes a batch of observations. default is `chainerrl.misc.batch_states` logger (logging.Logger): Logger to be used. """ saved_attributes = ['model', 'optimizer'] def __init__(self, model, optimizer, beta=0, phi=lambda x: x, batchsize=1, act_deterministically=False, average_entropy_decay=0.999, backward_separately=False, batch_states=chainerrl.misc.batch_states, logger=None): self.model = model self.xp = self.model.xp self.optimizer = optimizer self.beta = beta self.phi = phi self.batchsize = batchsize self.backward_separately = backward_separately self.act_deterministically = act_deterministically self.average_entropy_decay = average_entropy_decay self.batch_states = batch_states self.logger = logger or getLogger(__name__) # Statistics self.average_entropy = 0 self.t = 0 self.reward_sequences = [[]] self.log_prob_sequences = [[]] self.entropy_sequences = [[]] self.n_backward = 0 def act_and_train(self, obs, reward): batch_obs = self.batch_states([obs], self.xp, self.phi) action_distrib = self.model(batch_obs) batch_action = action_distrib.sample().array # Do not backprop action = chainer.cuda.to_cpu(batch_action)[0] # Save values used to compute losses self.reward_sequences[-1].append(reward) self.log_prob_sequences[-1].append( action_distrib.log_prob(batch_action)) self.entropy_sequences[-1].append( action_distrib.entropy) self.t += 1 self.logger.debug('t:%s r:%s a:%s action_distrib:%s', self.t, reward, action, action_distrib) # Update stats self.average_entropy += ( (1 - self.average_entropy_decay) * (float(action_distrib.entropy.array[0]) - self.average_entropy)) return action def act(self, obs): with chainer.no_backprop_mode(): batch_obs = self.batch_states([obs], self.xp, self.phi) action_distrib = self.model(batch_obs) if self.act_deterministically: return chainer.cuda.to_cpu( action_distrib.most_probable.array)[0] else: return chainer.cuda.to_cpu(action_distrib.sample().array)[0] def stop_episode_and_train(self, obs, reward, done=False): if not done: warnings.warn( 'Since REINFORCE supports episodic environments only, ' 'calling stop_episode_and_train with done=False will throw ' 'away the last episode.') self.reward_sequences[-1] = [] self.log_prob_sequences[-1] = [] self.entropy_sequences[-1] = [] else: self.reward_sequences[-1].append(reward) if self.backward_separately: self.accumulate_grad() if self.n_backward == self.batchsize: self.update_with_accumulated_grad() else: if len(self.reward_sequences) == self.batchsize: self.batch_update() else: # Prepare for the next episode self.reward_sequences.append([]) self.log_prob_sequences.append([]) self.entropy_sequences.append([]) if isinstance(self.model, Recurrent): self.model.reset_state() def accumulate_grad(self): if self.n_backward == 0: self.model.cleargrads() # Compute losses losses = [] for r_seq, log_prob_seq, ent_seq in zip(self.reward_sequences, self.log_prob_sequences, self.entropy_sequences): assert len(r_seq) - 1 == len(log_prob_seq) == len(ent_seq) # Convert rewards into returns (=sum of future rewards) R_seq = np.cumsum(list(reversed(r_seq[1:])))[::-1] for R, log_prob, entropy in zip(R_seq, log_prob_seq, ent_seq): loss = -R * log_prob - self.beta * entropy losses.append(loss) total_loss = chainerrl.functions.sum_arrays(losses) # When self.batchsize is future.types.newint.newint, dividing a # Variable with it will raise an error, so it is manually converted to # float here. total_loss /= float(self.batchsize) F.squeeze(total_loss).backward() self.reward_sequences = [[]] self.log_prob_sequences = [[]] self.entropy_sequences = [[]] self.n_backward += 1 def batch_update(self): assert len(self.reward_sequences) == self.batchsize assert len(self.log_prob_sequences) == self.batchsize assert len(self.entropy_sequences) == self.batchsize # Update the model assert self.n_backward == 0 self.accumulate_grad() self.optimizer.update() self.n_backward = 0 def update_with_accumulated_grad(self): assert self.n_backward == self.batchsize self.optimizer.update() self.n_backward = 0 def stop_episode(self): if isinstance(self.model, Recurrent): self.model.reset_state() def get_statistics(self): return [ ('average_entropy', self.average_entropy), ]