from logging import getLogger
import warnings

import chainer
import chainer.functions as F
import numpy as np

import chainerrl
from chainerrl import agent
from chainerrl.recurrent import Recurrent


class REINFORCE(agent.AttributeSavingMixin, agent.Agent):
    """Williams' episodic REINFORCE.

    Args:
        model (Policy): Model to train. It must be a callable that accepts
            observations as input and returns action distributions
            (Distribution).
        optimizer (chainer.Optimizer): Optimizer used to train the model.
        beta (float): Weight coefficient for the entropy regularization term.
        phi (callable): Feature extractor applied to each observation before
            it is passed to the model.
        normalize_loss_by_steps (bool): If set true, losses are normalized by
            the number of steps taken to accumulate the losses.
        act_deterministically (bool): If set true, choose the most probable
            actions in the act method.
        batchsize (int): Number of episodes used for each update.
        backward_separately (bool): If set true, call backward separately for
            each episode and accumulate only gradients.
        average_entropy_decay (float): Decay rate of average entropy. Used
            only to record statistics.
        batch_states (callable): Method which makes a batch of observations.
            Default is `chainerrl.misc.batch_states`.
        logger (logging.Logger): Logger to be used.
    """
    saved_attributes = ['model', 'optimizer']

    def __init__(self, model, optimizer,
                 beta=0,
                 phi=lambda x: x,
                 batchsize=1,
                 act_deterministically=False,
                 average_entropy_decay=0.999,
                 backward_separately=False,
                 batch_states=chainerrl.misc.batch_states,
                 logger=None):
        self.model = model
        self.xp = self.model.xp
        self.optimizer = optimizer
        self.beta = beta
        self.phi = phi
        self.batchsize = batchsize
        self.backward_separately = backward_separately
        self.act_deterministically = act_deterministically
        self.average_entropy_decay = average_entropy_decay
        self.batch_states = batch_states
        self.logger = logger or getLogger(__name__)

        # Statistics
        self.average_entropy = 0

        self.t = 0
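        # Per-episode buffers: for every episode in the current batch, the
        # rewards received, the log-probabilities of the chosen actions and
        # the policy entropies are kept until the next parameter update.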
        self.reward_sequences = [[]]
        self.log_prob_sequences = [[]]
        self.entropy_sequences = [[]]
        self.n_backward = 0

    def act_and_train(self, obs, reward):
        batch_obs = self.batch_states([obs], self.xp, self.phi)
        action_distrib = self.model(batch_obs)
        batch_action = action_distrib.sample().array  # Do not backprop
        action = chainer.cuda.to_cpu(batch_action)[0]

        # Save values used to compute losses
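        # Note: `reward` is the reward produced by the previous action, so at
        # the end of an episode each buffer holds one more reward than
        # log-probabilities (see the assertion in accumulate_grad).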
        self.reward_sequences[-1].append(reward)
        self.log_prob_sequences[-1].append(
            action_distrib.log_prob(batch_action))
        self.entropy_sequences[-1].append(
            action_distrib.entropy)

        self.t += 1

        self.logger.debug('t:%s r:%s a:%s action_distrib:%s',
                          self.t, reward, action, action_distrib)

        # Update stats
        self.average_entropy += (
            (1 - self.average_entropy_decay) *
            (float(action_distrib.entropy.array[0]) - self.average_entropy))

        return action

    def act(self, obs):
        with chainer.no_backprop_mode():
            batch_obs = self.batch_states([obs], self.xp, self.phi)
            action_distrib = self.model(batch_obs)
            if self.act_deterministically:
                return chainer.cuda.to_cpu(
                    action_distrib.most_probable.array)[0]
            else:
                return chainer.cuda.to_cpu(action_distrib.sample().array)[0]

    def stop_episode_and_train(self, obs, reward, done=False):
        if not done:
            warnings.warn(
                'Since REINFORCE supports episodic environments only, '
                'calling stop_episode_and_train with done=False will throw '
                'away the last episode.')
            self.reward_sequences[-1] = []
            self.log_prob_sequences[-1] = []
            self.entropy_sequences[-1] = []
        else:
            self.reward_sequences[-1].append(reward)
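            # Either backprop this episode immediately and keep accumulating
            # gradients (backward_separately=True), or wait until `batchsize`
            # episodes are buffered and build one loss over all of them.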
            if self.backward_separately:
                self.accumulate_grad()
                if self.n_backward == self.batchsize:
                    self.update_with_accumulated_grad()
            else:
                if len(self.reward_sequences) == self.batchsize:
                    self.batch_update()
                else:
                    # Prepare for the next episode
                    self.reward_sequences.append([])
                    self.log_prob_sequences.append([])
                    self.entropy_sequences.append([])
        if isinstance(self.model, Recurrent):
            self.model.reset_state()

    def accumulate_grad(self):
        if self.n_backward == 0:
            self.model.cleargrads()
        # Compute losses
        losses = []
        for r_seq, log_prob_seq, ent_seq in zip(self.reward_sequences,
                                                self.log_prob_sequences,
                                                self.entropy_sequences):
            assert len(r_seq) - 1 == len(log_prob_seq) == len(ent_seq)
            # Convert rewards into returns (=sum of future rewards)
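            # r_seq[0] is the reward that preceded the first stored action, so
            # it is dropped; the reversed cumulative sum then gives, for each
            # step t, the (undiscounted) return R_t = r_{t+1} + ... + r_T.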
            R_seq = np.cumsum(list(reversed(r_seq[1:])))[::-1]
            for R, log_prob, entropy in zip(R_seq, log_prob_seq, ent_seq):
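                # Gradient ascent on R * log pi(a|s) plus an entropy bonus,
                # written as a loss to be minimized.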
                loss = -R * log_prob - self.beta * entropy
                losses.append(loss)
        total_loss = chainerrl.functions.sum_arrays(losses)
        # When self.batchsize is future.types.newint.newint, dividing a
        # Variable with it will raise an error, so it is manually converted to
        # float here.
        total_loss /= float(self.batchsize)
        F.squeeze(total_loss).backward()
        self.reward_sequences = [[]]
        self.log_prob_sequences = [[]]
        self.entropy_sequences = [[]]
        self.n_backward += 1

    def batch_update(self):
        assert len(self.reward_sequences) == self.batchsize
        assert len(self.log_prob_sequences) == self.batchsize
        assert len(self.entropy_sequences) == self.batchsize
        # Update the model
        assert self.n_backward == 0
        self.accumulate_grad()
        self.optimizer.update()
        self.n_backward = 0

    def update_with_accumulated_grad(self):
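        # Gradients for `batchsize` episodes have already been accumulated by
        # repeated accumulate_grad calls, so a single optimizer step suffices.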
        assert self.n_backward == self.batchsize
        self.optimizer.update()
        self.n_backward = 0

    def stop_episode(self):
        if isinstance(self.model, Recurrent):
            self.model.reset_state()

    def get_statistics(self):
        return [
            ('average_entropy', self.average_entropy),
        ]
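

# ----------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal training loop for
# this agent. It assumes OpenAI Gym's CartPole-v0 with the classic
# reset()/step() API; FCSoftmaxPolicy, Adam and the hyperparameters below are
# illustrative choices only.
if __name__ == '__main__':
    import gym
    from chainer import optimizers
    from chainerrl.policies import FCSoftmaxPolicy

    env = gym.make('CartPole-v0')
    obs_size = env.observation_space.low.size
    n_actions = env.action_space.n

    policy = FCSoftmaxPolicy(obs_size, n_actions,
                             n_hidden_layers=2, n_hidden_channels=64)
    opt = optimizers.Adam()
    opt.setup(policy)

    # Cast observations to float32 so they match the model's parameter dtype.
    agent = REINFORCE(policy, opt, batchsize=10, beta=1e-2,
                      phi=lambda x: x.astype(np.float32, copy=False))

    for episode in range(200):
        obs = env.reset()
        reward = 0.0
        done = False
        while not done:
            action = agent.act_and_train(obs, reward)
            obs, reward, done, _ = env.step(action)
        # Feeding the final transition triggers a parameter update once
        # `batchsize` episodes have been collected.
        agent.stop_episode_and_train(obs, reward, done=done)
        print(episode, agent.get_statistics())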