In this post we’ll go through how to build a custom policy for the stable-baselines library using the Keras API, and how to run it against a default Gym environment. The question is simple: if we want to use the Keras API to build the basis for a policy trained with PPO, we should be able to do so in a fairly straightforward manner.

Suppose you wanted to create a simple MLP model:

import tensorflow as tf

# self.processed_obs is the observation placeholder provided by the
# stable-baselines policy class; think of it as a Keras Input, e.g.
# flat = tf.keras.layers.Flatten()(tf.keras.Input(shape=(784,)))
flat = tf.keras.layers.Flatten()(self.processed_obs)

# policy (actor) branch
x = tf.keras.layers.Dense(64, activation="tanh", name="pi_fc_0")(flat)
pi_latent = tf.keras.layers.Dense(64, activation="tanh", name="pi_fc_1")(x)

# value (critic) branch
x1 = tf.keras.layers.Dense(64, activation="tanh", name="vf_fc_0")(flat)
vf_latent = tf.keras.layers.Dense(64, activation="tanh", name="vf_fc_1")(x1)

# scalar value estimate for the current observation
value_fn = tf.keras.layers.Dense(1, name="vf")(vf_latent)

This defines the latent representation of the policy (pi_latent) and of the value function (vf_latent). PPO is an actor-critic method, so it needs both networks, and building them side by side from the same input makes it easy to share parameters between them if you want to.
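
As a rough sketch of what such sharing could look like (the shared_fc_0 layer name is illustrative and not part of the model above), you build a hidden layer once and feed its output into both heads, so gradients from the policy loss and the value loss both update its weights:

import tensorflow as tf

# `flat` is assumed to be the flattened observation tensor from the snippet above.
shared = tf.keras.layers.Dense(64, activation="tanh", name="shared_fc_0")(flat)

# Both heads consume the same layer, so its parameters are shared.
pi_latent = tf.keras.layers.Dense(64, activation="tanh", name="pi_fc_1")(shared)
vf_latent = tf.keras.layers.Dense(64, activation="tanh", name="vf_fc_1")(shared)

value_fn = tf.keras.layers.Dense(1, name="vf")(vf_latent)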

These latents are then wired into the policy object, which additionally requires us to define the probability distribution over actions, the policy output and the Q-values:

from stable_baselines.common.distributions import make_proba_dist_type

# The distribution type is derived from the action space; inside a
# stable-baselines policy it is already available as `self.pdtype`.
pdtype = make_proba_dist_type(ac_space)
proba_distribution, policy, q_value = pdtype.proba_distribution_from_latent(
    pi_latent, vf_latent, init_scale=0.01
)
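
To make the distribution type concrete (this snippet is purely illustrative and is not part of the policy class below): CartPole has a Discrete(2) action space, so the resulting distribution is a categorical one over the two actions.

import gym

from stable_baselines.common.distributions import make_proba_dist_type

ac_space = gym.make("CartPole-v1").action_space  # Discrete(2)
pdtype = make_proba_dist_type(ac_space)  # a categorical distribution type
print(type(pdtype).__name__)

In the full policy below you never call make_proba_dist_type yourself; the base policy class builds self.pdtype from the action space passed to its constructor.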

Putting this together with a little boilerplate (the step, proba_step and value methods that ActorCriticPolicy expects) gives us a complete policy that can be plugged into a stable-baselines RL algorithm.

A full example using CartPole is shown below.

import tensorflow as tf

from stable_baselines import PPO2
from stable_baselines.common.policies import ActorCriticPolicy


class KerasPolicy(ActorCriticPolicy):
    def __init__(
        self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **kwargs
    ):
        super(KerasPolicy, self).__init__(
            sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse, scale=False
        )

        with tf.variable_scope("model", reuse=reuse):
            # flatten the observation tensor provided by the base policy class
            flat = tf.keras.layers.Flatten()(self.processed_obs)

            # policy (actor) branch
            x = tf.keras.layers.Dense(64, activation="tanh", name="pi_fc_0")(flat)
            pi_latent = tf.keras.layers.Dense(64, activation="tanh", name="pi_fc_1")(x)

            # value (critic) branch
            x1 = tf.keras.layers.Dense(64, activation="tanh", name="vf_fc_0")(flat)
            vf_latent = tf.keras.layers.Dense(64, activation="tanh", name="vf_fc_1")(x1)

            value_fn = tf.keras.layers.Dense(1, name="vf")(vf_latent)

            self._proba_distribution, self._policy, self.q_value = self.pdtype.proba_distribution_from_latent(
                pi_latent, vf_latent, init_scale=0.01
            )

        self._value_fn = value_fn
        # initial_state is only needed for recurrent policies; for a feedforward
        # policy the base class already returns None.
        self._setup_init()

    def step(self, obs, state=None, mask=None, deterministic=False):
        if deterministic:
            action, value, neglogp = self.sess.run(
                [self.deterministic_action, self.value_flat, self.neglogp],
                {self.obs_ph: obs},
            )
        else:
            action, value, neglogp = self.sess.run(
                [self.action, self.value_flat, self.neglogp], {self.obs_ph: obs}
            )
        return action, value, self.initial_state, neglogp

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, {self.obs_ph: obs})

    def value(self, obs, state=None, mask=None):
        return self.sess.run(self.value_flat, {self.obs_ph: obs})


model = PPO2(KerasPolicy, "CartPole-v1", verbose=1)
model.learn(25000)

# model.get_env() returns the vectorized training environment, so obs,
# reward and done below are batched with a single entry each.
env = model.get_env()
obs = env.reset()

reward_sum = 0.0
for _ in range(1000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    reward_sum += reward
    env.render()
    if done:
        print("Reward: ", reward_sum)
        reward_sum = 0.0
        obs = env.reset()

env.close()
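
PPO2 accepts the environment id as a string and wraps it in a vectorized environment for you. If you would rather construct the environment yourself, for example to add wrappers, the equivalent explicit setup looks roughly like this (assuming the KerasPolicy class defined above):

import gym

from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

# Build the Gym environment explicitly and wrap it in a DummyVecEnv,
# which is roughly what PPO2 does internally when given "CartPole-v1".
env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
model = PPO2(KerasPolicy, env, verbose=1)

At prediction time you can also pass deterministic=True to model.predict, which takes the mode of the action distribution instead of sampling and exercises the deterministic branch of the step method above.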