"""
Custom policies for the Cycle-of-Learning.
"""
import gym
import tensorflow as tf
import numpy as np
from stable_baselines.common.policies import BasePolicy, nature_cnn, register_policy
from stable_baselines.ddpg.policies import FeedForwardPolicy, DDPGPolicy
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines import DDPG
from stable_baselines.common.vec_env import DummyVecEnv
from ddpg_col import DDPG_CoL


class FeedForwardPolicyDropout(DDPGPolicy):
    """
    Policy object that implements a DDPG-like actor critic, using a feed forward neural network,
    with dropout in the actor so it can be enabled at test time.

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batches to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param layers: ([int]) The size of the neural network for the policy (if None, defaults to [64, 64])
    :param cnn_extractor: (function (TensorFlow Tensor, ``**kwargs``): (TensorFlow Tensor)) the CNN feature extraction
    :param feature_extraction: (str) The feature extraction type ("cnn" or "mlp")
    :param layer_norm: (bool) enable layer normalisation
    :param act_fun: (tf.func) the activation function to use in the neural network
    :param kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, layers=None,
                 cnn_extractor=nature_cnn, feature_extraction="cnn",
                 layer_norm=False, act_fun=tf.nn.relu, **kwargs):
        super(FeedForwardPolicyDropout, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=reuse,
                                                       scale=(feature_extraction == "cnn"))
        self._kwargs_check(feature_extraction, kwargs)
        self.layer_norm = layer_norm
        self.feature_extraction = feature_extraction
        self.cnn_kwargs = kwargs
        self.cnn_extractor = cnn_extractor
        self.reuse = reuse
        self._qvalue = None
        if layers is None:
            layers = [64, 64]
        self.layers = layers

        assert len(layers) >= 1, "Error: must have at least one hidden layer for the policy."

        self.activ = act_fun

    def make_actor(self, obs=None, reuse=False, scope="pi"):
        if obs is None:
            obs = self.processed_obs

        with tf.variable_scope(scope, reuse=reuse):
            # setup dropout and mechanisms to enable it during test time only:
            # the placeholder defaults to 0.0 (no dropout) unless a dropout rate is fed explicitly
            self.prob_dropout_ph = tf.placeholder_with_default(0.0, shape=(), name='prob_dropout_ph')

            if self.feature_extraction == "cnn":
                pi_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                pi_h = tf.layers.flatten(obs)
            for i, layer_size in enumerate(self.layers):
                pi_h = tf.nn.dropout(pi_h, rate=self.prob_dropout_ph)
                pi_h = tf.layers.dense(pi_h, layer_size, name='fc' + str(i))
                if self.layer_norm:
                    pi_h = tf.contrib.layers.layer_norm(pi_h, center=True, scale=True)
                pi_h = self.activ(pi_h)
            pi_h = tf.nn.dropout(pi_h, rate=self.prob_dropout_ph)
            self.policy = tf.nn.tanh(tf.layers.dense(pi_h, self.ac_space.shape[0], name=scope,
                                                     kernel_initializer=tf.random_uniform_initializer(minval=-3e-3,
                                                                                                      maxval=3e-3)))
        return self.policy

    def make_critic(self, obs=None, action=None, reuse=False, scope="qf"):
        if obs is None:
            obs = self.processed_obs
        if action is None:
            action = self.action_ph

        with tf.variable_scope(scope, reuse=reuse):
            if self.feature_extraction == "cnn":
                qf_h = self.cnn_extractor(obs, **self.cnn_kwargs)
            else:
                qf_h = tf.layers.flatten(obs)
            for i, layer_size in enumerate(self.layers):
                qf_h = tf.layers.dense(qf_h, layer_size, name='fc' + str(i))
                if self.layer_norm:
                    qf_h = tf.contrib.layers.layer_norm(qf_h, center=True, scale=True)
                qf_h = self.activ(qf_h)
                if i == 0:
                    qf_h = tf.concat([qf_h, action], axis=-1)

            # the name attribute is used in pop-art normalization
            qvalue_fn = tf.layers.dense(qf_h, 1, name='qf_output',
                                        kernel_initializer=tf.random_uniform_initializer(minval=-3e-3,
                                                                                         maxval=3e-3))
            self.qvalue_fn = qvalue_fn
            self._qvalue = qvalue_fn[:, 0]
        return self.qvalue_fn

    def step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy, {self.obs_ph: obs})

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy, {self.obs_ph: obs})

    def value(self, obs, action, state=None, mask=None):
        return self.sess.run(self._qvalue, {self.obs_ph: obs, self.action_ph: action})


class MlpPolicyDropout(FeedForwardPolicyDropout):
    """
    Policy object that implements actor critic, using an MLP (2 layers of 64), with dropout in the actor.

    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batches to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        super(MlpPolicyDropout, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                               feature_extraction="mlp", **_kwargs)


def dump(obj):
    """ Prints attributes of any object."""
    for attr in dir(obj):
        print("obj.%s = %r" % (attr, getattr(obj, attr)))


if __name__ == "__main__":
    # create and wrap the environment
    env = DummyVecEnv([lambda: gym.make('LunarLanderContinuous-v2')])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))

    # create model using custom policy
    model = DDPG(MlpPolicyDropout, env, verbose=2, param_noise=param_noise, action_noise=action_noise)
    model.learn(total_timesteps=1000)

    # test dropout: query the trained actor repeatedly for the same random observation
    print('****** Testing dropout ******')
    obs = np.random.rand(8)  # LunarLanderContinuous-v2 observations are 8-dimensional
    for i in range(20):
        action, qval = model._policy(obs, apply_noise=False, compute_q=True)
        print(f'action: {action} | qval: {qval[0]}')
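
    # A minimal sketch of feeding the dropout rate explicitly: the loop above never feeds
    # prob_dropout_ph, so the placeholder keeps its 0.0 default and the actor runs without
    # dropout. To sample actions with dropout active at test time, run the actor tensor
    # directly and feed the placeholder. This assumes stable-baselines' DDPG exposes the
    # step policy instance as `model.policy_tf` and the session as `model.sess`; adjust the
    # attribute names if your version differs.
    print('****** Testing dropout with prob_dropout_ph fed explicitly ******')
    policy = model.policy_tf
    for i in range(5):
        dropout_action = model.sess.run(
            policy.policy,
            {policy.obs_ph: obs.reshape(1, -1),   # same random observation as above
             policy.prob_dropout_ph: 0.1})        # assumed 10% dropout rate at inference
        print(f'action with dropout: {dropout_action}')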