# SelfDriving/dqn.py
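"""Deep Q-Network (DQN) agent for image-based Gym environments.

Combines experience replay, a periodically synced target network, epsilon-greedy
exploration with linear decay, and frame stacking/skipping, trained with Adam on
the temporal-difference error.
"""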
from __future__ import generator_stop
from exp_replay import ExperienceReplay
import numpy as np
import tensorflow as tf
from processimage import processimage
class DQN:
def __init__(self,
env,
batchsize=64,
pic_size=(96, 96),
num_frame_stack=3,
gamma=0.95,
frame_skip=3,
train_freq=3,
initial_epsilon=1,
min_epsilon=0.05,
render=False,
epsilon_decay_steps=int(100000),
min_experience_size=int(1000),
experience_capacity=int(100000),
target_network_update_freq=1000,
regularization = 1e-6,
optimizer_params = None,
action_map=None
):
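        # Replay buffer used during training; stores stacked frames so mini-batches
        # can be sampled independently of the current episode.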
self.exp_history = ExperienceReplay(
num_frame_stack,
capacity=experience_capacity,
pic_size=pic_size
)
# in playing mode we don't store the experience to agent history
# but this cache is still needed to get the current frame stack
self.playing_cache = ExperienceReplay(
num_frame_stack,
capacity=num_frame_stack * 5 + 10,
pic_size=pic_size
)
if action_map is not None:
self.dim_actions = len(action_map)
else:
self.dim_actions = env.action_space.n
self.target_network_update_freq = target_network_update_freq
self.action_map = action_map
self.env = env
self.batchsize = batchsize
self.num_frame_stack = num_frame_stack
self.gamma = gamma
self.frame_skip = frame_skip
self.train_freq = train_freq
self.initial_epsilon = initial_epsilon
self.min_epsilon = min_epsilon
self.epsilon_decay_steps = epsilon_decay_steps
self.render = render
self.min_experience_size = min_experience_size
self.pic_size = pic_size
self.regularization = regularization
        # Learning-rate schedule and default optimizer settings; these values work well with Adam.
self.global_step = tf.Variable(0, trainable=False)
self.increment_global_step_op = tf.assign(self.global_step, self.global_step+1)
self.decayed_lr = tf.train.exponential_decay(0.001, self.global_step, 200000, 0.7, staircase=False)
lr = self.decayed_lr
# lr = 0.001
self.optimizer_params = optimizer_params or dict(learning_rate=lr, epsilon=1e-7)
self.do_training = True
self.playing_epsilon = 0.0
self.session = None
self.state_size = (self.num_frame_stack,) + self.pic_size
self.global_counter = 0
self.episode_counter = 0
def build_graph(self):
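        # Build placeholders, the online ("train") and target ("fixed") Q-networks,
        # the TD loss, the training op, and the ops that sync the target network.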
        # Inputs use NHWC layout: (batch, height, width, num_frame_stack); changed from the earlier (batch, frames, H, W)
        input_dim_general = (None, self.pic_size[0], self.pic_size[1], self.num_frame_stack)
        input_dim_with_batch = (self.batchsize, self.pic_size[0], self.pic_size[1], self.num_frame_stack)
self.input_prev_state = tf.compat.v1.placeholder(tf.float32, input_dim_general, "prev_state")
self.input_next_state = tf.compat.v1.placeholder(tf.float32, input_dim_with_batch, "next_state")
self.input_reward = tf.compat.v1.placeholder(tf.float32, self.batchsize, "reward")
self.input_actions = tf.compat.v1.placeholder(tf.int32, self.batchsize, "actions")
self.input_done_mask = tf.compat.v1.placeholder(tf.int32, self.batchsize, "done_mask")
# The target Q-values come from the fixed network
with tf.compat.v1.variable_scope("fixed"): #64 96 96 3
# Create target network which is gonna be fixed and updated every C parameters
qsa_targets = self.create_network(self.input_next_state, trainable=False)
with tf.compat.v1.variable_scope("train"): # ? 96 96 3
# Create Prediction/Estimate network which will be trained/updated every 3 frames
# Create Prediction/Estimate network which will be trained/updated every 3 frames
qsa_estimates = self.create_network(self.input_prev_state, trainable=True)
self.best_action = tf.argmax(qsa_estimates, axis=1)
not_done = tf.cast(tf.logical_not(tf.cast(self.input_done_mask, "bool")), "float32")
# select the chosen action from each row
# in numpy this is qsa_estimates[range(batchsize), self.input_actions]
action_slice = tf.stack([tf.range(0, self.batchsize), self.input_actions], axis=1)
q_estimates_for_input_action = tf.gather_nd(qsa_estimates, action_slice)
        # From the DQN paper: loss = (r + gamma * max_a' Q_target(s', a') - Q_estimate(s, a))^2
q_target = tf.reduce_max(qsa_targets, -1) * self.gamma * not_done + self.input_reward
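        # tf.nn.l2_loss(x) returns sum(x**2) / 2, so the loss below is half the mean squared TD error over the batch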
training_loss = tf.nn.l2_loss(q_target - q_estimates_for_input_action) / self.batchsize
        # reg_loss = tf.add_n(tf.losses.get_regularization_losses())
        reg_loss = tf.constant(0.0)  # regularization term currently disabled
#Adam optimizer
optimizer = tf.train.AdamOptimizer(**(self.optimizer_params))
        #Alternative: RMSProp optimizer
# optimizer = tf.train.RMSPropOptimizer(**(self.optimizer_params))
self.train_op = optimizer.minimize(reg_loss + training_loss)
train_params = self.get_variables("train")
fixed_params = self.get_variables("fixed")
assert (len(train_params) == len(fixed_params))
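        # Ops that copy the online ("train") weights into the target ("fixed") network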
self.copy_network_ops = [tf.assign(fixed_v, train_v) for train_v, fixed_v in zip(train_params, fixed_params)]
def get_variables(self, scope):
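        # Collect a scope's variables, excluding Adam's optimizer slot variables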
vars = [t for t in tf.compat.v1.global_variables()
if "%s/" % scope in t.name and "Adam" not in t.name]
return sorted(vars, key=lambda v: v.name)
def create_network(self, input, trainable):
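        # Two conv + max-pool blocks, a 400-unit dense layer, and a linear head with one Q-value per action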
if trainable:
# wr = None
wr = tf.compat.v1.keras.regularizers.l2(l=self.regularization)
else:
wr = None
net = tf.layers.conv2d(inputs=input, filters=8, kernel_size=(7,7), strides=4, name='conv1', kernel_regularizer=wr)
net = tf.nn.relu(net)
net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')
net = tf.layers.conv2d(inputs=net, filters=16, kernel_size=(3, 3), strides=1, name='conv2',
kernel_regularizer=wr)
net = tf.nn.relu(net)
net = tf.nn.max_pool2d(net, ksize=2, strides=2, padding='SAME')
net = tf.layers.flatten(net)
net = tf.layers.dense(net, 400, activation=tf.nn.relu, kernel_regularizer=wr)
# net = tf.layers.dropout(net, 0.5)
q_state_action_values = tf.layers.dense(net, self.dim_actions, activation=None, kernel_regularizer=wr)
return q_state_action_values
    def check_early_stop(self, reward, total_reward, frames_in_episode):
        # Hook for ending unpromising episodes early; no-op by default (may be overridden by a subclass).
        return False, 0.0
def get_random_action(self):
return np.random.choice(self.dim_actions)
def get_epsilon(self):
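        # Exploration rate: fixed at playing_epsilon when not training, otherwise
        # linearly annealed from initial_epsilon down to min_epsilon.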
if not self.do_training:
return self.playing_epsilon
elif self.global_counter >= self.epsilon_decay_steps:
return self.min_epsilon
else:
# linear decay
r = 1.0 - self.global_counter / float(self.epsilon_decay_steps)
return self.min_epsilon + (self.initial_epsilon - self.min_epsilon) * r
def train(self):
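        # Sample a mini-batch from the replay buffer and run one optimization step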
batch = self.exp_history.sample_mini_batch(self.batchsize)
        # Map each placeholder to its key in the sampled batch, then build the actual feed dict
fd = {
self.input_reward: "reward",
self.input_prev_state: "prev_state",
self.input_next_state: "next_state",
self.input_actions: "actions",
self.input_done_mask: "done_mask"
}
fd1 = {ph: batch[k] for ph, k in fd.items()}
self.session.run([self.train_op], fd1)
def play_episode(self, render, load_checkpoint):
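        # Run one episode with frame skipping; optionally store experience and train.
        # Returns (total_score, total_reward, frames_in_episode, epsilon).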
eh = (
self.exp_history if self.do_training
else self.playing_cache
)
total_reward = 0
total_score = 0
frames_in_episode = 0
first_frame = self.env.reset()
first_frame_pp = processimage.process_image(first_frame)
eh.start_new_episode(first_frame_pp)
epsilon = self.get_epsilon()
while True:
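            # Epsilon-greedy action selection; when load_checkpoint is set (evaluation), always act greedily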
if np.random.rand() > epsilon and not load_checkpoint:
action_idx = self.session.run(
self.best_action,
{self.input_prev_state: eh.current_state()[np.newaxis, ...]}
)[0]
elif not load_checkpoint:
action_idx = self.get_random_action()
elif load_checkpoint:
action_idx = self.session.run(
self.best_action,
{self.input_prev_state: eh.current_state()[np.newaxis, ...]}
)[0]
if self.action_map is not None:
action = self.action_map[action_idx]
else:
action = action_idx
reward = 0
score = 0
for _ in range(self.frame_skip):
observation, r, done, info = self.env.step(action)
if render:
self.env.render()
score += r
                #Reward shaping: scale up positive rewards later in the episode
                if r > 0:
                    r = r + frames_in_episode*0.2 #around frame 230 this adds roughly +50 reward per tile
reward += r
if done:
break
early_done, punishment = self.check_early_stop(reward, total_reward, frames_in_episode)
if early_done:
reward += punishment
done = done or early_done
total_reward += reward
total_score += score
frames_in_episode += 1
observation = processimage.process_image(observation)
eh.add_experience(observation, action_idx, done, reward)
if self.do_training:
self.global_counter += 1
step = self.session.run(self.increment_global_step_op)
                if self.global_counter % self.target_network_update_freq == 0:
self.update_target_network()
train_cond = (
self.exp_history.counter >= self.min_experience_size and
self.global_counter % self.train_freq == 0
)
if train_cond:
self.train()
if done:
if self.do_training:
self.episode_counter += 1
return total_score, total_reward, frames_in_episode, epsilon
def update_target_network(self):
self.session.run(self.copy_network_ops)
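

# Minimal usage sketch (not part of the original file): it assumes a Gym `env` and an
# `ACTION_MAP` provided by the caller, plus a TF1-style session created after
# build_graph(); the accompanying training script may wire this up differently.
#
#   agent = DQN(env, action_map=ACTION_MAP)
#   agent.build_graph()
#   agent.session = tf.compat.v1.Session()
#   agent.session.run(tf.compat.v1.global_variables_initializer())
#   score, reward, frames, eps = agent.play_episode(render=False, load_checkpoint=False)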