import tensorflow as tf
import numpy as np
import gym, time, random, threading
import os

from keras.callbacks import TensorBoard
from keras.models import *
from keras.layers import *
from keras import backend as K

from tcl_env_dqn_1 import *

path = os.getcwd()
MODELS_DIRECTORY = path + '/success1'                # directory of saved .h5 checkpoints used for voting
NAME = "A3C++logs/A3C++{}".format(int(time.time()))  # TensorBoard log directory

RUN_TIME = 5000          # training wall-clock time in seconds
THREADS = 16             # number of agent (environment) threads
OPTIMIZERS = 2           # number of optimizer threads
THREAD_DELAY = 0.000001  # per-step sleep that yields control between agent threads

N_STEP_RETURN = 15       # horizon n of the n-step return
GAMMA = 1.0              # discount factor
GAMMA_N = GAMMA ** N_STEP_RETURN

# Linearly decayed epsilon-greedy exploration schedule
EPS_START = .5
EPS_STOP = .001
EPS_DECAY = 5e-6

MIN_BATCH = 200          # number of transitions sampled per training batch
TR_FREQ = 100            # minimum number of fresh transitions between training steps

LOSS_V = 0.4             # weight of the value (critic) loss
LOSS_ENTROPY = 1.0       # weight of the entropy regularization term

max_reward = -100.0      # initial per-day best reward used for checkpointing

TRAINING_ITERATIONS = 1
LEARNING_RATE = 1e-3
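
# The n-step return used throughout this file is
#   R_t = r_t + GAMMA * r_{t+1} + ... + GAMMA^(n-1) * r_{t+n-1} + GAMMA^n * V(s_{t+n}),  n = N_STEP_RETURN.
# Agent.train accumulates the reward part of R_t incrementally and Brain.optimize adds the
# bootstrap term GAMMA_N * V(s_{t+n}), masked to zero when s_{t+n} is terminal.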


class Brain:
    # Shared network + training machinery used by all agent and optimizer threads.

    train_queue = [[], [], [], [], []]       # full history of transitions: [s, a, r, s_, s_mask]
    train_queue_copy = [[], [], [], [], []]  # transitions collected since the last training step
    lock_queue = threading.Lock()

    def __init__(self, **kwargs):
        self.env = kwargs.get("environment")
        self.learning_rate = kwargs.get('learning_rate', LEARNING_RATE)
        self.tr_freq = kwargs.get('training_frequency', TR_FREQ)
        self.min_batch = kwargs.get('min_batch', MIN_BATCH)
        self.gamman = kwargs.get('gamma_n', GAMMA_N)
        self.models_directory = kwargs.get('models_directory', MODELS_DIRECTORY)
        self.num_state = self.env.env.observation_space.shape[0]
        self.num_tcl = self.env.env.num_tcls
        self.num_actions = self.env.env.action_space.n
        self.none_state = np.zeros(self.num_state)

        tf.compat.v1.disable_eager_execution()

        # A single TF1-style session is shared by all threads; variables are initialized
        # manually because Keras is told not to initialize them itself.
        self.session = tf.compat.v1.Session()
        K.set_session(self.session)
        K.manual_variable_initialization(True)

        self.model = self._build_model(num_state=self.num_state, num_tcls=self.num_tcl)
        self.graph = self._build_graph(self.model)

        self.session.run(tf.compat.v1.global_variables_initializer())

        # Best reward seen so far for each training day; a checkpoint is saved whenever an
        # episode beats the stored value (see Environment.runEpisode).
        self.max_reward = max_reward
        self.rewards = {}
        for i in range(self.env.env.day0, self.env.env.dayn):
            self.rewards[i] = self.max_reward

    def _build_model(self, num_state, num_tcls):
        # Actor-critic network: the first num_tcls entries of the state (one per TCL) are
        # average-pooled into a single feature, concatenated with the rest of the state and
        # passed through a dense layer feeding two heads (policy and value).
        l_input = Input(batch_shape=(None, num_state))
        print("Input shape:", l_input.shape.as_list())
        print("Number of TCLs:", num_tcls)

        l_input1 = Lambda(lambda x: x[:, 0:num_tcls])(l_input)   # per-TCL part of the state
        l_input2 = Lambda(lambda x: x[:, num_tcls:])(l_input)    # remaining state variables
        l_input1 = Reshape((num_tcls, 1))(l_input1)
        l_Pool = AveragePooling1D(pool_size=num_tcls)(l_input1)  # aggregate the TCL features
        l_Pool = Reshape([1])(l_Pool)

        l_dense = Concatenate()([l_Pool, l_input2])
        l_dense = Dense(100, activation='relu')(l_dense)
        l_dense = Dropout(0.3)(l_dense)

        out = Dense(self.num_actions, activation='softmax')(l_dense)  # policy head
        out_value = Dense(1, activation='linear')(l_dense)            # value head

        model = Model(inputs=l_input, outputs=[out, out_value])
        model._make_predict_function()  # build the predict function up front so threads can share it
        return model

    def _build_graph(self, model):
        # Builds the A3C loss and the RMSProp training op on top of the Keras model.
        s_t = tf.compat.v1.placeholder(tf.float32, shape=(None, self.num_state))
        a_t = tf.compat.v1.placeholder(tf.float32, shape=(None, self.num_actions))  # one-hot actions
        r_t = tf.compat.v1.placeholder(tf.float32, shape=(None, 1))                 # n-step returns

        p, v = model(s_t)

        log_prob = tf.math.log(tf.reduce_sum(input_tensor=p * a_t, axis=1, keepdims=True) + 1e-10)
        advantage = r_t - v

        loss_policy = -log_prob * tf.stop_gradient(advantage)  # policy gradient loss
        loss_value = LOSS_V * tf.square(advantage)              # value (critic) loss
        # sum_a p*log(p) is the negative entropy, so adding it to the loss encourages exploration
        entropy = LOSS_ENTROPY * (tf.reduce_sum(input_tensor=p * tf.math.log(p + 1e-10), axis=1, keepdims=True))

        loss_total = tf.reduce_mean(input_tensor=loss_policy + loss_value + entropy)

        optimizer = tf.compat.v1.train.RMSPropOptimizer(self.learning_rate)
        minimize = optimizer.minimize(loss_total)

        return s_t, a_t, r_t, minimize, loss_total
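
    # Per-sample loss minimized above:
    #   L = -log pi(a|s) * (R - V(s))                  (policy term, advantage held constant)
    #       + LOSS_V * (R - V(s))^2                    (value term)
    #       + LOSS_ENTROPY * sum_a pi(a|s) log pi(a|s) (negative entropy, i.e. entropy bonus)
    # where R is the n-step return fed in through r_t.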

    def optimize(self):
        # Wait until enough fresh transitions have been collected.
        if len(self.train_queue_copy[0]) < self.tr_freq or len(self.train_queue_copy[0]) < self.min_batch:
            time.sleep(0)  # yield to the other threads
            return

        with self.lock_queue:
            # Another optimizer thread may have consumed the queue while we waited for the lock.
            if len(self.train_queue_copy[0]) < self.tr_freq:
                return

            # Down-sample the accumulated history to min_batch transitions and take all the
            # freshly collected transitions for this update.
            self.train_queue = random.sample(np.array(self.train_queue).T.tolist(), self.min_batch)
            self.train_queue = np.array(self.train_queue).T.tolist()
            s, a, r, s_, s_mask = self.train_queue_copy
            self.train_queue_copy = [[], [], [], [], []]

        s = np.vstack(s)
        a = np.vstack(a)
        r = np.vstack(r)
        s_ = np.vstack(s_)
        s_mask = np.vstack(s_mask)

        if len(s) > 5 * self.min_batch:
            print("Optimizer alert! Minimizing batch of %d" % len(s))

        # Add the bootstrap term GAMMA_N * V(s_n); s_mask is 0 for terminal states.
        v = self.predict_v(s_)
        r = r + self.gamman * v * s_mask

        s_t, a_t, r_t, minimize, loss = self.graph
        print("Training...")
        self.session.run(minimize, feed_dict={s_t: s, a_t: a, r_t: r})
        print("Done...")

    def train_push(self, s, a, r, s_):
        with self.lock_queue:
            self.train_queue[0].append(s)
            self.train_queue[1].append(a)
            self.train_queue[2].append(r)

            self.train_queue_copy[0].append(s)
            self.train_queue_copy[1].append(a)
            self.train_queue_copy[2].append(r)

            if s_ is None:
                self.train_queue[3].append(self.none_state)
                self.train_queue[4].append(0.)

                self.train_queue_copy[3].append(self.none_state)
                self.train_queue_copy[4].append(0.)
            else:
                self.train_queue[3].append(s_)
                self.train_queue[4].append(1.)

                self.train_queue_copy[3].append(s_)
                self.train_queue_copy[4].append(1.)

    def predict(self, s):
        p, v = self.model.predict(s)
        return p, v

    def predict_p(self, s):
        p, v = self.model.predict(s)
        return p

    def predict_p_vote(self, s):
        # Ensemble prediction used at evaluation time: every saved checkpoint votes with its
        # greedy action and the votes are averaged.
        votes = []
        for filename in os.listdir(self.models_directory):
            if filename.endswith(".h5"):
                try:
                    self.model.load_weights(self.models_directory + "/" + filename)
                    p = self.model.predict(s)[0][0]
                    votes.append(ACTIONS[np.argmax(p)])
                except Exception:
                    print(filename + " didn't vote!")

        boosted_p = np.average(np.array(votes), axis=0)
        return np.rint(boosted_p).astype(int)
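
    # Note on the voting scheme: each checkpoint in MODELS_DIRECTORY is loaded into the same
    # network, casts ACTIONS[argmax p] as its vote, and the element-wise average of the votes is
    # rounded to the nearest integer to form the final action vector. ACTIONS is expected to be
    # provided by tcl_env_dqn_1.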

    def predict_v(self, s):
        p, v = self.model.predict(s)
        return v


# Global frame counter shared by all agent threads; it drives the epsilon decay schedule.
frames = 0


class Agent:
    def __init__(self, eps_start, eps_end, eps_decay, num_actions):
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.memory = []  # sliding window of at most N_STEP_RETURN transitions
        self.R = 0.       # running discounted reward of the current window
        self.num_actions = num_actions

    def getEpsilon(self):
        # Linear decay from eps_start to eps_end as a function of the global frame count.
        return max(self.eps_start - frames * self.eps_decay, self.eps_end)

    def act(self, s, render=False, br=None):
        global frames, brain
        if br is not None:
            brain = br
        eps = self.getEpsilon()
        frames = frames + 1

        if random.random() < eps:
            # Explore: sample a random probability vector over actions.
            p = np.random.dirichlet(np.ones(self.num_actions), size=1)
        else:
            s = np.array([s])
            if render:
                # Evaluation mode: use the ensemble vote instead of the current policy.
                print('starting the vote')
                a = brain.predict_p_vote(s)
                p = np.random.dirichlet(np.ones(self.num_actions), size=1)
                print(a)
                return list(a), p
            p = brain.predict_p(s)

        a = np.argmax(p.reshape(self.num_actions,))
        return a, p

    def train(self, s, a, r, s_):
        def get_sample(memory, n):
            # Build an n-step sample: first state/action, accumulated return, n-th next state.
            s, a, _, _ = memory[0]
            _, _, _, s_ = memory[n - 1]
            return s, a, self.R, s_

        a_cats = a  # actions arrive already one-hot encoded from the caller
        self.memory.append((s, a_cats, r, s_))

        self.R = (self.R + r * GAMMA_N) / GAMMA

        if s_ is None:
            # Episode ended: flush the remaining (shorter than n) windows.
            while len(self.memory) > 0:
                n = len(self.memory)
                s, a, r, s_ = get_sample(self.memory, n)
                brain.train_push(s, a, r, s_)

                self.R = (self.R - self.memory[0][2]) / GAMMA
                self.memory.pop(0)

            self.R = 0

        if len(self.memory) >= N_STEP_RETURN:
            s, a, r, s_ = get_sample(self.memory, N_STEP_RETURN)
            brain.train_push(s, a, r, s_)

            self.R = self.R - self.memory[0][2]
            self.memory.pop(0)
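
    # Bookkeeping invariant for self.R: after k pushes the update
    #   R <- (R + r_new * GAMMA_N) / GAMMA
    # leaves R = sum_i GAMMA^(N-k+i) * r_i, so once the window holds N_STEP_RETURN transitions
    # R equals the n-step reward sum r_0 + GAMMA*r_1 + ... + GAMMA^(n-1)*r_{n-1}. Subtracting
    # r_0 when the oldest transition is popped keeps the invariant for the next window.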


class Environment(threading.Thread):
    stop_signal = False

    def __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP, eps_decay=EPS_DECAY, **kwargs):
        threading.Thread.__init__(self)

        self.render = render
        self.env = MicroGridEnv(**kwargs)
        self.agent = Agent(eps_start, eps_end, eps_decay, num_actions=self.env.action_space.n)
        self.brain = None

    def runEpisode(self, day=None, pplt=True, web=False):
        if not web:
            s = self.env.reset_all(day=day)
        else:
            s = self.env.reset(day=day)

        R = 0
        while True:
            time.sleep(THREAD_DELAY)  # yield so the other agent threads can run

            a, p = self.agent.act(s, self.render, self.brain)
            s_, r, done, _ = self.env.step(a)
            R += r

            if self.render:
                self.env.render(R)

            if done:
                s_ = None

            if not self.render:
                # One-hot encode the action before handing the transition to the agent.
                aa = np.zeros(shape=(NUM_ACTIONS,))
                aa[a] = 1
                self.agent.train(s, aa, r, s_)

            s = s_

            if done:
                break

        print("Episode finished, total reward:", R)
        if not web:
            REWARDS[self.env.day].append(R)

        if self.render:
            return R

        # Checkpoint the model whenever a new per-day best reward is reached after the
        # exploration rate has mostly decayed.
        if R > brain.rewards[self.env.day] and self.agent.getEpsilon() < 0.2:
            print('new max found: ' + str(R))
            print("-------------------------------------------------------------------------------------------------")
            try:
                writer = tf.compat.v1.summary.FileWriter(NAME, brain.session.graph)
                brain.model.save(MODELS_DIRECTORY + "/A3C++" + str(self.env.day) + ".h5")
                print("Model saved")
            except:
                pass
            brain.rewards[self.env.day] = R

    def run(self):
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        self.stop_signal = True
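
# Training runs as a producer/consumer pattern: each Environment thread generates episodes and
# pushes n-step transitions into the Brain's queues via Agent.train / Brain.train_push, while the
# Optimizer threads below repeatedly call Brain.optimize to consume those transitions and update
# the shared network.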


class Optimizer(threading.Thread):
    stop_signal = False

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        while not self.stop_signal:
            brain.optimize()

    def stop(self):
        self.stop_signal = True


if __name__ == "__main__":
    import sys

    TRAIN = False

    DAY0 = 0
    DAYN = 10

    REWARDS = {}
    for i in range(DAY0, DAYN):
        REWARDS[i] = []

    env_test = Environment(render=True, eps_start=0., eps_end=0., day0=DAY0, dayn=DAYN, iterations=24)
    NUM_STATE = env_test.env.observation_space.shape[0]
    NUM_ACTIONS = env_test.env.action_space.n
    NONE_STATE = np.zeros(NUM_STATE)

    brain = Brain(environment=env_test)

    if TRAIN:
        envs = [Environment(day0=DAY0, dayn=DAYN) for i in range(THREADS)]
        opts = [Optimizer() for i in range(OPTIMIZERS)]
        t0 = time.time()

        for o in opts:
            o.start()
        for e in envs:
            e.start()

        time.sleep(RUN_TIME)

        for e in envs:
            e.stop()
        for e in envs:
            e.join()

        for o in opts:
            o.stop()
        for o in opts:
            o.join()

        brain.model.save("success00/A3C++" + ".h5")
        print("Training finished")
        print('training_time:', time.time() - t0)

        import pickle
        with open("REWARDS_A3C++train.pkl", 'wb') as f:
            pickle.dump(REWARDS, f, pickle.HIGHEST_PROTOCOL)

    # Evaluation: run one rendered episode per day using the voting ensemble.
    try:
        for day in range(DAY0, DAYN):
            env_test.runEpisode(day)
        print("average reward: ", np.average([list(REWARDS[i])[-1] for i in range(DAY0, DAYN)]))
        import pickle

    except NameError as e:
        print(e)