# frozen-lake / main.py
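"""Train and evaluate a tabular Q-learning agent on a custom FrozenLake
environment, rendering training and evaluation progress in the terminal
with curses."""
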
import time
import curses
import warnings

import numpy as np

from environment import FrozenLakeEnvCustom
from agent import QLearningAgent

warnings.filterwarnings("ignore")


def train_agent(
    n_training_episodes,
    min_epsilon,
    max_epsilon,
    decay_rate,
    env,
    max_steps,
    agent,
    learning_rate,
    gamma,
    use_frame_delay,
):
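    """Train the agent with an epsilon-greedy exploration schedule.

    Epsilon decays exponentially from max_epsilon towards min_epsilon over the
    episodes; every step updates the agent's Q-table and re-renders the
    environment so training can be watched in the terminal.
    """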
    for episode in range(n_training_episodes + 1):
        # Reduce epsilon (because we need less and less exploration)
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
            -decay_rate * episode
        )
        state, info = env.reset()
        done = False

        for step in range(max_steps):
            # Choose the action a_t with the epsilon-greedy policy
            action = agent.epsilon_greedy_policy(state, epsilon)

            # Take action a_t and observe the next state s_{t+1} and reward r_{t+1}
            new_state, reward, done, truncated, info = env.step(action)
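            # Update Q(s, a). QLearningAgent is assumed to apply the standard
            # Q-learning rule: Q(s, a) += lr * (r + gamma * max_a' Q(s', a') - Q(s, a))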
            agent.update_q_table(state, action, reward, gamma, learning_rate, new_state)

            env.render(
                title=f"Training: {episode}/{n_training_episodes}",
                q_table=agent.q_table,
            )

            if use_frame_delay:
                time.sleep(0.01)

            if done:
                break

            state = new_state

    return agent


def evaluate_agent(env, max_steps, n_eval_episodes, agent, seed, use_frame_delay):
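    """Run the greedy policy for the evaluation episodes, recording which
    episodes reach the goal and how often the slippery ice moves the agent to
    a state other than the one its action aimed for ("slips")."""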
    successful_episodes = []
    episodes_slips = []

    for episode in range(n_eval_episodes + 1):
        if seed:
            state, info = env.reset(seed=seed[episode])
        else:
            state, info = env.reset()

        done = False
        total_rewards_ep = 0
        slips = []

        for step in range(max_steps):
            # Take the action (index) that has the maximum expected future reward for this state
            action = agent.greedy_policy(state)
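            # On the slippery lake the executed transition can differ from the
            # intended one; compare the state the action was expected to reach
            # with the state actually reached to detect a slip on this step.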
            expected_new_state = env.get_expected_new_state_for_action(action)
            new_state, reward, done, truncated, info = env.step(action)
            total_rewards_ep += reward

            if expected_new_state != new_state:
                slips.append((step, action, expected_new_state, new_state))

            if reward != 0:
                successful_episodes.append(episode)

            env.render(
                title=f"Evaluating: {episode}/{n_eval_episodes} | Slips: {len(slips)}",
                q_table=agent.q_table,
            )

            if use_frame_delay:
                time.sleep(0.01)

            if done:
                break

            state = new_state

        # Record the number of slips observed in this episode
        episodes_slips.append(len(slips))
    mean_slips = np.mean(episodes_slips)
    return successful_episodes, mean_slips


def main(screen):
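    """Entry point invoked by curses.wrapper; the `screen` window it passes in
    is unused because the environment manages its own curses screen."""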
    # Training parameters
    n_training_episodes = 2000  # Total training episodes
    learning_rate = 0.1  # Learning rate

    # Evaluation parameters
    n_eval_episodes = 100  # Total number of test episodes

    # Environment parameters
    max_steps = 99  # Max steps per episode
    gamma = 0.99  # Discounting rate
    eval_seed = []  # Per-episode evaluation seeds; an empty list means unseeded resets

    # Exploration parameters
    max_epsilon = 1.0  # Exploration probability at start
    min_epsilon = 0.05  # Minimum exploration probability
    decay_rate = 0.0005  # Exponential decay rate for exploration prob

    use_frame_delay = False

    env = FrozenLakeEnvCustom(map_name="4x4", is_slippery=True, render_mode="curses")
    agent = QLearningAgent(env)

    agent = train_agent(
        n_training_episodes,
        min_epsilon,
        max_epsilon,
        decay_rate,
        env,
        max_steps,
        agent,
        learning_rate,
        gamma,
        use_frame_delay,
    )

    successful_episodes, mean_slips = evaluate_agent(
        env, max_steps, n_eval_episodes, agent, eval_seed, use_frame_delay
    )
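
    # Show a final summary on the environment's curses screen: noutrefresh()
    # stages the update, doupdate() pushes it to the terminal, and the sleep
    # keeps it visible before curses.wrapper restores the terminal.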
    env_curses_screen = env.curses_screen
    env_curses_screen.addstr(
        5,
        2,
        f"Successful episodes: {len(successful_episodes)}/{n_eval_episodes} | Avg slips: {mean_slips:.2f}\n\n",
    )
    env_curses_screen.noutrefresh()
    curses.doupdate()
    time.sleep(10)


if __name__ == "__main__":
    # curses.wrapper resets the terminal state after the program exits.
    # Call main("") directly instead if you want to leave the final state of
    # the environment on the terminal.
    curses.wrapper(main)