# frozen-lake/main.py
import time
import curses
import numpy as np
import warnings
from environment import FrozenLakeEnvCustom
from agent import QLearningAgent
warnings.filterwarnings("ignore")
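
# The two imported classes live in sibling modules (environment.py and agent.py)
# that are not part of this file. Assumed interfaces, inferred from how they are
# used below (a sketch, not the actual source):
#   QLearningAgent(env)
#       .q_table                                -- array of shape (n_states, n_actions)
#       .epsilon_greedy_policy(state, epsilon)  -- explore with prob. epsilon, else greedy
#       .greedy_policy(state)                   -- argmax over q_table[state]
#       .update_q_table(state, action, reward, gamma, learning_rate, new_state)
#   FrozenLakeEnvCustom(map_name, is_slippery, render_mode)
#       .reset() / .step(action)                -- Gymnasium-style API
#       .render(title=..., q_table=...)         -- draws the grid to the terminal via curses
#       .get_expected_new_state_for_action(action) -- successor if the ice were not slippery
#       .curses_screen                          -- underlying curses window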


def train_agent(
    n_training_episodes,
    min_epsilon,
    max_epsilon,
    decay_rate,
    env,
    max_steps,
    agent,
    learning_rate,
    gamma,
    use_frame_delay,
):
    for episode in range(n_training_episodes + 1):
        # Reduce epsilon (because we need less and less exploration)
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
            -decay_rate * episode
        )
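        # Worked example of this exponential schedule, assuming the defaults set in
        # main() below (max_epsilon=1.0, min_epsilon=0.05, decay_rate=0.0005):
        #   episode 0    -> epsilon = 1.00
        #   episode 1000 -> epsilon = 0.05 + 0.95 * exp(-0.5) ≈ 0.63
        #   episode 2000 -> epsilon = 0.05 + 0.95 * exp(-1.0) ≈ 0.40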
        state, info = env.reset()
        done = False
        for step in range(max_steps):
            # Choose the action At using the epsilon-greedy policy
            action = agent.epsilon_greedy_policy(state, epsilon)
            # Take the action (a) and observe the next state (s') and reward (r)
            new_state, reward, done, truncated, info = env.step(action)
            agent.update_q_table(state, action, reward, gamma, learning_rate, new_state)
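            # update_q_table is assumed to apply the standard Q-learning TD update
            # (a sketch; the actual implementation lives in agent.py):
            #   Q[s, a] += learning_rate * (reward + gamma * max(Q[s', :]) - Q[s, a])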
            env.render(
                title=f"Training: {episode}/{n_training_episodes}",
                q_table=agent.q_table,
            )
            if use_frame_delay:
                time.sleep(0.01)
            # Stop the episode when the agent reaches the goal or falls into a hole
            if done:
                break
            state = new_state
    return agent


def evaluate_agent(env, max_steps, n_eval_episodes, agent, seed, use_frame_delay):
    successful_episodes = []
    episodes_slips = []
    for episode in range(n_eval_episodes):
        if seed:
            state, info = env.reset(seed=seed[episode])
        else:
            state, info = env.reset()
        done = False
        total_rewards_ep = 0
        slips = []
        for step in range(max_steps):
            # Take the action (index) that has the maximum expected future reward in this state
            action = agent.greedy_policy(state)
            expected_new_state = env.get_expected_new_state_for_action(action)
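            # get_expected_new_state_for_action presumably returns the state the action
            # would lead to on non-slippery ice; any mismatch with the state actually
            # reached below is counted as a slip.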
            new_state, reward, done, truncated, info = env.step(action)
            total_rewards_ep += reward
            if expected_new_state != new_state:
                slips.append((step, action, expected_new_state, new_state))
            # In FrozenLake the only non-zero reward is for reaching the goal,
            # so any reward marks a successful episode
            if reward != 0:
                successful_episodes.append(episode)
            env.render(
                title=f"Evaluating: {episode}/{n_eval_episodes} | Slips: {len(slips)}",
                q_table=agent.q_table,
            )
            if use_frame_delay:
                time.sleep(0.01)
            if done:
                break
            state = new_state
        # Record the slip count once per episode
        episodes_slips.append(len(slips))
    mean_slips = np.mean(episodes_slips)
    return successful_episodes, mean_slips
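
# Example usage with fixed evaluation seeds (hypothetical; main() below passes an
# empty list, so every reset is unseeded):
#   eval_seed = list(range(100))
#   successful, mean_slips = evaluate_agent(env, 99, 100, agent, eval_seed, False)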


def main(screen):
    # Training parameters
    n_training_episodes = 2000  # Total training episodes
    learning_rate = 0.1  # Learning rate

    # Evaluation parameters
    n_eval_episodes = 100  # Total number of test episodes

    # Environment parameters
    max_steps = 99  # Max steps per episode
    gamma = 0.99  # Discounting rate
    eval_seed = []  # Evaluation seeds for the environment (empty -> unseeded resets)

    # Exploration parameters
    max_epsilon = 1.0  # Exploration probability at start
    min_epsilon = 0.05  # Minimum exploration probability
    decay_rate = 0.0005  # Exponential decay rate for the exploration probability

    use_frame_delay = False

    env = FrozenLakeEnvCustom(map_name="4x4", is_slippery=True, render_mode="curses")
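    # With is_slippery=True the standard Gymnasium FrozenLake moves in the intended
    # direction with probability 1/3 and slides to each perpendicular direction with
    # probability 1/3; FrozenLakeEnvCustom is assumed to keep the same dynamics.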
    agent = QLearningAgent(env)

    agent = train_agent(
        n_training_episodes,
        min_epsilon,
        max_epsilon,
        decay_rate,
        env,
        max_steps,
        agent,
        learning_rate,
        gamma,
        use_frame_delay,
    )

    successful_episodes, mean_slips = evaluate_agent(
        env, max_steps, n_eval_episodes, agent, eval_seed, use_frame_delay
    )

    # Print the evaluation summary on the environment's curses screen
    env_curses_screen = env.curses_screen
    env_curses_screen.addstr(
        5,
        2,
        f"Successful episodes: {len(successful_episodes)}/{n_eval_episodes} | Avg slips: {mean_slips:.2f}\n\n",
    )
    # noutrefresh stages the update; doupdate pushes it to the terminal
    env_curses_screen.noutrefresh()
    curses.doupdate()
    # Keep the summary visible before curses.wrapper restores the terminal
    time.sleep(10)


if __name__ == "__main__":
    # curses.wrapper resets the terminal state after curses is done.
    # Call main("") directly instead if you want to leave the final state of the
    # environment on the terminal.
    curses.wrapper(main)