jackvial committed
Commit • 0484cb0
Parent(s):
setup
Files changed:
- .gitignore +3 -0
- .vscode/launch.json +15 -0
- agent.py +30 -0
- environment.py +179 -0
- main.py +157 -0
- requirements.txt +6 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
+.env
+frozen_lake_env
+__pycache__
.vscode/launch.json
ADDED
@@ -0,0 +1,15 @@
+{
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Debug",
+            "type": "python",
+            "request": "launch",
+            "python": "${workspaceFolder}/frozen_lake_env/bin/python3.10",
+            "program": "${workspaceFolder}/main.py",
+            "envFile": "${workspaceFolder}/.env",
+            "console": "integratedTerminal",
+            "justMyCode": false
+        }
+    ]
+}
agent.py
ADDED
@@ -0,0 +1,30 @@
+import numpy as np
+
+
+class QLearningAgent:
+    def __init__(self, env) -> None:
+        self.env = env
+        self.q_table = self.build_q_table(env.observation_space.n, env.action_space.n)
+
+    def build_q_table(self, n_states, n_actions):
+        return np.zeros((n_states, n_actions))
+
+    def epsilon_greedy_policy(self, state, epsilon):
+
+        # With probability epsilon take a random action, otherwise take the
+        # action that has the highest Q-value for the current state
+        if np.random.random() < epsilon:
+            return np.random.choice(self.env.action_space.n)
+        return np.argmax(self.q_table[state])
+
+    def greedy_policy(self, state):
+        return np.argmax(self.q_table[state])
+
+    def update_q_table(self, state, action, reward, gamma, learning_rate, new_state):
+
+        # Update Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
+        current_q = self.q_table[state][action]
+        next_max_q = np.max(self.q_table[new_state])
+        self.q_table[state][action] = current_q + learning_rate * (
+            reward + gamma * next_max_q - current_q
+        )
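
Since QLearningAgent is a plain tabular learner, it can be exercised on its own outside this Space. A minimal usage sketch (not part of the commit; the FrozenLake-v1 env id and the single hand-rolled step are assumptions for illustration):

import gym
from agent import QLearningAgent

# Hypothetical standalone usage, assuming gym 0.26-style reset/step returns.
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True)
agent = QLearningAgent(env)  # q_table starts as 16 x 4 zeros

state, info = env.reset(seed=0)
action = agent.epsilon_greedy_policy(state, epsilon=0.5)
new_state, reward, done, truncated, info = env.step(action)

# One Bellman backup: moves Q(s, a) toward r + gamma * max_a' Q(s', a')
agent.update_q_table(state, action, reward, 0.99, 0.1, new_state)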
environment.py
ADDED
@@ -0,0 +1,179 @@
+import re
+import curses
+import numpy as np
+import collections
+import warnings
+from typing import Optional
+from gym.envs.toy_text.frozen_lake import FrozenLakeEnv
+
+warnings.filterwarnings("ignore")
+
+
+class FrozenLakeEnvCustom(FrozenLakeEnv):
+    def __init__(
+        self,
+        render_mode: Optional[str] = None,
+        desc=None,
+        map_name="4x4",
+        is_slippery=True,
+    ):
+        self.curses_screen = curses.initscr()
+        curses.start_color()
+        curses.curs_set(0)
+        self.curses_color_pairs = self.build_ncurses_color_pairs()
+
+        # Blocking reads
+        self.curses_screen.timeout(-1)
+
+        super().__init__(
+            render_mode=render_mode,
+            desc=desc,
+            map_name=map_name,
+            is_slippery=is_slippery,
+        )
+
+    def build_ncurses_color_pairs(self):
+        """
+        Based on DeepMind pycolab: https://github.com/deepmind/pycolab/blob/master/pycolab/human_ui.py
+        """
+
+        color_fg = {
+            " ": (0, 0, 0),
+            "S": (368, 333, 388),
+            "H": (309, 572, 999),
+            "P": (999, 364, 0),
+            "F": (500, 999, 948),
+            "G": (999, 917, 298),
+            "?": (368, 333, 388),
+            "←": (309, 572, 999),
+            "↓": (999, 364, 0),
+            "→": (500, 999, 948),
+            "↑": (999, 917, 298),
+        }
+
+        color_pair = {}
+
+        cpair_0_fg_id, cpair_0_bg_id = curses.pair_content(0)
+        ids = set(range(curses.COLORS - 1)) - {
+            cpair_0_fg_id,
+            cpair_0_bg_id,
+        }
+
+        # We use color IDs from large to small.
+        ids = list(reversed(sorted(ids)))
+
+        # But only those color IDs we actually need.
+        ids = ids[: len(color_fg)]
+        color_ids = dict(zip(color_fg.values(), ids))
+
+        # Program these colors into curses.
+        for color, cid in color_ids.items():
+            curses.init_color(cid, *color)
+
+        # Now add the default colors to the color-to-ID map.
+        cpair_0_fg = curses.color_content(cpair_0_fg_id)
+        cpair_0_bg = curses.color_content(cpair_0_bg_id)
+        color_ids[cpair_0_fg] = cpair_0_fg_id
+        color_ids[cpair_0_bg] = cpair_0_bg_id
+
+        # The color pair IDs we'll use for all characters count up from 1; note
+        # that the "default" color pair of 0 is already defined, so enumeration
+        # starts at 1.
+        color_pair.update(
+            {character: pid for pid, character in enumerate(color_fg, start=1)}
+        )
+
+        # Program these color pairs into curses, and that's all there is to do.
+        for character, pid in color_pair.items():
+
+            # Get foreground and background colors for this character. Note how in
+            # the absence of a specified background color, the same color as the
+            # foreground is used.
+            cpair_fg = color_fg.get(character, cpair_0_fg_id)
+            cpair_bg = color_fg.get(character, cpair_0_fg_id)
+
+            # Get color IDs for those colors and initialise a color pair.
+            cpair_fg_id = color_ids[cpair_fg]
+            cpair_bg_id = color_ids[cpair_bg]
+            curses.init_pair(pid, cpair_fg_id, cpair_bg_id)
+
+        return color_pair
+
+    def render_ncurses_ui(self, screen, board, color_pair, title, q_table):
+        screen.erase()
+
+        # Draw the title
+        screen.addstr(0, 2, title)
+
+        # Draw the game board
+        for row_index, board_line in enumerate(board, start=1):
+            screen.move(row_index, 2)
+            for codepoint in "".join(list(board_line)):
+                screen.addch(codepoint, curses.color_pair(color_pair[codepoint]))
+
+        def action_to_char(action):
+            if action == 0:
+                return "←"
+            elif action == 1:
+                return "↓"
+            elif action == 2:
+                return "→"
+            elif action == 3:
+                return "↑"
+            else:
+                return "?"
+
+        # Draw the action grid
+        max_action_table = np.argmax(q_table, axis=1).reshape(4, 4)
+        for row_index, row in enumerate(max_action_table, start=1):
+            screen.move(row_index, 8)
+            for action in row:
+                char = action_to_char(action)
+                screen.addch(char, curses.color_pair(color_pair[char]))
+
+        # Draw the Q-table
+        q_table_2d = q_table.reshape(4, 16)
+        for row_index, row in enumerate(q_table_2d, start=1):
+            screen.move(row_index, 14)
+            for col_index, col in enumerate(row):
+                action = col_index % 4
+                char = action_to_char(action)
+                screen.addstr(f" {col:.2f}", curses.color_pair(color_pair[char]))
+                if action == 3:
+                    screen.addstr(" ", curses.color_pair(color_pair[" "]))
+
+        # Redraw the game screen (but in the curses memory buffer only).
+        screen.noutrefresh()
+
+    def ansi_frame_to_board(self, frame_string):
+        parts = frame_string.split("\n")
+        board = []
+        p = "\x1b[41m"
+        for part in parts[1:]:
+            if len(part):
+                row = re.findall(r"S|F|H|G", part)
+                try:
+                    row[part.index(p)] = "P"
+                except ValueError:
+                    pass
+                board.append(row)
+
+        return np.array(board)
+
+    def render(self, title=None, q_table=None):
+        if self.render_mode == "curses":
+            frame = self._render_text()
+
+            board = self.ansi_frame_to_board(frame)
+            self.render_ncurses_ui(
+                self.curses_screen, board, self.curses_color_pairs, title, q_table
+            )
+
+            # Show the screen to the user.
+            curses.doupdate()
+            return board
+
+        return super().render()
+
+    def get_expected_new_state_for_action(self, action):
+        return self.P[self.s][action][1][1]
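
To make ansi_frame_to_board concrete: the \x1b[41m escape that FrozenLake's text renderer emits sits immediately before the player's cell, and since only plain tile letters precede it within a row, its character offset doubles as the player's column index. A standalone sketch (not part of the commit; the sample frame string is a hand-written assumption about the renderer's output):

import re

# Hypothetical frame mimicking FrozenLake's text renderer: the first line
# echoes the last action, and "\x1b[41m" highlights the player's cell.
frame = "  (Left)\nS\x1b[41mF\x1b[0mFF\nFHFH\nFFFH\nHFFG\n"

p = "\x1b[41m"
board = []
for part in frame.split("\n")[1:]:
    if len(part):
        row = re.findall(r"S|F|H|G", part)  # the four tile letters
        try:
            row[part.index(p)] = "P"  # escape's offset == player's column
        except ValueError:
            pass  # no player marker in this row
        board.append(row)

print(board[0])  # ['S', 'P', 'F', 'F']

Similarly, get_expected_new_state_for_action leans on how gym 0.26 builds the slippery transition table: P[s][a] lists three (prob, next_state, reward, terminated) tuples in the order [(a - 1) % 4, a, (a + 1) % 4], so index [1] is the intended (non-slip) move and [1][1] its resulting state.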
main.py
ADDED
@@ -0,0 +1,157 @@
+import time
+import curses
+import numpy as np
+import warnings
+from environment import FrozenLakeEnvCustom
+from agent import QLearningAgent
+
+warnings.filterwarnings("ignore")
+
+
+def train_agent(
+    n_training_episodes,
+    min_epsilon,
+    max_epsilon,
+    decay_rate,
+    env,
+    max_steps,
+    agent,
+    learning_rate,
+    gamma,
+    use_frame_delay,
+):
+    for episode in range(n_training_episodes + 1):
+
+        # Reduce epsilon (because we need less and less exploration)
+        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(
+            -decay_rate * episode
+        )
+        state, info = env.reset()
+        done = False
+        for step in range(max_steps):
+
+            # Choose the action At using the epsilon-greedy policy
+            action = agent.epsilon_greedy_policy(state, epsilon)
+
+            # Take action At and observe Rt+1 and St+1,
+            # i.e. the reward (r) and the outcome state (s')
+            new_state, reward, done, truncated, info = env.step(action)
+            agent.update_q_table(state, action, reward, gamma, learning_rate, new_state)
+
+            env.render(
+                title=f"Training: {episode}/{n_training_episodes}",
+                q_table=agent.q_table,
+            )
+
+            if use_frame_delay:
+                time.sleep(0.01)
+
+            if done:
+                break
+
+            state = new_state
+    return agent
+
+
+def evaluate_agent(env, max_steps, n_eval_episodes, agent, seed, use_frame_delay):
+    successful_episodes = []
+    episodes_slips = []
+    for episode in range(n_eval_episodes + 1):
+        if seed:
+            state, info = env.reset(seed=seed[episode])
+        else:
+            state, info = env.reset()
+        done = False
+        total_rewards_ep = 0
+
+        slips = []
+        for step in range(max_steps):
+
+            # Take the action (index) that has the maximum expected future reward given that state
+            action = agent.greedy_policy(state)
+
+            expected_new_state = env.get_expected_new_state_for_action(action)
+            new_state, reward, done, truncated, info = env.step(action)
+            total_rewards_ep += reward
+
+            if expected_new_state != new_state:
+                slips.append((step, action, expected_new_state, new_state))
+
+            if reward != 0:
+                successful_episodes.append(episode)
+
+            env.render(
+                title=f"Evaluating: {episode}/{n_eval_episodes} | Slips: {len(slips)}",
+                q_table=agent.q_table,
+            )
+            episodes_slips.append(len(slips))
+
+            if use_frame_delay:
+                time.sleep(0.01)
+
+            if done:
+                break
+            state = new_state
+
+    mean_slips = np.mean(episodes_slips)
+    return successful_episodes, mean_slips
+
+
+def main(screen):
+
+    # Training parameters
+    n_training_episodes = 2000  # Total training episodes
+    learning_rate = 0.1  # Learning rate
+
+    # Evaluation parameters
+    n_eval_episodes = 100  # Total number of test episodes
+
+    # Environment parameters
+    max_steps = 99  # Max steps per episode
+    gamma = 0.99  # Discounting rate
+    eval_seed = []  # The evaluation seeds of the environment
+
+    # Exploration parameters
+    max_epsilon = 1.0  # Exploration probability at start
+    min_epsilon = 0.05  # Minimum exploration probability
+    decay_rate = 0.0005  # Exponential decay rate for exploration prob
+
+    use_frame_delay = False
+
+    env = FrozenLakeEnvCustom(map_name="4x4", is_slippery=True, render_mode="curses")
+
+    agent = QLearningAgent(env)
+    agent = train_agent(
+        n_training_episodes,
+        min_epsilon,
+        max_epsilon,
+        decay_rate,
+        env,
+        max_steps,
+        agent,
+        learning_rate,
+        gamma,
+        use_frame_delay,
+    )
+
+    successful_episodes, mean_slips = evaluate_agent(
+        env, max_steps, n_eval_episodes, agent, eval_seed, use_frame_delay
+    )
+
+    env_curses_screen = env.curses_screen
+    env_curses_screen.addstr(
+        5,
+        2,
+        f"Successful episodes: {len(successful_episodes)}/{n_eval_episodes} | Avg slips: {mean_slips:.2f}\n\n",
+    )
+    env_curses_screen.noutrefresh()
+    curses.doupdate()
+    time.sleep(10)
+
+
+if __name__ == "__main__":
+
+    # curses.wrapper resets the terminal state after using curses.
+    # Call main("") instead if you want to leave the final state of the environment
+    # on the terminal.
+    curses.wrapper(main)
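
One parameter interaction worth checking: with decay_rate = 0.0005, epsilon never gets close to min_epsilon within the 2000 training episodes. A quick tabulation (not part of the commit) of the same schedule used in train_agent:

import numpy as np

min_epsilon, max_epsilon, decay_rate = 0.05, 1.0, 0.0005
for episode in (0, 500, 1000, 2000):
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    print(f"episode {episode}: epsilon = {epsilon:.3f}")
# episode 0: epsilon = 1.000
# episode 500: epsilon = 0.790
# episode 1000: epsilon = 0.626
# episode 2000: epsilon = 0.399

So roughly 40% of actions in the final episodes are still random; a larger decay_rate (or more episodes) would be needed to actually approach min_epsilon.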
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+cloudpickle==2.2.0
+gym==0.26.2
+gym-notices==0.0.8
+numpy==1.24.0
+pygame==2.1.0
+tqdm==4.64.1