leaderboard

Running on CPU Upgrade

App Files Files Community

Quentin Gallouédec committed on Apr 6

Commit

76e0bcf

•

1 Parent(s): 4a5bd80

move eval to dedicated file

Browse files

Files changed (2) hide show

app.py +3 -86
src/evaluation.py +277 -0

app.py CHANGED Viewed

@@ -1,40 +1,24 @@
-import fnmatch
 import glob
 import json
-import logging
 import os
 import pprint
 import gradio as gr
-import gymnasium as gym
-import numpy as np
 import pandas as pd
-import torch
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import hf_hub_download, snapshot_download
-from huggingface_hub.utils._errors import EntryNotFoundError
 from src.css_html_js import dark_mode_gradio_js
 from src.envs import API, RESULTS_PATH, RESULTS_REPO, TOKEN
 from src.logging import configure_root_logger, setup_logger
-logging.getLogger("openai").setLevel(logging.WARNING)
-logger = setup_logger(__name__)
 configure_root_logger()
 logger = setup_logger(__name__)
 pp = pprint.PrettyPrinter(width=80)
-ALL_ENV_IDS = [
-    "CartPole-v1",
-    "MountainCar-v0",
-    "Acrobot-v1",
-    "Hopper-v4",
-]
 def model_hyperlink(link, model_id):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_id}</a>'
@@ -44,73 +28,6 @@ def make_clickable_model(model_id):
     return model_hyperlink(link, model_id)
-def pattern_match(patterns, source_list):
-    if isinstance(patterns, str):
-        patterns = [patterns]
-    env_ids = set()
-    for pattern in patterns:
-        for matching in fnmatch.filter(source_list, pattern):
-            env_ids.add(matching)
-    return sorted(list(env_ids))
-def evaluate(model_id, revision):
-    tags = API.model_info(model_id, revision=revision).tags
-    # Extract the environment IDs from the tags (usually only one)
-    env_ids = pattern_match(tags, ALL_ENV_IDS)
-    logger.info(f"Selected environments: {env_ids}")
-    results = {}
-    # Check if the agent exists
-    try:
-        agent_path = hf_hub_download(repo_id=model_id, filename="agent.pt")
-    except EntryNotFoundError:
-        logger.error("Agent not found")
-        return None
-    # Check safety
-    security = next(iter(API.get_paths_info(model_id, "agent.pt", expand=True))).security
-    if security is None or "safe" not in security:
-        logger.error("Agent safety not available")
-        return None
-    elif not security["safe"]:
-        logger.error("Agent not safe")
-        return None
-    # Load the agent
-    try:
-        agent = torch.jit.load(agent_path)
-    except Exception as e:
-        logger.error(f"Error loading agent: {e}")
-        return None
-    # Evaluate the agent on the environments
-    for env_id in env_ids:
-        episodic_rewards = []
-        env = gym.make(env_id)
-        for _ in range(10):
-            episodic_reward = 0.0
-            observation, info = env.reset()
-            done = False
-            while not done:
-                torch_observation = torch.from_numpy(np.array([observation]))
-                action = agent(torch_observation).numpy()[0]
-                observation, reward, terminated, truncated, info = env.step(action)
-                done = terminated or truncated
-                episodic_reward += reward
-            episodic_rewards.append(episodic_reward)
-        mean_reward = np.mean(episodic_rewards)
-        std_reward = np.std(episodic_rewards)
-        results[env_id] = {"episodic_return_mean": mean_reward, "episodic_reward_std": std_reward}
-        logger.info(f"Environment {env_id}: {mean_reward} ± {std_reward}")
-    return results
 def _backend_routine():
     # List only the text classification models
     rl_models = list(API.list_models(filter="reinforcement-learning"))
@@ -265,7 +182,7 @@ with gr.Blocks(js=dark_mode_gradio_js) as demo:
 scheduler = BackgroundScheduler()
-scheduler.add_job(func=backend_routine, trigger="interval", seconds=5 * 60)
 scheduler.start()

 import glob
 import json
 import os
 import pprint
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from huggingface_hub import snapshot_download
 from src.css_html_js import dark_mode_gradio_js
 from src.envs import API, RESULTS_PATH, RESULTS_REPO, TOKEN
+from src.evaluation import ALL_ENV_IDS, evaluate
 from src.logging import configure_root_logger, setup_logger
 configure_root_logger()
 logger = setup_logger(__name__)
 pp = pprint.PrettyPrinter(width=80)
 def model_hyperlink(link, model_id):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_id}</a>'
     return model_hyperlink(link, model_id)
 def _backend_routine():
     # List only the text classification models
     rl_models = list(API.list_models(filter="reinforcement-learning"))
 scheduler = BackgroundScheduler()
+scheduler.add_job(func=backend_routine, trigger="interval", seconds=0.5 * 60)
 scheduler.start()

src/evaluation.py ADDED Viewed

	@@ -0,0 +1,277 @@

+import fnmatch
+from typing import Dict, SupportsFloat
+import gymnasium as gym
+import numpy as np
+import torch
+from gymnasium import wrappers
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils._errors import EntryNotFoundError
+from src.envs import API
+from src.logging import setup_logger
+logger = setup_logger(__name__)
+ALL_ENV_IDS = [
+    "CartPole-v1",
+    "MountainCar-v0",
+    "Acrobot-v1",
+    "Hopper-v4",
+    "MsPacmanNoFrameskip-v4",
+]
class NoopResetEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]):
    """
    Randomise the starting state by playing a random number of no-ops on reset.

    Action 0 is used as the no-op; the constructor asserts that the wrapped
    environment reports ``"NOOP"`` as the meaning of action 0.

    :param env: Environment to wrap
    :param noop_max: Upper bound (inclusive) on the number of no-ops to play
    """

    def __init__(self, env: gym.Env, noop_max: int = 30) -> None:
        super().__init__(env)
        self.noop_max = noop_max
        # When set, this fixed count is used instead of a random draw.
        self.override_num_noops = None
        self.noop_action = 0
        action_meanings = env.unwrapped.get_action_meanings()  # type: ignore[attr-defined]
        assert action_meanings[0] == "NOOP"

    def reset(self, **kwargs):
        """
        Reset the environment, then step it with the no-op action.

        :param kwargs: Extra keywords forwarded to every ``env.reset()`` call
        :return: observation and info obtained after the last no-op
        """
        self.env.reset(**kwargs)
        if self.override_num_noops is None:
            # Drawn from the unwrapped env's RNG: an integer in [1, noop_max].
            noops = self.unwrapped.np_random.integers(1, self.noop_max + 1)
        else:
            noops = self.override_num_noops
        assert noops > 0

        # Placeholders; always overwritten because at least one no-op is played.
        obs = np.zeros(0)
        info: Dict = {}
        for _ in range(noops):
            obs, _, terminated, truncated, info = self.env.step(self.noop_action)
            episode_over = terminated or truncated
            if episode_over:
                # A no-op ended the episode: start again and keep going.
                obs, info = self.env.reset(**kwargs)
        return obs, info
class FireResetEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]):
    """
    Take action on reset for environments that are fixed until firing.

    The constructor asserts that action 1 is ``"FIRE"`` and that the
    environment exposes at least three actions.

    :param env: Environment to wrap
    """

    def __init__(self, env: gym.Env) -> None:
        super().__init__(env)
        action_meanings = env.unwrapped.get_action_meanings()  # type: ignore[attr-defined]
        assert action_meanings[1] == "FIRE"
        assert len(action_meanings) >= 3

    def reset(self, **kwargs):
        """
        Reset the environment, then play action 1 (FIRE) followed by action 2.

        If either of those steps ends the episode, the environment is reset
        again and the observation/info of that fresh reset are used, so the
        caller never receives an observation from an already finished episode.

        :param kwargs: Extra keywords forwarded to every ``env.reset()`` call
        :return: the observation and info dict of the state the env is left in
        """
        obs, info = self.env.reset(**kwargs)
        for action in (1, 2):
            obs, _, terminated, truncated, info = self.env.step(action)
            if terminated or truncated:
                # Keep the post-reset observation instead of the stale one
                # returned by the step that finished the episode.
                obs, info = self.env.reset(**kwargs)
        return obs, info
class EpisodicLifeEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]):
    """
    Treat the loss of a life as the end of an episode, while only performing
    a real environment reset once the underlying game is actually over.

    :param env: Environment to wrap
    """

    def __init__(self, env: gym.Env) -> None:
        super().__init__(env)
        # Life count seen after the most recent step/reset.
        self.lives = 0
        # Whether the wrapped env itself reported terminated/truncated last step.
        self.was_real_done = True

    def step(self, action: int):
        """
        Step the wrapped environment and flag a lost life as ``terminated``.

        :param action: the action
        :return: observation, reward, terminated, truncated, information
        """
        obs, reward, terminated, truncated, info = self.env.step(action)
        self.was_real_done = terminated or truncated

        remaining = self.env.unwrapped.ale.lives()  # type: ignore[attr-defined]
        # Only a drop to a still-positive life count is turned into a terminal
        # signal; a count of 0 is left for the environment itself to report
        # (some games sit at 0 lives for a few frames before announcing done).
        lost_a_life = remaining < self.lives and remaining > 0
        if lost_a_life:
            terminated = True
        # Stored after the comparison so bonus lives are tracked as well.
        self.lives = remaining
        return obs, reward, terminated, truncated, info

    def reset(self, **kwargs):
        """
        Reset the wrapped environment only when the real episode has ended;
        otherwise advance past the lost-life state with a single no-op step.

        :param kwargs: Extra keywords passed to env.reset() call
        :return: the first observation of the environment and its info dict
        """
        needs_real_reset = self.was_real_done
        if not needs_real_reset:
            # No-op step to move on from the lost-life state.
            obs, _, terminated, truncated, info = self.env.step(0)
            # That step can itself finish the game, in which case a real reset
            # is required before the environment may be stepped again.
            needs_real_reset = terminated or truncated
        if needs_real_reset:
            obs, info = self.env.reset(**kwargs)
        self.lives = self.env.unwrapped.ale.lives()  # type: ignore[attr-defined]
        return obs, info
class MaxAndSkipEnv(gym.Wrapper[np.ndarray, int, np.ndarray, int]):
    """
    Repeat each action ``skip`` times and return the element-wise maximum of
    the observations stored for the last two repeats.

    :param env: Environment to wrap
    :param skip: Number of times the same action is applied per ``step`` call
    """

    def __init__(self, env: gym.Env, skip: int = 4) -> None:
        super().__init__(env)
        space = env.observation_space
        assert space.dtype is not None, "No dtype specified for the observation space"
        assert space.shape is not None, "No shape defined for the observation space"
        # Two-slot buffer holding the raw frames that get max-pooled over time.
        self._obs_buffer = np.zeros((2, *space.shape), dtype=space.dtype)
        self._skip = skip

    def step(self, action: int):
        """
        Apply ``action`` up to ``skip`` times, summing the rewards.

        The loop stops early when the episode ends. The buffer is only written
        on the last two repeats, so after an early stop it may still hold
        frames from previous calls.

        :param action: the action
        :return: observation, reward, terminated, truncated, information
        """
        second_to_last = self._skip - 2
        last = self._skip - 1
        total_reward = 0.0
        terminated = truncated = False
        for repeat in range(self._skip):
            obs, reward, terminated, truncated, info = self.env.step(action)
            if repeat == second_to_last:
                self._obs_buffer[0] = obs
            elif repeat == last:
                self._obs_buffer[1] = obs
            total_reward += float(reward)
            if terminated or truncated:
                break
        # The observation returned on an episode-ending frame is not meaningful.
        pooled = self._obs_buffer.max(axis=0)
        return pooled, total_reward, terminated, truncated, info
class ClipRewardEnv(gym.RewardWrapper):
    """
    Replace every reward by its sign, i.e. one of {+1, 0, -1}.

    :param env: Environment to wrap
    """

    def __init__(self, env: gym.Env) -> None:
        super().__init__(env)

    def reward(self, reward: SupportsFloat) -> float:
        """
        Map a reward to its sign.

        :param reward: the raw reward; it is converted with ``float()`` first
        :return: the result of ``np.sign`` applied to the converted reward
        """
        as_float = float(reward)
        return np.sign(as_float)
def make(env_id):
    """
    Build a zero-argument factory that creates the environment ``env_id``.

    Every environment is wrapped in ``RecordEpisodeStatistics``. When the id
    contains ``"NoFrameskip"``, the Atari wrappers defined in this module and
    the resize / grayscale / frame-stack observation wrappers are added too.

    :param env_id: Gymnasium environment id
    :return: a callable that takes no argument and returns the wrapped env
    """

    def build_env():
        env = wrappers.RecordEpisodeStatistics(gym.make(env_id))
        if "NoFrameskip" not in env_id:
            return env

        # Atari preprocessing stack; the order of the wrappers matters.
        env = NoopResetEnv(env, noop_max=30)
        env = MaxAndSkipEnv(env, skip=4)
        env = EpisodicLifeEnv(env)
        action_meanings = env.unwrapped.get_action_meanings()
        if "FIRE" in action_meanings:
            env = FireResetEnv(env)
        env = ClipRewardEnv(env)
        env = wrappers.ResizeObservation(env, (84, 84))
        env = wrappers.GrayScaleObservation(env)
        return wrappers.FrameStack(env, 4)

    return build_env
def pattern_match(patterns, source_list):
    """
    Return the entries of ``source_list`` matched by at least one pattern.

    Each pattern is interpreted as a shell-style wildcard by
    ``fnmatch.filter``.

    :param patterns: a single pattern, an iterable of patterns, or ``None``
        (treated as "no pattern")
    :param source_list: the candidate strings to match against
    :return: the sorted list of matched candidates, without duplicates
    """
    if patterns is None:
        return []
    if isinstance(patterns, str):
        patterns = [patterns]
    matched = set()
    for pattern in patterns:
        matched.update(fnmatch.filter(source_list, pattern))
    return sorted(matched)
def evaluate(model_id, revision):
    """
    Evaluate the ``agent.pt`` file of a Hub model on the environments listed
    in its tags.

    The environments are the entries of ``ALL_ENV_IDS`` matched by the model
    tags. For each one, the agent is run in 3 synchronous copies of the
    environment until at least 10 episodes have finished.

    :param model_id: Hub repository id of the model
    :param revision: revision of the repository to evaluate; it is used for
        the tags, the download and the safety lookup alike
    :return: a dict mapping each environment id to
        ``{"episodic_return_mean": float, "episodic_reward_std": float}``,
        or ``None`` when the agent file is missing, is not reported as safe,
        or cannot be loaded
    """
    n_envs = 3
    n_episodes = 10

    tags = API.model_info(model_id, revision=revision).tags
    # Extract the environment IDs from the tags (usually only one).
    # `tags or []` guards against a model that has no tags at all.
    env_ids = pattern_match(tags or [], ALL_ENV_IDS)
    logger.info(f"Selected environments: {env_ids}")

    # Check if the agent exists (at the requested revision, not the default branch)
    try:
        agent_path = hf_hub_download(repo_id=model_id, filename="agent.pt", revision=revision)
    except EntryNotFoundError:
        logger.error("Agent not found")
        return None

    # Check safety of that same revision of the file
    paths_info = API.get_paths_info(model_id, "agent.pt", expand=True, revision=revision)
    file_info = next(iter(paths_info), None)  # None when nothing is returned
    security = getattr(file_info, "security", None)
    if security is None or "safe" not in security:
        logger.error("Agent safety not available")
        return None
    if not security["safe"]:
        logger.error("Agent not safe")
        return None

    # Load the agent
    try:
        agent = torch.jit.load(agent_path)
    except Exception as e:
        logger.error(f"Error loading agent: {e}")
        return None

    # Evaluate the agent on the environments
    results = {}
    for env_id in env_ids:
        envs = gym.vector.SyncVectorEnv([make(env_id) for _ in range(n_envs)])
        try:
            observations, _ = envs.reset()
            episodic_returns = []
            while len(episodic_returns) < n_episodes:
                # Inference only: without no_grad, `.numpy()` raises when the
                # agent's output is attached to the autograd graph.
                with torch.no_grad():
                    actions = agent(torch.tensor(observations)).numpy()
                observations, _, _, _, infos = envs.step(actions)
                if "final_info" not in infos:
                    continue
                for info in infos["final_info"]:
                    # Entries are None for sub-environments that did not finish.
                    if info is None or "episode" not in info:
                        continue
                    episodic_returns.append(info["episode"]["r"])
        finally:
            # Always release the sub-environments, even if the agent raised.
            envs.close()

        mean_reward = float(np.mean(episodic_returns))
        std_reward = float(np.std(episodic_returns))
        results[env_id] = {"episodic_return_mean": mean_reward, "episodic_reward_std": std_reward}
        logger.info(f"Environment {env_id}: {mean_reward} ± {std_reward}")
    return results