dlwlgus53 committed
Commit 888331d
1 Parent(s): b6d5610

Upload folder using huggingface_hub

.vscode/settings.json ADDED
@@ -0,0 +1,7 @@
+ {
+     "workbench.colorCustomizations": {
+         "activityBar.background": "#561625",
+         "titleBar.activeBackground": "#781F34",
+         "titleBar.activeForeground": "#FEFBFC"
+     }
+ }
cartpole-v1/README.md ADDED
@@ -0,0 +1,27 @@
+ ---
+ tags:
+ - CartPole-v1
+ - reinforce
+ - reinforcement-learning
+ - custom-implementation
+ - deep-rl-class
+ model-index:
+ - name: Reinforce_cartpol-v1
+   results:
+   - task:
+       type: reinforcement-learning
+       name: reinforcement-learning
+     dataset:
+       name: CartPole-v1
+       type: CartPole-v1
+     metrics:
+     - type: mean_reward
+       value: 0.19 +/- 0.03
+       name: mean_reward
+       verified: false
+ ---
+ 
+ # **Reinforce** Agent playing **CartPole-v1**
+ This is a trained model of a **Reinforce** agent playing **CartPole-v1**.
+ To learn to use this model and train yours, check Unit 4 of the Deep Reinforcement Learning Course: https://huggingface.co/deep-rl-course/unit4/introduction
+ 
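As a quick usage sketch (an illustration, assuming the `Policy` architecture from `policy_gradient.py` in this commit, a gym release with the 5-tuple `step` API, and a checkpoint produced by running the training script directly): because `push_to_hub` pickles the entire module with `torch.save(model, ...)`, the class must be re-declared in the loading script before `torch.load` can unpickle it.

```python
# Hypothetical loading sketch; not part of this commit.
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
from huggingface_hub import hf_hub_download

device = torch.device("cpu")


class Policy(nn.Module):  # same architecture as policy_gradient.py (h_size=16)
    def __init__(self, s_size, a_size, h_size):
        super().__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, a_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return F.softmax(self.fc2(x), dim=1)

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state).cpu()[0]
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)


# The whole module was pickled, so weights_only must stay False.
checkpoint = hf_hub_download(repo_id="dlwlgus53/Reinforce_cartpol-v1", filename="model.pt")
policy = torch.load(checkpoint, map_location="cpu", weights_only=False)

env = gym.make("CartPole-v1")
state = env.reset()[0]
done, episode_return = False, 0.0
while not done:
    action, _ = policy.act(state)
    state, reward, terminated, truncated, _ = env.step(action)
    episode_return += reward
    done = terminated or truncated
print("episode return:", episode_return)
```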
cartpole-v1/hyperparameters.json ADDED
@@ -0,0 +1 @@
+ {"h_size": 16, "n_training_episodes": 1000, "n_evaluation_episodes": 10, "max_t": 1000, "gamma": 1.0, "lr": 0.01, "env_id": "CartPole-v1", "state_space": 4, "action_space": 2}
cartpole-v1/model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee315b44d3dbc05cd7ba1d04fc0b2b0c49605d506c21e34be0e3239656b0aaf9
+ size 3264
cartpole-v1/replay.mp4 ADDED
Binary file (2.02 kB).
 
cartpole-v1/results.json ADDED
@@ -0,0 +1 @@
+ {"env_id": "CartPole-v1", "mean_reward": 0.1875143900513649, "n_evaluation_episodes": 10, "eval_datetime": "2024-03-09T00:13:58.880496"}
example1.py ADDED
@@ -0,0 +1,4 @@
+ from pyvirtualdisplay import Display
+
+ virtual_display = Display(visible=0, size=(1400, 900))
+ virtual_display.start()
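The snippet above only starts a virtual X display; its purpose, as in `record_video` in `policy_gradient.py`, is to let `env.render()` return RGB frames on a headless machine. A hypothetical sketch of the two pieces together (the output file name and episode length here are illustrative, not taken from the repo):

```python
# Illustration only: render random-agent frames under the virtual display.
from pyvirtualdisplay import Display

import gym
import imageio
import numpy as np

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

env = gym.make("CartPole-v1", render_mode="rgb_array")
env.reset()
frames = [env.render()]
for _ in range(50):
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    frames.append(env.render())
    if terminated or truncated:
        break

imageio.mimsave("random_replay.mp4", [np.array(f) for f in frames], fps=30)
```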
hyperparameters.json ADDED
@@ -0,0 +1 @@
+ {"h_size": 16, "n_training_episodes": 1000, "n_evaluation_episodes": 10, "max_t": 1000, "gamma": 1.0, "lr": 0.01, "env_id": "CartPole-v1", "state_space": 4, "action_space": 2}
model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1b290c70b9fb42b69640d1b651339c462d74379e36a1ef938e6119f30c2d0747
+ size 3264
policy_gradient.py ADDED
@@ -0,0 +1,383 @@
+ import numpy as np
+
+ from collections import deque
+
+ import matplotlib.pyplot as plt
+ import pdb
+ # PyTorch
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.optim as optim
+ from torch.distributions import Categorical
+
+ # Gym
+ import gym
+ import gym_pygame
+
+ # Hugging Face Hub
+ from huggingface_hub import notebook_login  # To log in to our Hugging Face account so we can upload models to the Hub.
+ import imageio
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ print(device)
+
+ env_id = "CartPole-v1"
+ # Create the env
+ env = gym.make(env_id, render_mode="rgb_array")
+
+ # Create the evaluation env
+ eval_env = gym.make(env_id)
+
+ # Get the state space and action space
+ s_size = env.observation_space.shape[0]
+ a_size = env.action_space.n
+
+ print("_____OBSERVATION SPACE_____ \n")
+ print("The State Space is: ", s_size)
+ # cart position, cart velocity, pole angle, pole velocity at tip
+ print("Sample observation", env.observation_space.sample())  # Get a random observation
+
+ print("\n _____ACTION SPACE_____ \n")
+ print("The Action Space is: ", a_size)
+ print("Action Space Sample", env.action_space.sample())  # Take a random action
+
+
+ # Policy Gradient Network
+
+ class Policy(nn.Module):
+     def __init__(self, s_size, a_size, h_size):
+         super(Policy, self).__init__()
+         self.fc1 = nn.Linear(s_size, h_size)
+         self.fc2 = nn.Linear(h_size, a_size)
+
+     def forward(self, x):
+         x = F.relu(self.fc1(x))
+         x = self.fc2(x)
+         return F.softmax(x, dim=1)
+
+     def act(self, state):
+         state = torch.from_numpy(state).float().unsqueeze(0).to(device)
+         probs = self.forward(state).cpu()[0]
+         m = Categorical(probs)
+         action = m.sample()
+
+         return action.item(), m.log_prob(action)
+
+
+ def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
+     # Helps us to calculate the score during training
+     scores_deque = deque(maxlen=100)
+     scores = []
+     # Line 3 of pseudocode
+     for i_episode in range(1, n_training_episodes + 1):
+         saved_log_probs = []
+         rewards = []
+         state = env.reset()[0]
+         # Line 4 of pseudocode
+         for t in range(max_t):
+             action, log_prob = policy.act(state)
+             saved_log_probs.append(log_prob)
+             # gym >= 0.26 returns (obs, reward, terminated, truncated, info)
+             state, reward, terminated, truncated, _ = env.step(action)
+             done = terminated or truncated
+             rewards.append(reward)
+             if done:
+                 break
+
+         scores_deque.append(sum(rewards))
+         scores.append(sum(rewards))
+
+         # Line 6 of pseudocode: calculate the return
+         returns = deque(maxlen=max_t)
+         n_steps = len(rewards)
+         # Compute the discounted returns at each timestep,
+         # as the sum of the gamma-discounted return at time t (G_t) + the reward at time t,
+         # in O(N) time, where N is the number of time steps
+         # (this definition of the discounted return G_t follows the definition of this quantity
+         # shown at page 44 of Sutton & Barto 2017, 2nd draft)
+         # G_t = r_(t+1) + r_(t+2) + ...
+
+         # Given this formulation, the returns at each timestep t can be computed
+         # by re-using the computed future returns G_(t+1) to compute the current return G_t:
+         # G_t = r_(t+1) + gamma*G_(t+1)
+         # G_(t-1) = r_t + gamma*G_t
+         # (this follows a dynamic programming approach, with which we memoize solutions in order
+         # to avoid computing them multiple times)
+
+         # This is correct since the above is equivalent to (see also page 46 of Sutton & Barto 2017, 2nd draft)
+         # G_(t-1) = r_t + gamma*r_(t+1) + gamma*gamma*r_(t+2) + ...
+
+         ## Given the above, we calculate the returns at timestep t as:
+         #     gamma[t] * return[t] + reward[t]
+         #
+         ## We compute this starting from the last timestep to the first, in order
+         ## to employ the formula presented above and avoid redundant computations that would be needed
+         ## if we were to do it from first to last.
+
+         ## Hence, the queue "returns" will hold the returns in chronological order, from t=0 to t=n_steps,
+         ## thanks to the appendleft() function which allows appending to position 0 in constant time O(1);
+         ## a normal Python list would instead require O(N) to do this.
+         for t in range(n_steps)[::-1]:
+             disc_return_t = returns[0] if len(returns) > 0 else 0
+             returns.appendleft(gamma * disc_return_t + rewards[t])
+
+         ## standardization of the returns is employed to make training more stable
+         eps = np.finfo(np.float32).eps.item()
+         ## eps is the smallest representable float, which is
+         ## added to the standard deviation of the returns to avoid numerical instabilities
+         returns = torch.tensor(returns)
+         if len(returns) > 1:
+             returns = (returns - returns.mean()) / (returns.std() + eps)
+
+         # Line 7:
+         policy_loss = []
+         for log_prob, disc_return in zip(saved_log_probs, returns):
+             policy_loss.append(-log_prob * disc_return)
+         if len(policy_loss) > 1:
+             policy_loss = torch.cat(policy_loss).sum()
+         else:
+             policy_loss = policy_loss[0]
+
+         # Line 8: PyTorch prefers gradient descent
+         optimizer.zero_grad()
+         policy_loss.backward()
+         optimizer.step()
+
+         if i_episode % print_every == 0:
+             print("Episode {}\tAverage Score: {:.2f}".format(i_episode, np.mean(scores_deque)))
+
+     return scores
+
+ cartpole_hyperparameters = {
+     "h_size": 16,
+     "n_training_episodes": 1000,
+     "n_evaluation_episodes": 10,
+     "max_t": 1000,
+     "gamma": 1.0,
+     "lr": 1e-2,
+     "env_id": env_id,
+     "state_space": s_size,
+     "action_space": a_size,
+ }
+
+ # Create the policy and place it on the device
+ cartpole_policy = Policy(
+     cartpole_hyperparameters["state_space"],
+     cartpole_hyperparameters["action_space"],
+     cartpole_hyperparameters["h_size"],
+ ).to(device)
+ cartpole_optimizer = optim.Adam(cartpole_policy.parameters(), lr=cartpole_hyperparameters["lr"])
+
+ scores = reinforce(
+     cartpole_policy,
+     cartpole_optimizer,
+     cartpole_hyperparameters["n_training_episodes"],
+     cartpole_hyperparameters["max_t"],
+     cartpole_hyperparameters["gamma"],
+     100,
+ )
+
+
+ def evaluate_agent(env, max_steps, n_eval_episodes, policy):
+     """
+     Evaluate the agent for ``n_eval_episodes`` episodes and return the average reward and the std of the reward.
+     :param env: The evaluation environment
+     :param max_steps: Maximum number of steps per episode
+     :param n_eval_episodes: Number of episodes to evaluate the agent
+     :param policy: The Reinforce agent
+     """
+     episode_rewards = []
+     for episode in range(n_eval_episodes):
+         state = env.reset()[0]
+         step = 0
+         done = False
+         total_rewards_ep = 0
+
+         for step in range(max_steps):
+             action, _ = policy.act(state)
+             new_state, reward, terminated, truncated, info = env.step(action)
+             done = terminated or truncated
+             total_rewards_ep += reward
+
+             if done:
+                 break
+             state = new_state
+         episode_rewards.append(total_rewards_ep)
+     mean_reward = np.mean(episode_rewards)
+     std_reward = np.std(episode_rewards)
+
+     return mean_reward, std_reward
+
+
+ evaluate_agent(
+     eval_env, cartpole_hyperparameters["max_t"], cartpole_hyperparameters["n_evaluation_episodes"], cartpole_policy
+ )
+
+
+ from huggingface_hub import HfApi, snapshot_download
+ from huggingface_hub.repocard import metadata_eval_result, metadata_save
+
+ from pathlib import Path
+ import datetime
+ import json
+ import imageio
+
+ import tempfile
+
+ import os
+
+
+ def record_video(env, policy, out_directory, fps=30):
+     """
+     Generate a replay video of the agent.
+     :param env: the environment to record
+     :param policy: the policy of our agent
+     :param out_directory: path of the output video file
+     :param fps: how many frames per second (with taxi-v3 and frozenlake-v1 we use 1)
+     """
+     images = []
+     done = False
+     state = env.reset()[0]
+     img = env.render()
+     images.append(img)
+     while not done:
+         # Sample an action from the policy given the current state
+         action, _ = policy.act(state)
+         state, reward, terminated, truncated, info = env.step(action)  # We directly put next_state = state for recording logic
+         done = terminated or truncated
+         img = env.render()
+         images.append(img)
+     imageio.mimsave(out_directory, [np.array(img) for i, img in enumerate(images)], fps=fps)
+
+
+ def push_to_hub(repo_id,
+                 model,
+                 hyperparameters,
+                 eval_env,
+                 video_fps=30
+                 ):
+     """
+     Evaluate, generate a video, and upload a model to the Hugging Face Hub.
+     This method does the complete pipeline:
+     - It evaluates the model
+     - It generates the model card
+     - It generates a replay video of the agent
+     - It pushes everything to the Hub
+
+     :param repo_id: id of the model repository on the Hugging Face Hub
+     :param model: the pytorch model we want to save
+     :param hyperparameters: training hyperparameters
+     :param eval_env: evaluation environment
+     :param video_fps: how many frames per second to record our video replay
+     """
+
+     _, repo_name = repo_id.split("/")
+     api = HfApi()
+
+     # Step 1: Create the repo
+     repo_url = api.create_repo(
+         repo_id=repo_id,
+         exist_ok=True,
+     )
+
+     local_dir = "./cartpole-v1"
+
+     # Step 2: Save the model
+     torch.save(model, os.path.join(local_dir, "model.pt"))
+
+     # Step 3: Save the hyperparameters to JSON
+     hyper_path = os.path.join(local_dir, "hyperparameters.json")
+     with open(hyper_path, "w") as outfile:
+         json.dump(hyperparameters, outfile)
+
+     # Step 4: Evaluate the model and build JSON
+     mean_reward, std_reward = evaluate_agent(eval_env,
+                                              hyperparameters["max_t"],
+                                              hyperparameters["n_evaluation_episodes"],
+                                              model)
+     # Get datetime
+     eval_datetime = datetime.datetime.now()
+     eval_form_datetime = eval_datetime.isoformat()
+
+     evaluate_data = {
+         "env_id": hyperparameters["env_id"],
+         "mean_reward": mean_reward,
+         "n_evaluation_episodes": hyperparameters["n_evaluation_episodes"],
+         "eval_datetime": eval_form_datetime,
+     }
+
+     # Write a JSON file
+     result_path = os.path.join(local_dir, "results.json")
+     with open(result_path, "w") as outfile:
+         json.dump(evaluate_data, outfile)
+
+     # Step 5: Create the model card
+     env_name = hyperparameters["env_id"]
+
+     metadata = {}
+     metadata["tags"] = [
+         env_name,
+         "reinforce",
+         "reinforcement-learning",
+         "custom-implementation",
+         "deep-rl-class",
+     ]
+
+     # Add metrics
+     eval = metadata_eval_result(
+         model_pretty_name=repo_name,
+         task_pretty_name="reinforcement-learning",
+         task_id="reinforcement-learning",
+         metrics_pretty_name="mean_reward",
+         metrics_id="mean_reward",
+         metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
+         dataset_pretty_name=env_name,
+         dataset_id=env_name,
+     )
+
+     # Merge both dictionaries
+     metadata = {**metadata, **eval}
+
+     model_card = f"""
+ # **Reinforce** Agent playing **{env_id}**
+ This is a trained model of a **Reinforce** agent playing **{env_id}**.
+ To learn to use this model and train yours, check Unit 4 of the Deep Reinforcement Learning Course: https://huggingface.co/deep-rl-course/unit4/introduction
+ """
+     readme_path = Path(local_dir) / "README.md"
+
+     readme = ""
+     if readme_path.exists():
+         with readme_path.open("r", encoding="utf8") as f:
+             readme = f.read()
+     else:
+         readme = model_card
+
+     with readme_path.open("w", encoding="utf-8") as f:
+         f.write(readme)
+
+     # Save our metrics to the README metadata
+     metadata_save(readme_path, metadata)
+
+     # Step 6: Record a video
+     video_path = os.path.join(local_dir, "replay.mp4")
+     record_video(env, model, video_path, video_fps)
+
+     # Step 7: Push everything to the Hub
+     api.upload_folder(
+         repo_id=repo_id,
+         folder_path="./",
+         path_in_repo=".",
+     )
+
+     print(f"Your model is pushed to the Hub. You can view your model here: {repo_url}")
+
+
+ repo_id = "dlwlgus53/Reinforce_cartpol-v1"
+ push_to_hub(
+     repo_id,
+     cartpole_policy,  # The model we want to save
+     cartpole_hyperparameters,  # Hyperparameters
+     eval_env,  # Evaluation environment
+     video_fps=30,
+ )
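The comment block inside `reinforce()` above argues that building the returns backwards with `G_t = r_t + gamma * G_(t+1)` reproduces the full discounted sums in O(N). A tiny standalone check of that recursion against the direct definition, using made-up rewards rather than anything from this training run:

```python
# Standalone check of the backward return recursion used in reinforce()
# (illustrative numbers; not taken from the actual training run).
from collections import deque

rewards = [1.0, 1.0, 1.0, 1.0]  # hypothetical episode rewards
gamma = 0.9

# Backward pass, as in reinforce(): G_t = r_t + gamma * G_{t+1}
returns = deque()
for t in reversed(range(len(rewards))):
    disc_return_t = returns[0] if len(returns) > 0 else 0.0
    returns.appendleft(rewards[t] + gamma * disc_return_t)

# Direct definition: G_t = sum_k gamma^k * r_{t+k}
direct = [
    sum(gamma**k * r for k, r in enumerate(rewards[t:]))
    for t in range(len(rewards))
]

print(list(returns))  # approximately [3.439, 2.71, 1.9, 1.0]
print(direct)         # same values, confirming the recursion
```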
ppo.py ADDED
@@ -0,0 +1,194 @@
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """
+ python examples/scripts/ppo.py \
+     --log_with=wandb
+ """
+ from dataclasses import dataclass, field
+ from typing import Optional
+
+ import torch
+ from accelerate import Accelerator
+ from datasets import load_dataset
+ from peft import LoraConfig
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, HfArgumentParser, pipeline
+
+ from trl import AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead, PPOConfig, PPOTrainer, set_seed
+ from trl.core import LengthSampler
+ from trl.import_utils import is_npu_available, is_xpu_available
+
+
+ tqdm.pandas()
+
+
+ @dataclass
+ class ScriptArguments:
+     use_seq2seq: bool = field(default=False, metadata={"help": "whether to use seq2seq"})
+     trust_remote_code: bool = field(default=False, metadata={"help": "Enable `trust_remote_code`"})
+
+     # LoraConfig
+     use_peft: bool = field(default=False, metadata={"help": "whether to use peft"})
+     lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
+     lora_r: Optional[int] = field(default=16, metadata={"help": "the lora r parameter"})
+
+
+ parser = HfArgumentParser((ScriptArguments, PPOConfig))
+ args, ppo_config = parser.parse_args_into_dataclasses()
+
+ # We then define the arguments to pass to the sentiment analysis pipeline.
+ # We set `return_all_scores` to True to get the sentiment score for each token.
+ sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}
+
+ trl_model_class = AutoModelForCausalLMWithValueHead if not args.use_seq2seq else AutoModelForSeq2SeqLMWithValueHead
+
+
+ # Below is an example function to build the dataset. In our case, we use the IMDB dataset
+ # from the `datasets` library. One should customize this function to train the model on
+ # its own dataset.
+ def build_dataset(config, query_dataset, input_min_text_length=2, input_max_text_length=8):
+     """
+     Build dataset for training. This builds the dataset from `load_dataset`, one should
+     customize this function to train the model on its own dataset.
+
+     Args:
+         query_dataset (`str`):
+             The name of the dataset to be loaded.
+
+     Returns:
+         dataloader (`torch.utils.data.DataLoader`):
+             The dataloader for the dataset.
+     """
+     tokenizer = AutoTokenizer.from_pretrained(config.model_name)
+     tokenizer.pad_token = tokenizer.eos_token
+     # load imdb with datasets
+     ds = load_dataset(query_dataset, split="train")
+     ds = ds.rename_columns({"text": "review"})
+     ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)
+
+     input_size = LengthSampler(input_min_text_length, input_max_text_length)
+
+     def tokenize(sample):
+         sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
+         sample["query"] = tokenizer.decode(sample["input_ids"])
+         return sample
+
+     ds = ds.map(tokenize, batched=False)
+     ds.set_format(type="torch")
+     return ds
+
+
+ # We retrieve the dataloader by calling the `build_dataset` function.
+ dataset = build_dataset(ppo_config, ppo_config.query_dataset)
+
+
+ def collator(data):
+     return {key: [d[key] for d in data] for key in data[0]}
+
+
+ # set seed before initializing value head for deterministic eval
+ set_seed(ppo_config.seed)
+
+ # Now let's build the model, the reference model, and the tokenizer.
+ if not args.use_peft:
+     ref_model = trl_model_class.from_pretrained(ppo_config.model_name, trust_remote_code=args.trust_remote_code)
+     device_map = None
+     peft_config = None
+ else:
+     peft_config = LoraConfig(
+         r=args.lora_r,
+         lora_alpha=args.lora_alpha,
+         bias="none",
+         task_type="CAUSAL_LM",
+     )
+     ref_model = None
+     # Copy the model to each device
+     device_map = {"": Accelerator().local_process_index}
+
+ model = trl_model_class.from_pretrained(
+     ppo_config.model_name,
+     trust_remote_code=args.trust_remote_code,
+     device_map=device_map,
+     peft_config=peft_config,
+ )
+
+
+ tokenizer = AutoTokenizer.from_pretrained(ppo_config.model_name)
+
+ # Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
+ tokenizer.pad_token_id = tokenizer.eos_token_id
+
+ # We then build the PPOTrainer, passing the model, the reference model, the tokenizer
+ ppo_trainer = PPOTrainer(ppo_config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)
+
+ # We then build the sentiment analysis pipeline, passing the model name and the
+ # sentiment analysis pipeline arguments. Let's also make sure to set the device
+ # to the same device as the PPOTrainer.
+ device = ppo_trainer.accelerator.device
+ if ppo_trainer.accelerator.num_processes == 1:
+     if is_xpu_available():
+         device = "xpu:0"
+     elif is_npu_available():
+         device = "npu:0"
+     else:
+         device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug
+ ds_plugin = ppo_trainer.accelerator.state.deepspeed_plugin
+ task, model_name = ppo_config.reward_model.split(":")
+ if ds_plugin is not None and ds_plugin.is_zero3_init_enabled():
+     with ds_plugin.zero3_init_context_manager(enable=False):
+         sentiment_pipe = pipeline(task, model=model_name, device=device)
+ else:
+     sentiment_pipe = pipeline(task, model=model_name, device=device)
+
+ # Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
+ if sentiment_pipe.tokenizer.pad_token_id is None:
+     sentiment_pipe.tokenizer.pad_token_id = tokenizer.pad_token_id
+
+ if sentiment_pipe.model.config.pad_token_id is None:
+     sentiment_pipe.model.config.pad_token_id = tokenizer.pad_token_id
+
+ # We then define the arguments to pass to the `generate` function. These arguments
+ # are passed to the `generate` function of the PPOTrainer, which is a wrapper around
+ # the `generate` function of the trained model.
+ generation_kwargs = {
+     "min_length": -1,
+     "top_k": 0.0,
+     "top_p": 1.0,
+     "do_sample": True,
+     "pad_token_id": tokenizer.eos_token_id,
+     "max_new_tokens": 32,
+ }
+
+ for _epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
+     query_tensors = batch["input_ids"]
+
+     # Get response from gpt2
+     response_tensors, ref_response_tensors = ppo_trainer.generate(
+         query_tensors, return_prompt=False, generate_ref_response=True, **generation_kwargs
+     )
+     batch["response"] = tokenizer.batch_decode(response_tensors)
+     batch["ref_response"] = tokenizer.batch_decode(ref_response_tensors)
+
+     # Compute sentiment score
+     texts = [q + r for q, r in zip(batch["query"], batch["response"])]
+     pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
+     rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
+     ref_texts = [q + r for q, r in zip(batch["query"], batch["ref_response"])]
+     ref_pipe_outputs = sentiment_pipe(ref_texts, **sent_kwargs)
+     ref_rewards = [torch.tensor(output[1]["score"]) for output in ref_pipe_outputs]
+     batch["ref_rewards"] = ref_rewards
+
+     # Run PPO step
+     stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
+     ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "ref_response", "ref_rewards"])
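A note on the reward line `rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]`: with `return_all_scores=True` and `function_to_apply="none"`, the text-classification pipeline returns one `{label, score}` dict per class for each text, and index 1 is taken as the scalar PPO reward. A hedged illustration, assuming a two-class sentiment reward model such as the `lvwerra/distilbert-imdb` default used by TRL's PPO example (the model name, example texts, and printed values are assumptions, not outputs from this script):

```python
# Illustrative only; assumes a two-class sentiment-analysis reward model.
import torch
from transformers import pipeline

sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb")
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

texts = ["This movie was surprisingly good!", "A complete waste of time."]
pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
# pipe_outputs is shaped roughly like:
# [[{"label": "NEGATIVE", "score": -2.3}, {"label": "POSITIVE", "score": 2.5}],
#  [{"label": "NEGATIVE", "score":  2.8}, {"label": "POSITIVE", "score": -2.9}]]

# output[1]["score"] therefore picks the raw logit of the second label
# (POSITIVE for this model), which ppo.py feeds directly to ppo_trainer.step().
rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
print(rewards)
```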