asuzuki committed
Commit fe11896
1 Parent(s): 15477d2
Files changed (1)
  1. README.md +76 -2
README.md CHANGED
@@ -30,8 +30,82 @@ TODO: Add your code
 
 
 ```python
-from stable_baselines3 import ...
-from huggingface_sb3 import load_from_hub
+# Install dependencies
+!apt install python-opengl
+!apt install ffmpeg
+!apt install xvfb
+!pip3 install pyvirtualdisplay
+!pip install -r https://raw.githubusercontent.com/huggingface/deep-rl-class/main/notebooks/unit6/requirements-unit6.txt
 
+# Virtual display
+from pyvirtualdisplay import Display
+virtual_display = Display(visible=0, size=(1400, 900))
+virtual_display.start()
+
+# Imports
+import pybullet_envs
+import panda_gym
+import gym
+
+import os
+
+from huggingface_sb3 import load_from_hub, package_to_hub
+
+from stable_baselines3 import A2C
+from stable_baselines3.common.evaluation import evaluate_policy
+from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize
+from stable_baselines3.common.env_util import make_vec_env
+
+from huggingface_hub import notebook_login
+
+# Define the environment called "PandaReachDense-v2"
+env_id = "PandaReachDense-v2"
+
+# Make a vectorized environment
+env = make_vec_env(env_id, n_envs=4)
+
+# Add a wrapper to normalize the observations and rewards (see the VecNormalize documentation)
+env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10)
+
+# Create the A2C model (verbose=1 prints the training logs)
+model = A2C(policy="MultiInputPolicy",
+            env=env,
+            gae_lambda=0.9,
+            gamma=0.95,
+            learning_rate=0.001,
+            max_grad_norm=0.5,
+            n_steps=8,
+            vf_coef=0.4,
+            ent_coef=0.0,
+            seed=11,
+            policy_kwargs=dict(
+                log_std_init=-2, ortho_init=False),
+            normalize_advantage=False,
+            use_rms_prop=True,
+            use_sde=True,
+            verbose=1)
+
+# Train it for 1.5M timesteps
+model.learn(1_500_000)
+
+# Save the model and the VecNormalize statistics
+model.save(f"a2c-{env_id}")
+env.save(f"vec_normalize_{env_id}.pkl")
+
+# Evaluate the agent
+eval_env = DummyVecEnv([lambda: gym.make(env_id)])
+eval_env = VecNormalize.load(f"vec_normalize_{env_id}.pkl", eval_env)
+
+# Do not update the normalization statistics at test time
+eval_env.training = False
+# Reward normalization is not needed at test time
+eval_env.norm_reward = False
+
+# Load the model
+model = A2C.load(f"a2c-{env_id}")
+
+# Evaluate the model
+mean_reward, std_reward = evaluate_policy(model, eval_env)
+print(f"Mean reward = {mean_reward:.2f} +/- {std_reward:.2f}")
 ...
 ```
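
The snippet above imports `notebook_login`, `package_to_hub`, and `load_from_hub` but never calls them. Below is a minimal sketch of how they are typically used to publish and later reload this agent; it assumes the standard `huggingface_sb3` API, reuses `model`, `env_id`, and `eval_env` from the code above, and the `repo_id`, `commit_message`, and `filename` values are placeholders rather than anything taken from this commit.

```python
from stable_baselines3 import A2C
from huggingface_hub import notebook_login
from huggingface_sb3 import load_from_hub, package_to_hub

# Log in to the Hugging Face Hub (prompts for an access token in a notebook).
notebook_login()

# Package and push the trained agent, its evaluation results, and a replay video.
# "your-username" is a placeholder; replace it with your Hub username.
package_to_hub(
    model=model,
    model_name=f"a2c-{env_id}",
    model_architecture="A2C",
    env_id=env_id,
    eval_env=eval_env,
    repo_id=f"your-username/a2c-{env_id}",
    commit_message="Initial commit",
)

# Later, download the checkpoint back from the Hub; load_from_hub returns
# a local file path that A2C.load can read.
checkpoint = load_from_hub(
    repo_id=f"your-username/a2c-{env_id}",
    filename=f"a2c-{env_id}.zip",
)
model = A2C.load(checkpoint)
```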