cyrodw committed
Commit 28f359e · 1 Parent(s): f4fd5ac

Upload folder using huggingface_hub

.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
+ # Default ignored files
+ /shelf/
+ /workspace.xml
.idea/PPO.iml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$" />
+     <orderEntry type="inheritedJdk" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
.idea/inspectionProfiles/Project_Default.xml ADDED
@@ -0,0 +1,12 @@
+ <component name="InspectionProjectProfileManager">
+   <profile version="1.0">
+     <option name="myName" value="Project Default" />
+     <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
+       <option name="ignoredErrors">
+         <list>
+           <option value="N802" />
+         </list>
+       </option>
+     </inspection_tool>
+   </profile>
+ </component>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectRootManager" version="2" project-jdk-name="py39" project-jdk-type="Python SDK" />
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/PPO.iml" filepath="$PROJECT_DIR$/.idea/PPO.iml" />
+     </modules>
+   </component>
+ </project>
.idea/workspace.xml ADDED
@@ -0,0 +1,61 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="AutoImportSettings">
+     <option name="autoReloadType" value="SELECTIVE" />
+   </component>
+   <component name="ChangeListManager">
+     <list default="true" id="82c11718-8476-4ed5-8f5a-34544b12ac29" name="Changes" comment="" />
+     <option name="SHOW_DIALOG" value="false" />
+     <option name="HIGHLIGHT_CONFLICTS" value="true" />
+     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+     <option name="LAST_RESOLUTION" value="IGNORE" />
+   </component>
+   <component name="MarkdownSettingsMigration">
+     <option name="stateVersion" value="1" />
+   </component>
+   <component name="ProjectId" id="2Poxmc2HHyhQ85i4TLtzuLFsOBK" />
+   <component name="ProjectViewState">
+     <option name="hideEmptyMiddlePackages" value="true" />
+     <option name="showLibraryContents" value="true" />
+   </component>
+   <component name="PropertiesComponent">{
+   &quot;keyToString&quot;: {
+     &quot;RunOnceActivity.OpenProjectViewOnStart&quot;: &quot;true&quot;,
+     &quot;RunOnceActivity.ShowReadmeOnStart&quot;: &quot;true&quot;
+   }
+ }</component>
+   <component name="RunManager">
+     <configuration name="main" type="PythonConfigurationType" factoryName="Python" nameIsGenerated="true">
+       <module name="PPO" />
+       <option name="INTERPRETER_OPTIONS" value="" />
+       <option name="PARENT_ENVS" value="true" />
+       <envs>
+         <env name="PYTHONUNBUFFERED" value="1" />
+       </envs>
+       <option name="SDK_HOME" value="" />
+       <option name="WORKING_DIRECTORY" value="$PROJECT_DIR$" />
+       <option name="IS_MODULE_SDK" value="true" />
+       <option name="ADD_CONTENT_ROOTS" value="true" />
+       <option name="ADD_SOURCE_ROOTS" value="true" />
+       <option name="SCRIPT_NAME" value="$PROJECT_DIR$/main.py" />
+       <option name="PARAMETERS" value="" />
+       <option name="SHOW_COMMAND_LINE" value="false" />
+       <option name="EMULATE_TERMINAL" value="false" />
+       <option name="MODULE_MODE" value="false" />
+       <option name="REDIRECT_INPUT" value="false" />
+       <option name="INPUT_FILE" value="" />
+       <method v="2" />
+     </configuration>
+   </component>
+   <component name="SpellCheckerSettings" RuntimeDictionaries="0" Folders="0" CustomDictionaries="0" DefaultDictionary="application-level" UseSingleDictionary="true" transferred="true" />
+   <component name="TaskManager">
+     <task active="true" id="Default" summary="Default task">
+       <changelist id="82c11718-8476-4ed5-8f5a-34544b12ac29" name="Changes" comment="" />
+       <created>1684137395396</created>
+       <option name="number" value="Default" />
+       <option name="presentableId" value="Default" />
+       <updated>1684137395396</updated>
+     </task>
+     <servers />
+   </component>
+ </project>
README.md CHANGED
@@ -16,7 +16,7 @@ model-index:
        type: Pixelcopter-PLE-v0
      metrics:
      - type: mean_reward
-       value: 9.20 +/- 9.04
+       value: 0.00 +/- 0.00
        name: mean_reward
        verified: false
  ---
hyperparameters.json CHANGED
@@ -1 +1 @@
- {"h_size": 64, "n_training_episodes": 50000, "n_evaluation_episodes": 10, "max_t": 10000, "gamma": 0.99, "lr": 0.0001, "env_id": "Pixelcopter-PLE-v0", "state_space": 7, "action_space": 2}
+ {"h_size": 64, "n_training_episodes": 1000, "n_evaluation_episodes": 10, "max_t": 10000, "gamma": 0.99, "lr": 0.0001, "env_id": "Pixelcopter-PLE-v0", "state_space": 7, "action_space": 2}
main.py ADDED
@@ -0,0 +1,265 @@
+ import json
+ import datetime
+ import tempfile
+
+ import numpy as np
+
+ from collections import deque
+
+ # PyTorch
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.optim as optim
+ from huggingface_hub import metadata_eval_result, HfApi, metadata_save
+ from torch.distributions import Categorical
+
+ # Gym
+ import gym
+ import gym_pygame
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ print(device)
+
+ env_id = "Pixelcopter-PLE-v0"
+ env = gym.make(env_id)
+ eval_env = gym.make(env_id)
+ s_size = env.observation_space.shape[0]
+ a_size = env.action_space.n
+
+
+ class Policy(nn.Module):
+     def __init__(self, s_size, a_size, h_size):
+         super(Policy, self).__init__()
+         self.fc1 = nn.Linear(s_size, h_size)
+         self.fc2 = nn.Linear(h_size, h_size * 2)
+         self.fc3 = nn.Linear(h_size * 2, a_size)
+
+     def forward(self, x):
+         x = F.relu(self.fc1(x))
+         x = F.relu(self.fc2(x))
+         x = self.fc3(x)
+         return F.softmax(x, dim=1)
+
+     def act(self, state):
+         state = torch.from_numpy(state).float().unsqueeze(0).to(device)
+         probs = self.forward(state).cpu()
+         m = Categorical(probs)
+         action = m.sample()
+         return action.item(), m.log_prob(action)
+
+
+ def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
+     # Helps us calculate the score during training
+     scores_deque = deque(maxlen=100)
+     scores = []
+     # Line 3 of pseudocode
+     for i_episode in range(1, n_training_episodes + 1):
+         saved_log_probs = []
+         rewards = []
+         state = env.reset()
+         # Line 4 of pseudocode
+         for t in range(max_t):
+             action, log_prob = policy.act(state)
+             saved_log_probs.append(log_prob)
+             state, reward, done, _ = env.step(action)
+             rewards.append(reward)
+             if done:
+                 break
+         scores_deque.append(sum(rewards))
+         scores.append(sum(rewards))
+
+         # Line 6 of pseudocode: calculate the return
+         returns = deque(maxlen=max_t)
+         n_steps = len(rewards)
+         # Compute the discounted returns at each timestep,
+         # as
+         #     the sum of the gamma-discounted return at time t (G_t) + the reward at time t
+         #
+         # In O(N) time, where N is the number of time steps
+         # (this definition of the discounted return G_t follows the definition of this quantity
+         # shown at page 44 of Sutton&Barto 2017 2nd draft)
+         # G_t = r_(t+1) + r_(t+2) + ...
+
+         # Given this formulation, the returns at each timestep t can be computed
+         # by re-using the computed future returns G_(t+1) to compute the current return G_t
+         # G_t = r_(t+1) + gamma*G_(t+1)
+         # G_(t-1) = r_t + gamma* G_t
+         # (this follows a dynamic programming approach, with which we memorize solutions in order
+         # to avoid computing them multiple times)
+
+         # This is correct since the above is equivalent to (see also page 46 of Sutton&Barto 2017 2nd draft)
+         # G_(t-1) = r_t + gamma*r_(t+1) + gamma*gamma*r_(t+2) + ...
+
+         ## Given the above, we calculate the returns at timestep t as:
+         #               gamma[t] * return[t] + reward[t]
+         #
+         ## We compute this starting from the last timestep to the first, in order
+         ## to employ the formula presented above and avoid redundant computations that would be needed
+         ## if we were to do it from first to last.
+
+         ## Hence, the queue "returns" will hold the returns in chronological order, from t=0 to t=n_steps
+         ## thanks to the appendleft() function, which lets us append at position 0 in constant time O(1);
+         ## a normal Python list would instead require O(N) to do this.
+         for t in range(n_steps)[::-1]:
+             disc_return_t = (returns[0] if len(returns) > 0 else 0)
+             returns.appendleft(gamma * disc_return_t + rewards[t])
+
+         ## Standardization of the returns is employed to make training more stable
+         eps = np.finfo(np.float32).eps.item()
+         ## eps is the smallest representable float, which is
+         # added to the standard deviation of the returns to avoid numerical instabilities
+         returns = torch.tensor(returns)
+         returns = (returns - returns.mean()) / (returns.std() + eps)
+
+         # Line 7:
+         policy_loss = []
+         for log_prob, disc_return in zip(saved_log_probs, returns):
+             policy_loss.append(-log_prob * disc_return)
+         policy_loss = torch.cat(policy_loss).sum()
+
+         # Line 8: PyTorch prefers gradient descent
+         optimizer.zero_grad()
+         policy_loss.backward()
+         optimizer.step()
+
+         if i_episode % print_every == 0:
+             print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
+
+     return scores
+
+
+ pixelcopter_hyperparameters = {
+     "h_size": 64,
+     "n_training_episodes": 1000,
+     "n_evaluation_episodes": 10,
+     "max_t": 10000,
+     "gamma": 0.99,
+     "lr": 1e-4,
+     "env_id": env_id,
+     "state_space": s_size,
+     "action_space": a_size,
+ }
+
+ # Create the policy and place it on the device
+ # torch.manual_seed(50)
+ pixelcopter_policy = Policy(pixelcopter_hyperparameters["state_space"], pixelcopter_hyperparameters["action_space"],
+                             pixelcopter_hyperparameters["h_size"]).to(device)
+ pixelcopter_optimizer = optim.Adam(pixelcopter_policy.parameters(), lr=pixelcopter_hyperparameters["lr"])
+
+ scores = reinforce(pixelcopter_policy,
+                    pixelcopter_optimizer,
+                    pixelcopter_hyperparameters["n_training_episodes"],
+                    pixelcopter_hyperparameters["max_t"],
+                    pixelcopter_hyperparameters["gamma"],
+                    1000)
+
+
+ def push_to_hub(repo_id,
+                 model,
+                 hyperparameters,
+                 ):
+     """
+     Evaluate, generate a video and upload a model to the Hugging Face Hub.
+     This method does the complete pipeline:
+     - It evaluates the model
+     - It generates the model card
+     - It generates a replay video of the agent
+     - It pushes everything to the Hub
+
+     :param repo_id: id of the model repository on the Hugging Face Hub
+     :param model: the PyTorch model we want to save
+     :param hyperparameters: training hyperparameters
+     :param eval_env: evaluation environment
+     :param video_fps: how many frames per second to record in our video replay
+     """
+
+     _, repo_name = repo_id.split("/")
+     api = HfApi()
+
+     # Step 1: Create the repo
+     repo_url = api.create_repo(
+         repo_id=repo_id,
+         exist_ok=True,
+     )
+
+     # Step 2: Save the model
+     torch.save(model, "model.pt")
+
+     # Step 3: Save the hyperparameters to JSON
+     with open("hyperparameters.json", "w") as outfile:
+         json.dump(hyperparameters, outfile)
+
+     # Step 4: Evaluate the model and build JSON
+     mean_reward, std_reward = 0, 0
+     # Get datetime
+     eval_datetime = datetime.datetime.now()
+     eval_form_datetime = eval_datetime.isoformat()
+
+     evaluate_data = {
+         "env_id": hyperparameters["env_id"],
+         "mean_reward": mean_reward,
+         "n_evaluation_episodes": hyperparameters["n_evaluation_episodes"],
+         "eval_datetime": eval_form_datetime,
+     }
+
+     # Write a JSON file
+     with open("results.json", "w") as outfile:
+         json.dump(evaluate_data, outfile)
+
+     # Step 5: Create the model card
+     env_name = hyperparameters["env_id"]
+
+     metadata = {}
+     metadata["tags"] = [
+         env_name,
+         "reinforce",
+         "reinforcement-learning",
+         "custom-implementation",
+         "deep-rl-class"
+     ]
+
+     # Add metrics
+     eval = metadata_eval_result(
+         model_pretty_name=repo_name,
+         task_pretty_name="reinforcement-learning",
+         task_id="reinforcement-learning",
+         metrics_pretty_name="mean_reward",
+         metrics_id="mean_reward",
+         metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
+         dataset_pretty_name=env_name,
+         dataset_id=env_name,
+     )
+
+     # Merge both dictionaries
+     metadata = {**metadata, **eval}
+
+     model_card = f"""
+   # **Reinforce** Agent playing **{env_id}**
+   This is a trained model of a **Reinforce** agent playing **{env_id}**.
+   To learn to use this model and train yours check Unit 4 of the Deep Reinforcement Learning Course: https://huggingface.co/deep-rl-course/unit4/introduction
+   """
+
+     readme_path = "README.md"
+     readme = model_card
+     with open(readme_path, "w", encoding="utf-8") as f:
+         f.write(readme)
+
+     # Save our metrics to Readme metadata
+     metadata_save(readme_path, metadata)
+
+     # Step 7. Push everything to the Hub
+     api.upload_folder(
+         repo_id=repo_id,
+         folder_path=".",
+         path_in_repo=".",
+     )
+
+     print(f"Your model is pushed to the Hub. You can view your model here: {repo_url}")
+
+
+ repo_id = "cyrodw/Reinforce-Pixelcopter"  # TODO Define your repo id {username/Reinforce-{model-id}}
+ push_to_hub(repo_id,
+             pixelcopter_policy,  # The model we want to save
+             pixelcopter_hyperparameters,  # Hyperparameters
+             )
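Note that Step 4 of `push_to_hub` hard-codes `mean_reward, std_reward = 0, 0` rather than rolling out `eval_env`, which is why README.md and results.json in this commit report 0.00. A hedged sketch of what an evaluation pass could look like with the same old-style `gym` API used above (the `evaluate_agent` name and signature are illustrative, not part of this commit):

```python
import numpy as np

def evaluate_agent(env, max_steps, n_eval_episodes, policy):
    # Roll out the policy for n_eval_episodes and report mean/std of the episode returns.
    episode_rewards = []
    for _ in range(n_eval_episodes):
        state = env.reset()
        total_reward = 0.0
        for _ in range(max_steps):
            action, _ = policy.act(state)
            state, reward, done, _ = env.step(action)  # old gym 4-tuple step API, as in main.py
            total_reward += reward
            if done:
                break
        episode_rewards.append(total_reward)
    return np.mean(episode_rewards), np.std(episode_rewards)
```

If wired into Step 4 (e.g. `mean_reward, std_reward = evaluate_agent(eval_env, hyperparameters["max_t"], hyperparameters["n_evaluation_episodes"], model)`), the model-card metric and results.json would reflect actual episode returns instead of zeros.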
model.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:55025a9fff5526ca57e6dc693ebc675a17d81762f8b7b24e7ddace646a8edd9b
+ oid sha256:d8cec34e623aac7a14072410390f26a7ea6c16ea4382739edc385a26fb73e7b8
  size 39239
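model.pt appears here only as a Git LFS pointer; `main.py` writes it with `torch.save(model, "model.pt")`, i.e. the whole pickled module rather than a `state_dict`, so loading it back requires the `Policy` class definition to be importable. A minimal loading sketch under that assumption:

```python
import torch

# Load the pickled Policy module saved by main.py; the Policy class must be
# importable under the same module path at load time for unpickling to succeed.
model = torch.load("model.pt", map_location=torch.device("cpu"))
model.eval()
```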
requirements.txt ADDED
@@ -0,0 +1 @@
+ gymnasium
results.json CHANGED
@@ -1 +1 @@
- {"env_id": "Pixelcopter-PLE-v0", "mean_reward": 9.2, "n_evaluation_episodes": 10, "eval_datetime": "2023-05-10T12:37:14.411368"}
+ {"env_id": "Pixelcopter-PLE-v0", "mean_reward": 0, "n_evaluation_episodes": 10, "eval_datetime": "2023-05-16T15:12:27.011351"}