Dabe committed on
Commit eb496e7
1 Parent(s): 5dce4ce

Update README.md

Files changed (1)
  1. README.md +172 -2
README.md CHANGED
@@ -22,6 +22,176 @@ model-index:
  ---
 
  # **Reinforce** Agent playing **CartPole-v1**
- This is a trained model of a **Reinforce** agent playing **CartPole-v1** .
- To learn to use this model and train yours check Unit 4 of the Deep Reinforcement Learning Course: https://huggingface.co/deep-rl-course/unit4/introduction
+ This is a trained model of a **Reinforce** agent playing **CartPole-v1**.
+
+ ```python
+ # ----------- Libraries -----------
+ import numpy as np
+ from collections import deque
+
+ import matplotlib.pyplot as plt
+ %matplotlib inline  # Jupyter magic; remove this line outside a notebook
+
+ # PyTorch
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.optim as optim
+ from torch.distributions import Categorical
+
+ # Gym
+ import gym
+ import gym_pygame
+
+ # Device: use the GPU if one is available
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # ----------- Environment -----------
+ env_id = "CartPole-v1"
+ # Create the training env
+ env = gym.make(env_id)
+
+ # Create the evaluation env
+ eval_env = gym.make(env_id)
+
+ # Get the state space and action space sizes
+ s_size = env.observation_space.shape[0]
+ a_size = env.action_space.n
+
+ # ----------- Policy -----------
+ class Policy(nn.Module):
+     def __init__(self, s_size, a_size, h_size):
+         super(Policy, self).__init__()
+         # Two fully connected layers
+         self.fc1 = nn.Linear(s_size, h_size)
+         self.fc2 = nn.Linear(h_size, a_size)
+
+     def forward(self, x):
+         # Forward pass: state -> fc1 -> ReLU -> fc2 -> softmax over actions
+         x = F.relu(self.fc1(x))
+         x = self.fc2(x)
+         return F.softmax(x, dim=1)
+
+     def act(self, state):
+         """Given a state, sample an action from the policy and return it with its log-probability."""
+         state = torch.from_numpy(state).float().unsqueeze(0).to(device)
+         probs = self.forward(state).cpu()
+         m = Categorical(probs)
+         action = m.sample()
+         return action.item(), m.log_prob(action)
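+
+ # Quick shape check (illustration): for CartPole-v1, s_size = 4 and a_size = 2,
+ # so Policy(4, 2, 16).forward(torch.rand(1, 4)) returns a (1, 2) tensor of
+ # action probabilities that sums to 1.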
+
+ # ----------- Reinforce -----------
+ def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
+     # Track the scores during training
+     scores_deque = deque(maxlen=100)
+     scores = []
+     # Line 3 of pseudocode
+     for i_episode in range(1, n_training_episodes + 1):
+         saved_log_probs = []
+         rewards = []
+         state = env.reset()
+         # Line 4 of pseudocode: collect one episode
+         for t in range(max_t):
+             action, log_prob = policy.act(state)
+             saved_log_probs.append(log_prob)
+             state, reward, done, _ = env.step(action)
+             rewards.append(reward)
+             if done:
+                 break
+         scores_deque.append(sum(rewards))
+         scores.append(sum(rewards))
+
+         # Line 6 of pseudocode: calculate the returns
+         returns = deque(maxlen=max_t)
+         n_steps = len(rewards)
+         # Compute the discounted return at every timestep in O(N) time,
+         # where N is the number of timesteps. The discounted return G_t
+         # follows the definition on page 44 of Sutton & Barto 2017 (2nd ed. draft):
+         # G_t = r_(t+1) + gamma*r_(t+2) + gamma^2*r_(t+3) + ...
+         #
+         # The return at timestep t can therefore be computed by reusing the
+         # already-computed future return G_(t+1):
+         # G_t = r_(t+1) + gamma*G_(t+1)
+         # G_(t-1) = r_t + gamma*G_t
+         # (a dynamic-programming approach: we memorize partial solutions
+         # to avoid computing them multiple times)
+         #
+         # This is correct because the recursion expands to
+         # (see also page 46 of the same draft):
+         # G_(t-1) = r_t + gamma*r_(t+1) + gamma^2*r_(t+2) + ...
+         #
+         # So at each timestep t we compute
+         # return[t] = reward[t] + gamma * return[t+1]
+         # iterating from the last timestep to the first, which avoids the
+         # redundant recomputation a first-to-last pass would require.
+         #
+         # The deque "returns" ends up holding the returns in chronological
+         # order, from t=0 to t=n_steps-1, because appendleft() inserts at
+         # position 0 in O(1) time (a plain Python list would need O(N)).
+         for t in range(n_steps)[::-1]:
+             disc_return_t = returns[0] if len(returns) > 0 else 0
+             returns.appendleft(gamma * disc_return_t + rewards[t])
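+         # For intuition, a tiny worked example (illustrative numbers): with
+         # gamma = 0.5 and rewards [1, 1, 1], the loop above visits t = 2, 1, 0
+         # and builds returns = [1.75, 1.5, 1.0].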
+
+         # Standardize the returns to make training more stable
+         eps = np.finfo(np.float32).eps.item()
+         # eps is the machine epsilon for float32; it is added to the standard
+         # deviation of the returns to avoid dividing by zero
+         returns = torch.tensor(returns)
+         returns = (returns - returns.mean()) / (returns.std() + eps)
+
+         # Line 7 of pseudocode: compute the policy loss
+         policy_loss = []
+         for log_prob, disc_return in zip(saved_log_probs, returns):
+             policy_loss.append(-log_prob * disc_return)
+         policy_loss = torch.cat(policy_loss).sum()
+
+         # Line 8 of pseudocode: gradient ascent on the expected return,
+         # implemented as gradient descent on the negated objective
+         optimizer.zero_grad()
+         policy_loss.backward()
+         optimizer.step()
+
+         if i_episode % print_every == 0:
+             print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
+
+     return scores
+
+ # ----------- Training hyperparameters -----------
+ cartpole_hyperparameters = {
+     "h_size": 16,
+     "n_training_episodes": 1000,
+     "n_evaluation_episodes": 100,
+     "max_t": 1000,
+     "gamma": 1.0,
+     "lr": 1e-2,
+     "env_id": env_id,
+     "state_space": s_size,
+     "action_space": a_size,
+ }
+
+ # ----------- Policy and optimizer -----------
+ # Create the policy and move it to the device
+ cartpole_policy = Policy(cartpole_hyperparameters["state_space"],
+                          cartpole_hyperparameters["action_space"],
+                          cartpole_hyperparameters["h_size"]).to(device)
+ cartpole_optimizer = optim.Adam(cartpole_policy.parameters(), lr=cartpole_hyperparameters["lr"])
+
+ # ----------- Training -----------
+ scores = reinforce(cartpole_policy,
+                    cartpole_optimizer,
+                    cartpole_hyperparameters["n_training_episodes"],
+                    cartpole_hyperparameters["max_t"],
+                    cartpole_hyperparameters["gamma"],
+                    100)
+ ```
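
The block above builds `eval_env` and reserves `"n_evaluation_episodes"` in the hyperparameters, but no evaluation loop is shown. A minimal evaluation sketch, assuming the same pre-0.26 Gym API as the training loop; the `evaluate_agent` helper below is illustrative and not part of this commit:

```python
def evaluate_agent(env, max_steps, n_eval_episodes, policy):
    """Run the policy for n_eval_episodes episodes and report the mean/std episode reward."""
    episode_rewards = []
    for _ in range(n_eval_episodes):
        state = env.reset()
        total_reward = 0.0
        for _ in range(max_steps):
            action, _ = policy.act(state)
            state, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                break
        episode_rewards.append(total_reward)
    return np.mean(episode_rewards), np.std(episode_rewards)

mean_reward, std_reward = evaluate_agent(eval_env,
                                         cartpole_hyperparameters["max_t"],
                                         cartpole_hyperparameters["n_evaluation_episodes"],
                                         cartpole_policy)
print("Mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))
```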