File size: 6,689 Bytes
5dce4ce eb496e7 5dce4ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
---
tags:
- CartPole-v1
- reinforce
- reinforcement-learning
- custom-implementation
- deep-rl-class
model-index:
- name: CartPole
results:
- task:
type: reinforcement-learning
name: reinforcement-learning
dataset:
name: CartPole-v1
type: CartPole-v1
metrics:
- type: mean_reward
value: 500.00 +/- 0.00
name: mean_reward
verified: false
---
# **Reinforce** Agent playing **CartPole-v1**
This is a trained model of a **Reinforce** agent playing **CartPole-v1**.
```python
# ----------- Libraries -----------
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
# Gym
import gym
import gym_pygame
#------------- Enviroment -----------
env_id = "CartPole-v1"
# Create the env
env = gym.make(env_id)
# Create the evaluation env
eval_env = gym.make(env_id)
# Get the state space and action space
s_size = env.observation_space.shape[0]
a_size = env.action_space.n
#------------ Policy --------------
class Policy(nn.Module):
def __init__(self, s_size, a_size, h_size):
super(Policy, self).__init__()
# Create two fully connected layers
self.fc1 = nn.Linear(s_size, h_size)
self.fc2 = nn.Linear(h_size, a_size)
def forward(self, x):
# Define the forward pass
# state goes to fc1 then we apply ReLU activation function
x = F.relu(self.fc1(x))
# fc1 outputs goes to fc2
x = self.fc2(x)
# We output the softmax
return F.softmax(x, dim=1)
def act(self, state):
"""
Given a state, take action
"""
state = torch.from_numpy(state).float().unsqueeze(0).to(device)
probs = self.forward(state).cpu()
m = Categorical(probs)
action = m.sample()
return action.item(), m.log_prob(action)
#--------------- Reinforce --------------
def reinforce(policy, optimizer, n_training_episodes, max_t, gamma, print_every):
# Help us to calculate the score during the training
scores_deque = deque(maxlen=100)
scores = []
# Line 3 of pseudocode
for i_episode in range(1, n_training_episodes+1):
saved_log_probs = []
rewards = []
state = env.reset()
# Line 4 of pseudocode
for t in range(max_t):
action, log_prob = policy.act(state)
saved_log_probs.append(log_prob)
state, reward, done, _ = env.step(action)
rewards.append(reward)
if done:
break
scores_deque.append(sum(rewards))
scores.append(sum(rewards))
# Line 6 of pseudocode: calculate the return
returns = deque(maxlen=max_t)
n_steps = len(rewards)
# Compute the discounted returns at each timestep,
# as the sum of the gamma-discounted return at time t (G_t) + the reward at time t
# In O(N) time, where N is the number of time steps
# (this definition of the discounted return G_t follows the definition of this quantity
# shown at page 44 of Sutton&Barto 2017 2nd draft)
# G_t = r_(t+1) + r_(t+2) + ...
# Given this formulation, the returns at each timestep t can be computed
# by re-using the computed future returns G_(t+1) to compute the current return G_t
# G_t = r_(t+1) + gamma*G_(t+1)
# G_(t-1) = r_t + gamma* G_t
# (this follows a dynamic programming approach, with which we memorize solutions in order
# to avoid computing them multiple times)
# This is correct since the above is equivalent to (see also page 46 of Sutton&Barto 2017 2nd draft)
# G_(t-1) = r_t + gamma*r_(t+1) + gamma*gamma*r_(t+2) + ...
## Given the above, we calculate the returns at timestep t as:
# gamma[t] * return[t] + reward[t]
#
## We compute this starting from the last timestep to the first, in order
## to employ the formula presented above and avoid redundant computations that would be needed
## if we were to do it from first to last.
## Hence, the queue "returns" will hold the returns in chronological order, from t=0 to t=n_steps
## thanks to the appendleft() function which allows to append to the position 0 in constant time O(1)
## a normal python list would instead require O(N) to do this.
for t in range(n_steps)[::-1]:
disc_return_t = (returns[0] if len(returns)>0 else 0)
returns.appendleft( gamma*disc_return_t + rewards[t] )
## standardization of the returns is employed to make training more stable
eps = np.finfo(np.float32).eps.item()
## eps is the smallest representable float, which is
# added to the standard deviation of the returns to avoid numerical instabilities
returns = torch.tensor(returns)
returns = (returns - returns.mean()) / (returns.std() + eps)
# Line 7:
policy_loss = []
for log_prob, disc_return in zip(saved_log_probs, returns):
policy_loss.append(-log_prob * disc_return)
policy_loss = torch.cat(policy_loss).sum()
# Line 8: PyTorch prefers gradient descent
optimizer.zero_grad()
policy_loss.backward()
optimizer.step()
if i_episode % print_every == 0:
print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
return scores
# ---------- Training Hyperparameters ----------
cartpole_hyperparameters = {
"h_size": 16,
"n_training_episodes": 1000,
"n_evaluation_episodes": 100,
"max_t": 1000,
"gamma": 1.0,
"lr": 1e-2,
"env_id": env_id,
"state_space": s_size,
"action_space": a_size,
}
# ---------- Policy and optimizer ----------
# Create policy and place it to the device
cartpole_policy = Policy(cartpole_hyperparameters["state_space"], cartpole_hyperparameters["action_space"], cartpole_hyperparameters["h_size"]).to(device)
cartpole_optimizer = optim.Adam(cartpole_policy.parameters(), lr=cartpole_hyperparameters["lr"])
# --------- Training -----------
scores = reinforce(cartpole_policy,
cartpole_optimizer,
cartpole_hyperparameters["n_training_episodes"],
cartpole_hyperparameters["max_t"],
cartpole_hyperparameters["gamma"],
100)
```
|