bonadio committed on
Commit
da4e4fb
1 Parent(s): 421e9fd

A2C trading results

fin_rl_a2c_v1.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
fin_rl_policy_gradiant_v1.ipynb → fin_rl_policy_gradient_v1.ipynb RENAMED
@@ -6,10 +6,10 @@
6
  "id": "nwaAZRu1NTiI"
7
  },
8
  "source": [
9
- "# PolicyGradiant\n",
10
  "\n",
11
  "\n",
12
- "#### This version implements PolicyGradiant using a custom enviroment (Unit 4)"
13
  ]
14
  },
15
  {
@@ -24,25 +24,17 @@
24
  },
25
  {
26
  "cell_type": "code",
27
- "execution_count": 1,
28
  "metadata": {
29
  "id": "LNXxxKojNTiL"
30
  },
31
- "outputs": [
32
- {
33
- "name": "stderr",
34
- "output_type": "stream",
35
- "text": [
36
- "2022-12-27 12:47:16.481995: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
37
- "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
38
- "\n"
39
- ]
40
- }
41
- ],
42
  "source": [
43
  "import tensorflow as tf\n",
44
- "from tensorflow.keras import layers\n",
45
  "from tensorflow.keras.utils import to_categorical\n",
46
  "import gym\n",
47
  "from gym import spaces\n",
48
  "from gym.utils import seeding\n",
@@ -66,63 +58,123 @@
66
  },
67
  {
68
  "cell_type": "code",
69
- "execution_count": 66,
70
  "metadata": {},
71
  "outputs": [],
72
  "source": [
73
  "class Policy:\n",
74
- " def __init__(self, env=None):\n",
75
  "\n",
76
- " self.action_size = env.action_space.n\n",
77
  "\n",
78
  " # Hyperparameters\n",
79
  " self.gamma = 0.95 # Discount rate\n",
80
- " self.epsilon = 1.0 # Exploration rate\n",
81
- " self.epsilon_min = 0.001 # Minimal exploration rate (epsilon-greedy)\n",
82
- " self.epsilon_decay = 0.95 # Decay rate for epsilon\n",
83
- " self.update_rate = 5 # Number of steps until updating the target network\n",
84
- " self.batch_size = 200\n",
85
- " self.learning_rate = 1e-4\n",
86
  " \n",
87
- " self.model = self._build_model()\n",
88
- " self.model.summary()\n",
89
  " self.env = env\n",
90
  "\n",
91
- " self.history = None\n",
92
- " self.scaler = None\n",
93
  "\n",
94
  " def _build_model(self):\n",
95
- " model = tf.keras.Sequential()\n",
96
- " \n",
97
- " model.add(tf.keras.Input(shape=(4,)))\n",
98
- " model.add(layers.Dense(256, activation = 'relu'))\n",
99
- " model.add(layers.Dense(128, activation = 'relu'))\n",
100
- " model.add(layers.Dense(64, activation = 'relu'))\n",
101
- " model.add(layers.Dense(self.action_size, activation = 'softmax'))\n",
102
  " \n",
103
  " optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)\n",
104
- " model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics = ['accuracy'])\n",
105
- " # model.compile(loss='mse', optimizer=optimizer, metrics = ['mse'])\n",
106
- " return model\n",
107
  "\n",
108
  " def act(self, state):\n",
109
- " probs = self.model.predict([state])\n",
110
- " action = tf.random.categorical(tf.math.log(probs), 1)\n",
111
- " return action.numpy()[0], tf.math.log(probs[0][action])\n",
112
  "\n",
113
- " def reinforce(self, n_training_episodes, max_t, gamma, print_every):\n",
114
  " # Help us to calculate the score during the training\n",
115
  " scores_deque = deque(maxlen=100)\n",
116
  " scores = []\n",
117
  " # Line 3 of pseudocode\n",
118
  " for i_episode in range(1, n_training_episodes+1):\n",
119
- " saved_log_probs = []\n",
120
  " rewards = []\n",
121
  " state = self.env.reset()\n",
122
  " # Line 4 of pseudocode\n",
123
  " for t in range(max_t):\n",
124
- " action, log_prob = self.act(state)\n",
125
- " saved_log_probs.append(log_prob)\n",
126
  " state, reward, done, _ = self.env.step(action)\n",
127
  " rewards.append(reward)\n",
128
  " if done:\n",
@@ -165,7 +217,7 @@
165
  " ## a normal python list would instead require O(N) to do this.\n",
166
  " for t in range(n_steps)[::-1]:\n",
167
  " disc_return_t = (returns[0] if len(returns)>0 else 0)\n",
168
- " returns.appendleft( gamma*disc_return_t + rewards[t] ) \n",
169
  " \n",
170
  " ## standardization of the returns is employed to make training more stable\n",
171
  " eps = np.finfo(np.float32).eps.item()\n",
@@ -173,22 +225,32 @@
173
  " # added to the standard deviation of the returns to avoid numerical instabilities \n",
174
  " returns = np.array(returns)\n",
175
  " returns = (returns - returns.mean()) / (returns.std() + eps)\n",
 
176
  " \n",
177
  " # Line 7:\n",
178
- " policy_loss = []\n",
179
- " for log_prob, disc_return in zip(saved_log_probs, returns):\n",
180
- " policy_loss.append(-log_prob * disc_return)\n",
181
- " policy_loss = np.concatenate(policy_loss).sum()\n",
182
  " \n",
183
- " # Line 8: gradient descent step\n",
184
  " # optimizer.zero_grad()\n",
185
- " policy_loss.backward()\n",
186
- " self.model.train_on_batch()\n",
187
  " # optimizer.step()\n",
188
  " \n",
189
  " if i_episode % print_every == 0:\n",
190
  " print('Episode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))\n",
191
- " \n",
192
  " return scores\n",
193
  "\n",
194
  "\n",
@@ -213,7 +275,7 @@
213
  },
214
  {
215
  "cell_type": "code",
216
- "execution_count": 67,
217
  "metadata": {},
218
  "outputs": [],
219
  "source": [
@@ -445,15 +507,15 @@
445
  },
446
  {
447
  "cell_type": "code",
448
- "execution_count": 68,
449
  "metadata": {},
450
  "outputs": [
451
  {
452
  "name": "stdout",
453
  "output_type": "stream",
454
  "text": [
455
- "3024\n",
456
- "1875\n"
457
  ]
458
  }
459
  ],
@@ -476,17 +538,75 @@
476
  },
477
  {
478
  "cell_type": "code",
479
- "execution_count": null,
480
  "metadata": {},
481
- "outputs": [],
  "source": [
483
  "# create env\n",
484
  "max_steps = 20 \n",
485
  "env = CustTradingEnv(df=eth_train, max_steps=max_steps)\n",
486
  "\n",
487
- "model = Policy(env=env)\n",
488
- "#n_training_episodes, max_t, gamma, print_every\n",
489
- "model.reinforce(1000, 1000, 0.95, 100)\n"
490
  ]
491
  },
492
  {
@@ -495,8 +615,8 @@
495
  "metadata": {},
496
  "outputs": [],
497
  "source": [
498
- "model.save(\"./alt/fin_rl_dqn_v1\")\n",
499
- "joblib.dump(env.get_scaler(),\"./alt/fin_rl_dqn_v1.h5_scaler\")\n"
500
  ]
501
  },
502
  {
 
6
  "id": "nwaAZRu1NTiI"
7
  },
8
  "source": [
9
+ "# Policy Gradient\n",
10
  "\n",
11
  "\n",
12
+ "#### This version implements Policy Gradient using a custom enviroment (Unit 4)"
13
  ]
14
  },
15
  {
 
24
  },
25
  {
26
  "cell_type": "code",
27
+ "execution_count": 44,
28
  "metadata": {
29
  "id": "LNXxxKojNTiL"
30
  },
31
+ "outputs": [],
32
  "source": [
33
  "import tensorflow as tf\n",
34
+ "from tensorflow.keras import layers, Model, Input\n",
35
  "from tensorflow.keras.utils import to_categorical\n",
36
+ "import tensorflow.keras.backend as K\n",
37
+ "\n",
38
  "import gym\n",
39
  "from gym import spaces\n",
40
  "from gym.utils import seeding\n",
 
58
  },
59
  {
60
  "cell_type": "code",
61
+ "execution_count": 45,
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "# custom model to be able to run a custom loss with parameters\n",
66
+ "class CustomModel(tf.keras.Model):\n",
67
+ " def custom_loss(self,y, y_pred, d_returns):\n",
68
+ " log_like = y * K.log(y_pred)\n",
69
+ " # K.print_tensor(d_returns)\n",
70
+ " return K.sum(-log_like * d_returns )\n",
71
+ " \n",
72
+ " def train_step(self, data):\n",
73
+ " # Unpack the data. Its structure depends on your model and\n",
74
+ " # on what you pass to `fit()`.\n",
75
+ " if len(data) == 3:\n",
76
+ " x, y, sample_weight = data\n",
77
+ " else:\n",
78
+ " sample_weight = None\n",
79
+ " x, y = data\n",
80
+ "\n",
81
+ " # check if we passed the d_return\n",
82
+ " if isinstance(x, tuple):\n",
83
+ " x, d_return = x\n",
84
+ "\n",
85
+ " with tf.GradientTape() as tape:\n",
86
+ " y_pred = self(x, training=True) # Forward pass\n",
87
+ " # Compute the loss value.\n",
88
+ " y = tf.cast(y, tf.float32)\n",
89
+ " loss = self.custom_loss(y, y_pred, d_return)\n",
90
+ "\n",
91
+ " # Compute gradients\n",
92
+ " trainable_vars = self.trainable_variables\n",
93
+ " gradients = tape.gradient(loss, trainable_vars)\n",
94
+ "\n",
95
+ " # Update weights\n",
96
+ " self.optimizer.apply_gradients(zip(gradients, trainable_vars))\n",
97
+ "\n",
98
+ " # Update the metrics.\n",
99
+ " # Metrics are configured in `compile()`.\n",
100
+ " self.compiled_metrics.update_state(y, y_pred, sample_weight=sample_weight)\n",
101
+ "\n",
102
+ " # Return a dict mapping metric names to current value.\n",
103
+ " # Note that it will include the loss (tracked in self.metrics).\n",
104
+ " return {m.name: m.result() for m in self.metrics}"
105
+ ]
106
+ },
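As a quick sanity check of what custom_loss computes: with one-hot actions y and softmax outputs y_pred, K.sum(-y * K.log(y_pred) * d_returns) is the REINFORCE objective, the sum over timesteps of -G_t * log pi(a_t | s_t). A minimal NumPy sketch with made-up numbers (illustration only, not part of the notebook):

import numpy as np

# made-up episode of 3 steps with 3 actions (illustration only)
probs = np.array([[0.2, 0.5, 0.3],        # softmax outputs pi(.|s_t)
                  [0.1, 0.1, 0.8],
                  [0.6, 0.3, 0.1]])
actions = np.array([1, 2, 0])             # actions actually taken
returns = np.array([[2.0], [1.5], [1.0]]) # discounted returns G_t, shape (T, 1)

onehot = np.eye(3)[actions]               # y, as produced by to_categorical
loss_custom = np.sum(-onehot * np.log(probs) * returns)
loss_reinforce = -np.sum(returns[:, 0] * np.log(probs[np.arange(3), actions]))
assert np.isclose(loss_custom, loss_reinforce)  # same quantity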
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 46,
110
  "metadata": {},
111
  "outputs": [],
112
  "source": [
113
  "class Policy:\n",
114
+ " def __init__(self, env=None, action_size=2):\n",
115
  "\n",
116
+ " self.action_size = action_size\n",
117
  "\n",
118
  " # Hyperparameters\n",
119
  " self.gamma = 0.95 # Discount rate\n",
120
+ "\n",
121
+ " self.learning_rate = 1e-3\n",
122
  " \n",
123
+ " # Construct DQN models\n",
 
124
  " self.env = env\n",
125
+ " self.action_size = action_size\n",
126
+ " self.action_space = [i for i in range(action_size)]\n",
127
+ " print(\"action space\",self.action_space)\n",
128
+ " # self.saved_log_probs = None\n",
129
+ " self.model= self._build_model()\n",
130
+ " self.model.summary()\n",
131
  "\n",
132
  "\n",
133
  " def _build_model(self):\n",
134
+ " x = Input(shape=(4,), name='x_input')\n",
135
+ " # y_true = Input( shape=(2,), name='y_true' )\n",
136
+ " d_returns = Input(shape=[1], name='d_returns')\n",
137
+ "\n",
138
+ " l = layers.Dense(16, activation = 'relu')(x)\n",
139
+ " l = layers.Dense(16, activation = 'relu')(l)\n",
140
+ " y_pred = layers.Dense(self.action_size, activation = 'softmax', name='y_pred')(l)\n",
141
  " \n",
142
  " optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)\n",
143
+ "\n",
144
+ " # model_train = Model( inputs=[x], outputs=[y_pred], name='train_only' )\n",
145
+ " model_train = CustomModel( inputs=x, outputs=y_pred, name='train_only' )\n",
146
+ " # model_predict = Model( inputs=x, outputs=y_pred, name='predict_only' )\n",
147
+ " model_train.compile(loss=None, optimizer=optimizer, metrics = ['accuracy'])\n",
148
+ " # use run_eagerly to print values inside the loss function to debug\n",
149
+ " # model_train.compile(loss=None, optimizer=optimizer, metrics = ['accuracy'], run_eagerly = True)\n",
150
+ "\n",
151
+ " return model_train\n",
152
  "\n",
153
  " def act(self, state):\n",
154
+ " probs = self.model.predict(np.array([state]), verbose=0)[0]\n",
155
+ " action = np.random.choice(self.action_space, p=probs)\n",
 
156
  "\n",
157
+ " return action\n",
158
+ "\n",
159
+ " # this implements the reinforce \n",
160
+ " def learn(self, n_training_episodes=None, max_t=None, print_every=100):\n",
161
  " # Help us to calculate the score during the training\n",
162
  " scores_deque = deque(maxlen=100)\n",
163
  " scores = []\n",
164
  " # Line 3 of pseudocode\n",
165
  " for i_episode in range(1, n_training_episodes+1):\n",
166
+ " # saved_log_probs = []\n",
167
+ " saved_actions = []\n",
168
+ " saved_state = []\n",
169
  " rewards = []\n",
170
  " state = self.env.reset()\n",
171
  " # Line 4 of pseudocode\n",
172
  " for t in range(max_t):\n",
173
+ " saved_state.append(state)\n",
174
+ " action = self.act(state)\n",
175
+ " # action, log_prob = self.act(state)\n",
176
+ " # saved_log_probs.append(log_prob)\n",
177
+ " saved_actions.append(action)\n",
178
  " state, reward, done, _ = self.env.step(action)\n",
179
  " rewards.append(reward)\n",
180
  " if done:\n",
 
217
  " ## a normal python list would instead require O(N) to do this.\n",
218
  " for t in range(n_steps)[::-1]:\n",
219
  " disc_return_t = (returns[0] if len(returns)>0 else 0)\n",
220
+ " returns.appendleft( self.gamma*disc_return_t + rewards[t] ) \n",
221
  " \n",
222
  " ## standardization of the returns is employed to make training more stable\n",
223
  " eps = np.finfo(np.float32).eps.item()\n",
 
225
  " # added to the standard deviation of the returns to avoid numerical instabilities \n",
226
  " returns = np.array(returns)\n",
227
  " returns = (returns - returns.mean()) / (returns.std() + eps)\n",
228
+ " # self.saved_log_probs = saved_log_probs\n",
229
  " \n",
230
  " # Line 7:\n",
231
+ " saved_state = np.array(saved_state)\n",
232
+ " # print(\"Saved state\", saved_state, saved_state.shape)\n",
233
+ " saved_actions = np.array(to_categorical(saved_actions, num_classes=self.action_size))\n",
234
+ " # print(\"Saved actions\", saved_actions, saved_actions.shape)\n",
235
+ " returns = returns.reshape(-1,1)\n",
236
+ " # print(\"Returns\", returns, returns.shape)\n",
237
+ " # this is the trick part, we send a tuple so the CustomModel is able to split the x and use \n",
238
+ " # the returns inside to calculate the custom loss\n",
239
+ " self.model.train_on_batch(x=(saved_state,returns), y=saved_actions)\n",
240
+ "\n",
241
+ " # policy_loss = []\n",
242
+ " # for action, log_prob, disc_return in zip(saved_actions, saved_log_probs, returns):\n",
243
+ " # policy_loss.append(-log_prob * disc_return)\n",
244
+ " # policy_loss = torch.cat(policy_loss).sum()\n",
245
  " \n",
246
+ " # # Line 8: PyTorch prefers gradient descent \n",
247
  " # optimizer.zero_grad()\n",
248
+ " # policy_loss.backward()\n",
 
249
  " # optimizer.step()\n",
250
  " \n",
251
  " if i_episode % print_every == 0:\n",
252
  " print('Episode {}\\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))\n",
253
+ " \n",
254
  " return scores\n",
255
  "\n",
256
  "\n",
 
275
  },
276
  {
277
  "cell_type": "code",
278
+ "execution_count": 47,
279
  "metadata": {},
280
  "outputs": [],
281
  "source": [
 
507
  },
508
  {
509
  "cell_type": "code",
510
+ "execution_count": 48,
511
  "metadata": {},
512
  "outputs": [
513
  {
514
  "name": "stdout",
515
  "output_type": "stream",
516
  "text": [
517
+ "3067\n",
518
+ "1918\n"
519
  ]
520
  }
521
  ],
 
538
  },
539
  {
540
  "cell_type": "code",
541
+ "execution_count": 49,
542
  "metadata": {},
543
+ "outputs": [
544
+ {
545
+ "name": "stdout",
546
+ "output_type": "stream",
547
+ "text": [
548
+ "action space [0, 1, 2]\n",
549
+ "Model: \"train_only\"\n",
550
+ "_________________________________________________________________\n",
551
+ " Layer (type) Output Shape Param # \n",
552
+ "=================================================================\n",
553
+ " x_input (InputLayer) [(None, 4)] 0 \n",
554
+ " \n",
555
+ " dense_22 (Dense) (None, 16) 80 \n",
556
+ " \n",
557
+ " dense_23 (Dense) (None, 16) 272 \n",
558
+ " \n",
559
+ " y_pred (Dense) (None, 3) 51 \n",
560
+ " \n",
561
+ "=================================================================\n",
562
+ "Total params: 403\n",
563
+ "Trainable params: 403\n",
564
+ "Non-trainable params: 0\n",
565
+ "_________________________________________________________________\n",
566
+ "Episode 100\tAverage Score: -180.05\n",
567
+ "Episode 200\tAverage Score: -164.72\n",
568
+ "Episode 300\tAverage Score: -81.03\n",
569
+ "Episode 400\tAverage Score: -117.40\n",
570
+ "Episode 500\tAverage Score: -182.76\n",
571
+ "Episode 600\tAverage Score: -92.27\n",
572
+ "Episode 700\tAverage Score: -207.78\n",
573
+ "Episode 800\tAverage Score: -232.02\n",
574
+ "Episode 900\tAverage Score: -29.72\n",
575
+ "Episode 1000\tAverage Score: -44.37\n",
576
+ "Episode 1100\tAverage Score: -60.61\n",
577
+ "Episode 1200\tAverage Score: -67.30\n",
578
+ "Episode 1300\tAverage Score: -36.28\n",
579
+ "Episode 1400\tAverage Score: -60.42\n",
580
+ "Episode 1500\tAverage Score: -93.99\n",
581
+ "Episode 1600\tAverage Score: -70.92\n",
582
+ "Episode 1700\tAverage Score: -88.01\n",
583
+ "Episode 1800\tAverage Score: -21.69\n",
584
+ "Episode 1900\tAverage Score: -66.15\n",
585
+ "Episode 2000\tAverage Score: -96.49\n",
586
+ "Episode 2100\tAverage Score: -33.40\n",
587
+ "Episode 2200\tAverage Score: -25.62\n",
588
+ "Episode 2300\tAverage Score: -46.25\n",
589
+ "Episode 2400\tAverage Score: -63.88\n",
590
+ "Episode 2500\tAverage Score: -29.43\n",
591
+ "Episode 2600\tAverage Score: -19.85\n",
592
+ "Episode 2700\tAverage Score: -53.53\n",
593
+ "Episode 2800\tAverage Score: -42.98\n",
594
+ "Episode 2900\tAverage Score: -50.12\n",
595
+ "Episode 3000\tAverage Score: -27.25\n"
596
+ ]
597
+ }
598
+ ],
599
  "source": [
600
  "# create env\n",
601
  "max_steps = 20 \n",
602
  "env = CustTradingEnv(df=eth_train, max_steps=max_steps)\n",
603
  "\n",
604
+ "model = Policy(env=env, action_size=3)\n",
605
+ "# model.learn(total_steps=6_000)\n",
606
+ "\n",
607
+ "model.learn(n_training_episodes=3000, max_t=20, print_every=100)\n",
608
+ "# model.learn(n_training_episodes=1000, max_t=1000, print_every=100)\n",
609
+ "env.close()\n"
610
  ]
611
  },
612
  {
 
615
  "metadata": {},
616
  "outputs": [],
617
  "source": [
618
+ "model.save(\"./alt/fin_rl_policy_gradient_v1\")\n",
619
+ "joblib.dump(env.get_scaler(),\"./alt/fin_rl_policy_gradient_v1.h5_scaler\")\n"
620
  ]
621
  },
622
  {
test_return.py ADDED
@@ -0,0 +1,16 @@
1
+ from collections import deque
2
+ import numpy as np
3
+ returns = deque(maxlen=20)
4
+ rewards = [1,1,1,1,1]
5
+ n_steps = len(rewards)
6
+
7
+ for t in range(n_steps)[::-1]:
8
+ print("Step=======",t)
9
+ disc_return_t = (returns[0] if len(returns)>0 else 0)
10
+ print("return",disc_return_t)
11
+ print("reward",rewards[t] )
12
+ returns.appendleft( 0.95 * disc_return_t +rewards[t] )
13
+ print("appended ret",returns )
14
+
15
+ returns = np.array(returns)
16
+ print(returns)
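For reference, with gamma = 0.95 and five unit rewards the deque above ends up holding the standard discounted returns G_t = sum over k of gamma**k. A short closed-form check (illustrative, not part of the script):

gamma, T = 0.95, 5
expected = [sum(gamma**k for k in range(T - t)) for t in range(T)]
print(expected)  # ~[4.5244, 3.7099, 2.8525, 1.95, 1.0], matching the printed deque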