bonadio committed
Commit 1cb25d9
1 Parent(s): 5d5dd44

DQN_v1 converging

Files changed (3)
  1. DQN_v1.ipynb +113 -145
  2. DQN_v1_result.mp4 +0 -0
  3. DQN_v2.ipynb +0 -0
DQN_v1.ipynb CHANGED
@@ -13,20 +13,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "DDf1gLC2NTiK"
- },
- "outputs": [],
- "source": [
- "# !pip install -r ./requirements.txt\n",
- "!pip install stable_baselines3[extra]\n",
- "!pip install huggingface_sb3\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
+ "execution_count": 1,
  "metadata": {
  "id": "LNXxxKojNTiL"
  },
@@ -35,7 +22,7 @@
  "name": "stderr",
  "output_type": "stream",
  "text": [
- "2022-12-21 23:28:04.436066: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
+ "2022-12-22 18:43:04.111595: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n",
  "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
  "\n"
  ]
@@ -55,6 +42,7 @@
  "import numpy as np\n",
  "import random\n",
  "from matplotlib import pyplot as plt\n",
+ "from sklearn.preprocessing import MinMaxScaler\n",
  "\n",
  "import io\n",
  "import base64\n",
@@ -63,7 +51,7 @@
  },
  {
  "cell_type": "code",
- "execution_count": 16,
+ "execution_count": 29,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -76,11 +64,11 @@
  " # Hyperparameters\n",
  " self.gamma = 0.95 # Discount rate\n",
  " self.epsilon = 1.0 # Exploration rate\n",
- " self.epsilon_min = 0.05 # Minimal exploration rate (epsilon-greedy)\n",
- " self.epsilon_decay = 0.90 # Decay rate for epsilon\n",
- " self.update_rate = 200 # Number of steps until updating the target network\n",
+ " self.epsilon_min = 0.001 # Minimal exploration rate (epsilon-greedy)\n",
+ " self.epsilon_decay = 0.95 # Decay rate for epsilon\n",
+ " self.update_rate = 5 # Number of steps until updating the target network\n",
  " self.batch_size = 100\n",
- " self.learning_rate = 0.001\n",
+ " self.learning_rate = 2.5e-4\n",
  " \n",
  " # Construct DQN models\n",
  " self.model = self._build_model()\n",
@@ -90,120 +78,116 @@
  " self.env = env\n",
  " self.action_size = action_size\n",
  "\n",
+ " self.scaler = None\n",
+ "\n",
  " def _build_model(self):\n",
  " model = tf.keras.Sequential()\n",
  " \n",
  " model.add(tf.keras.Input(shape=(4,)))\n",
- " # FC Layers\n",
- " model.add(layers.Dense(24, activation='relu'))\n",
- " model.add(layers.Dense(24, activation='relu'))\n",
- " model.add(layers.Dense(self.action_size, activation='linear'))\n",
+ " model.add(layers.Dense(512, activation = 'relu'))\n",
+ " model.add(layers.Dense(256, activation = 'relu'))\n",
+ " model.add(layers.Dense(128, activation = 'relu'))\n",
+ " model.add(layers.Dense(self.action_size, activation = 'linear'))\n",
+ " # model.compile(optimizer = RMSprop(lr = self.lr, rho = 0.95, epsilon = 0.01), loss = \"mse\", metrics = ['accuracy'])\n",
  " \n",
  " optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)\n",
- " model.compile(loss='mse', optimizer=optimizer, metrics=['mse'])\n",
+ " # model.compile(loss='mse', optimizer=tf.keras.optimizers.RMSprop(lr = self.learning_rate, rho = 0.95, epsilon = 0.01), metrics = ['accuracy'])\n",
+ " model.compile(loss='mse', optimizer=optimizer, metrics = ['accuracy'])\n",
  " return model\n",
  "\n",
+ " def _min_max(self):\n",
+ " \"\"\"Run some steps to get data to do MINMAX scale \"\"\"\n",
+ " state_arr = []\n",
+ " state = self.env.reset()\n",
+ " state_arr.append(self.env.observation_space.high)\n",
+ " state_arr.append(self.env.observation_space.low)\n",
+ " for i in range(1000):\n",
+ " random_action = self.env.action_space.sample()\n",
+ " next_state, reward, done, info = self.env.step(random_action)\n",
+ " state_arr.append(next_state)\n",
+ " if done:\n",
+ " state = self.env.reset()\n",
+ "\n",
+ " state_arr = np.array(state_arr)\n",
+ " self.scaler = MinMaxScaler()\n",
+ " self.scaler.fit(state_arr)\n",
  "\n",
  " #\n",
  " # Trains the model using randomly selected experiences in the replay memory\n",
  " #\n",
  " def _train(self):\n",
- " minibatch = random.sample(self.replay_buffer, self.batch_size)\n",
- " \n",
- " for state, action, reward, next_state, done in minibatch:\n",
- " \n",
- " if not done:\n",
- " model_predict = self.model.predict(np.array([next_state]), verbose=0)\n",
- " max_action = np.argmax(model_predict[0])\n",
- " target = (reward + self.gamma * self.target_model.predict(np.array([next_state]), verbose=0)[0][max_action])\n",
- " else:\n",
- " target = reward\n",
- " \n",
- " # Construct the target vector as follows:\n",
- " # 1. Use the current model to output the Q-value predictions\n",
- " target_f = self.model.predict(np.array([state]), verbose=0)\n",
- " \n",
- " # 2. Rewrite the chosen action value with the computed target\n",
- " target_f[0][action] = target\n",
- " \n",
- " # 3. Use vectors in the objective computation\n",
- " history = self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)\n",
- " print(f\"Loss: {history.history['loss']} \")\n",
- " \n",
- " if self.epsilon > self.epsilon_min:\n",
- " self.epsilon *= self.epsilon_decay\n",
- " #\n",
- " # Trains the model using randomly selected experiences in the replay memory\n",
- " #\n",
- " def _train_b(self):\n",
- " \n",
+ " X, y = [], []\n",
  " # state, action, reward, next_state, done \n",
  " # create the targets \n",
- " mb_arr = np.array(random.sample(self.replay_buffer, self.batch_size), dtype=object)\n",
+ " if self.batch_size > len(self.replay_buffer):\n",
+ " return\n",
+ " minibatch = random.sample(self.replay_buffer, self.batch_size)\n",
+ " mb_arr = np.array(minibatch, dtype=object)\n",
  "\n",
  " next_state_arr = np.stack(mb_arr[:,3])\n",
- " target_model_predict = self.target_model.predict(next_state_arr, verbose=0)\n",
- " max_action_arr = np.argmax(target_model_predict, axis=1)\n",
- " q_targets = []\n",
- " for idx,val in enumerate(zip(target_model_predict, max_action_arr)):\n",
- " row, col = val\n",
- " # if done\n",
- " if mb_arr[idx,4] == True:\n",
- " q_targets.append(mb_arr[idx,2])\n",
+ " future_qvalues = self.target_model.predict(next_state_arr, verbose=0)\n",
+ "\n",
+ " state_arr = np.stack(mb_arr[:,0])\n",
+ " qvalues = self.model.predict(state_arr, verbose=0)\n",
+ "\n",
+ " for index, (state, action, reward, next_state, done) in enumerate(minibatch):\n",
+ " if done == True:\n",
+ " q_target = reward\n",
  " else:\n",
- " q_targets.append(row[col])\n",
+ " q_target = reward + self.gamma * np.max(future_qvalues[index])\n",
  "\n",
- " q_targets = np.array(q_targets)\n",
- " reward_arr = np.stack(mb_arr[:,2])\n",
- " # targets Yj\n",
- " target_arr = (reward_arr + self.gamma * q_targets)\n",
+ " q_curr = qvalues[index]\n",
+ " q_curr[action] = q_target \n",
+ " X.append(state)\n",
+ " y.append(q_curr)\n",
  "\n",
  " # Perform gradient step\n",
- " state_arr = np.stack(mb_arr[:,0])\n",
- " model_predict = self.model.predict(state_arr, verbose=0)\n",
- " action_arr = np.stack(mb_arr[:,1])\n",
- " f_targets=[]\n",
- " for idx, val in enumerate(zip(action_arr, target_arr)):\n",
- " act, targ = val\n",
- " model_predict[idx][act] = targ\n",
+ " X, y = np.array(X), np.array(y)\n",
+ " history = self.model.fit(X, y, batch_size = self.batch_size, shuffle = False, verbose=0)\n",
+ " # history = self.model.fit(X, y, epochs=1, verbose=0)\n",
+ " # print(f\"Loss: {history.history['loss']} \")\n",
  "\n",
- " history = self.model.fit(state_arr, model_predict, epochs=1, verbose=0)\n",
- " print(f\"Loss: {history.history['loss']} \")\n",
- " # update epsilon\n",
- " if self.epsilon > self.epsilon_min:\n",
- " self.epsilon *= self.epsilon_decay\n",
  "\n",
  " def learn(self, total_steps=None):\n",
- "\n",
- " state = self.env.reset()\n",
+ " #create scaler\n",
+ " self._min_max()\n",
+ " current_episode = 0\n",
  " total_reward = 0\n",
- " rewards = []\n",
- " for current_step in tqdm(range(total_steps)):\n",
- "\n",
- " # e-greedy\n",
- " if np.random.rand() <= self.epsilon:\n",
- " action = random.randrange(self.action_size)\n",
- " else:\n",
- " model_predict = self.model.predict(np.array([state]), verbose=0)\n",
- " action = np.argmax(model_predict[0])\n",
+ " rewards = [0]\n",
+ " current_step = 0\n",
+ " while current_step < total_steps:\n",
+ " current_episode += 1\n",
+ " state = self.env.reset()\n",
+ " total_reward = 0\n",
+ " done = False\n",
+ " while done != True:\n",
+ " current_step +=1\n",
+ " # e-greedy\n",
+ " if np.random.random() > (1 - self.epsilon):\n",
+ " action = random.randrange(self.action_size)\n",
+ " else:\n",
+ " model_predict = self.model.predict(np.array([state]), verbose=0)\n",
+ " action = np.argmax(model_predict)\n",
  "\n",
- " # step\n",
- " next_state, reward, done, info = self.env.step(action)\n",
- " total_reward += reward\n",
- " # add to buffer\n",
- " self.replay_buffer.append((state, action, reward, next_state, done))\n",
+ " # step\n",
+ " next_state, reward, done, info = self.env.step(action)\n",
+ " total_reward += reward\n",
  "\n",
- " if done:\n",
- " rewards.append(total_reward)\n",
- " total_reward = 0\n",
- " state = self.env.reset()\n",
+ " # add to buffer\n",
+ " self.replay_buffer.append((state, action, reward, next_state, done))\n",
  "\n",
- " if current_step>10 and current_step % self.update_rate == 0:\n",
- " print(f\"epsilon:{self.epsilon} step:{current_step} mean_reward {np.mean(rewards)} \")\n",
- " self._train()\n",
- " # update target\n",
- " self.target_model.set_weights(self.model.get_weights())\n",
+ " if current_step>10 and current_step % self.update_rate == 0:\n",
+ " print(f\"epsilon:{self.epsilon} step:{current_step} episode:{current_episode} last_score {rewards[-1]} \")\n",
+ " self._train()\n",
+ " # update target\n",
+ " self.target_model.set_weights(self.model.get_weights())\n",
+ " \n",
+ " state = next_state\n",
  " \n",
+ " rewards.append(total_reward)\n",
+ " # update epsilon\n",
+ " if self.epsilon > self.epsilon_min:\n",
+ " self.epsilon *= self.epsilon_decay\n",
  " #\n",
  " # Loads a saved model\n",
  " #\n",
@@ -229,89 +213,73 @@
  "env = gym.make('CartPole-v1')\n",
  "\n",
  "model = DQN(env=env, replay_buffer_size=10_000, action_size=2)\n",
- "model.learn(total_steps=20_000)\n",
+ "model.learn(total_steps=6_000)\n",
  "env.close()"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# env = gym.make('CartPole-v1')\n",
- "\n",
- "# model = DQN(env=env, replay_buffer_size=10_000, action_size=2)\n",
- "\n",
- "# state = model.env.reset()\n",
- "# for i in range(100):\n",
- "# random_action = env.action_space.sample()\n",
- "# next_state, reward, done, info = model.env.step(random_action)\n",
- "# model.replay_buffer.append((state, random_action, reward, next_state, done))\n",
- "# if done:\n",
- "# state = model.env.reset()\n",
- "# else:\n",
- "# state = next_state\n",
- "\n",
- "# minibatch = random.sample(model.replay_buffer, 10)\n",
- "# mb = np.array(minibatch, dtype=object)\n",
- "# print(mb[:,0])\n",
- "# np.stack(mb[:,0])\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
+ "execution_count": 31,
  "metadata": {},
  "outputs": [],
  "source": [
- "model.save(\"./m1.h5\")"
+ "model.save(\"./alt/m1.h5\")"
  ]
  },
  {
  "cell_type": "code",
- "execution_count": 7,
+ "execution_count": 33,
  "metadata": {},
  "outputs": [
  {
  "name": "stdout",
  "output_type": "stream",
  "text": [
- "Model: \"sequential_2\"\n",
+ "Model: \"sequential_28\"\n",
  "_________________________________________________________________\n",
  " Layer (type) Output Shape Param # \n",
  "=================================================================\n",
- " dense_6 (Dense) (None, 128) 640 \n",
+ " dense_97 (Dense) (None, 512) 2560 \n",
  " \n",
- " dense_7 (Dense) (None, 64) 8256 \n",
+ " dense_98 (Dense) (None, 256) 131328 \n",
  " \n",
- " dense_8 (Dense) (None, 2) 130 \n",
+ " dense_99 (Dense) (None, 128) 32896 \n",
+ " \n",
+ " dense_100 (Dense) (None, 2) 258 \n",
  " \n",
  "=================================================================\n",
- "Total params: 9,026\n",
- "Trainable params: 9,026\n",
+ "Total params: 167,042\n",
+ "Trainable params: 167,042\n",
  "Non-trainable params: 0\n",
  "_________________________________________________________________\n",
- "1.0 {}\n"
+ "Total reward 500.0\n"
  ]
  }
  ],
  "source": [
  "eval_env = gym.make('CartPole-v1')\n",
  "model = DQN(env=eval_env, replay_buffer_size=10_000, action_size=2)\n",
- "model.load(\"./m1.h5\")\n",
+ "model.load(\"./alt/m1.h5\")\n",
  "eval_env = wrappers.Monitor(eval_env, \"./alt/gym-results\", force=True)\n",
  "state = eval_env.reset()\n",
+ "total_reward = 0\n",
  "for _ in range(1000):\n",
  " action = model.play(state)\n",
  " observation, reward, done, info = eval_env.step(action)\n",
- " # print(info)\n",
+ " total_reward +=reward\n",
  " state = observation\n",
  " if done: \n",
- " print(reward, info)\n",
+ " print(f\"Total reward {total_reward}\")\n",
  " break\n",
  "eval_env.close()"
  ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
  }
  ],
  "metadata": {
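Editor's note: the heart of this commit is the rewritten _train, which replaces the one-transition-at-a-time update with a batched one: it samples a minibatch from the replay buffer, predicts Q-values for all states with the online network and for all next states with the target network, overwrites only the taken action's value with the Bellman target, and fits once on the whole batch. The snippet below is a minimal standalone sketch of that target construction, not the notebook's exact cell; the function name build_targets and its signature are illustrative, and it assumes NumPy plus two compiled Keras models as in the committed code.

import numpy as np

def build_targets(model, target_model, minibatch, gamma=0.95):
    # minibatch: list of (state, action, reward, next_state, done) tuples,
    # as stored in the notebook's replay_buffer.
    states = np.array([t[0] for t in minibatch])
    next_states = np.array([t[3] for t in minibatch])

    q_values = model.predict(states, verbose=0)              # online network, Q(s, .)
    future_q = target_model.predict(next_states, verbose=0)  # target network, Q(s', .)

    for i, (state, action, reward, next_state, done) in enumerate(minibatch):
        # Bellman target: r for terminal transitions, else r + gamma * max_a' Q_target(s', a')
        q_target = reward if done else reward + gamma * np.max(future_q[i])
        q_values[i][action] = q_target  # only the taken action's value is overwritten

    return states, q_values

# Hypothetical usage, mirroring the gradient step in _train:
# X, y = build_targets(agent.model, agent.target_model, batch)
# agent.model.fit(X, y, batch_size=len(batch), shuffle=False, verbose=0)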
DQN_v1_result.mp4 ADDED
Binary file (23.8 kB).
 
DQN_v2.ipynb ADDED
The diff for this file is too large to render. See raw diff