tricodex committed
Commit
869aecb
1 Parent(s): e3b27e2

Upload 2 files

learning_machines_robobo/examples/full_project_setup/catkin_ws/src/learning_machines/src/learning_machines/task2_robobo_env_t3rework.py ADDED
@@ -0,0 +1,131 @@
+import os
+import cv2
+import numpy as np
+import gymnasium as gym
+from gymnasium import spaces
+from stable_baselines3 import PPO
+from stable_baselines3.common.vec_env import DummyVecEnv
+from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, EveryNTimesteps, CallbackList
+from datetime import datetime
+from robobo_interface import IRobobo, Position, Orientation
+from data_files import FIGRURES_DIR
+
+
+class RoboboEnv(gym.Env):
+    def __init__(self, rob: IRobobo, ep_steps=1024, simulation=True):
+        super(RoboboEnv, self).__init__()
+        self.robot = rob
+        self.simulation = simulation
+        self.ep_steps = ep_steps
+
+        # Define action and observation space
+        self.action_space = spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32)
+        self.observation_space = spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)
+
+        self.has_reset = False
+        self.current_step = 0
+
+    def reset(self, seed=None, options=None):
+        super().reset(seed=seed)  # Reset parent class
+        if self.simulation:
+            self.robot.stop_simulation()
+            self.robot.set_position(Position(0, 0, 0), Orientation(0, 0, 0))
+            self.robot.play_simulation()
+
+        self.robot.set_phone_tilt(240, 50)  # tilted down
+        self.robot.sleep(3)
+        self.current_step = 0
+        self.has_reset = True
+        return self._get_obs(), {}
+
+    def step(self, action):
+        if not self.has_reset:
+            raise RuntimeError("Environment must be reset before stepping")
+
+        self.current_step += 1
+
+        action[2] = (((action[2] + 1) / 2) * 0.75) + 0.15
+        left_speed, right_speed, duration_scale = action * np.array([50, 50, 800])
+
+        self.robot.move(left_speed, right_speed, duration_scale)
+        self.robot.sleep(action[2])
+
+        obs = self._get_obs()
+        reward = self._calculate_reward(obs)
+        print(self.current_step, reward)
+
+        done = self.current_step >= self.ep_steps
+
+        return obs, reward, done, False, {}
+
+    def _get_obs(self):
+        frame = self.robot.get_image_front()
+        frame = cv2.resize(frame, (64, 64))
+        return frame
+
+    def _calculate_reward(self, obs):
+        food_collected = self.robot.nr_food_collected()
+
+        # Convert to HSV for easier color detection
+        hsv = cv2.cvtColor(obs, cv2.COLOR_BGR2HSV)
+
+        # Detect green area (representing food)
+        lower_green = np.array([40, 50, 50])
+        upper_green = np.array([80, 255, 255])
+        green_mask = cv2.inRange(hsv, lower_green, upper_green)
+        green_area = np.sum(green_mask > 0)
+
+        # Calculate centroid of the green area
+        green_centroid = np.mean(np.argwhere(green_mask > 0), axis=0) if green_area > 0 else np.array([32, 32])
+
+        # Distance to the green centroid
+        distance_to_green = np.linalg.norm(np.array([32, 32]) - green_centroid)
+
+        # Reward for collecting food
+        reward = food_collected * 10
+
+        # Additional reward for getting closer to the green area
+        if green_area > 0:
+            reward += 5 / (distance_to_green + 1)  # Inverse of the distance to the green area
+
+        # Less reward if no food detected by base
+        if not self.robot.base_detects_food():
+            reward *= 0.5
+
+        return reward
+
+
+def run(rob: IRobobo):
+    # load_model = False
+    env = RoboboEnv(rob)
+    env = DummyVecEnv([lambda: env])
+
+    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    log_dir = os.path.join(FIGRURES_DIR, "models", f"PPO_{current_datetime}")
+    os.makedirs(log_dir, exist_ok=True)
+
+    steps = 1024
+    episodes = 1000
+    total_timesteps = steps * episodes
+
+    checkpoint_callback = CheckpointCallback(save_freq=total_timesteps // 1000, save_path=log_dir, name_prefix="ppo_model")
+    event_callback = EveryNTimesteps(n_steps=total_timesteps // 1000, callback=checkpoint_callback)
+
+    model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=log_dir,
+                learning_rate=1e-4,
+                n_steps=512,
+                batch_size=128,
+                n_epochs=5,
+                gamma=0.99,
+                ent_coef=0.01,
+                use_sde=True,
+                sde_sample_freq=4,
+                target_kl=0.015)
+
+    eval_callback = EvalCallback(env, best_model_save_path=log_dir,
+                                 log_path=log_dir, eval_freq=steps * 11,
+                                 deterministic=True, render=False)
+
+    callbacks = CallbackList([checkpoint_callback, event_callback, eval_callback])
+    model.learn(total_timesteps=total_timesteps, callback=callbacks)
+    model.save(os.path.join(log_dir, "final_model"))
learning_machines_robobo/examples/full_project_setup/catkin_ws/src/learning_machines/src/learning_machines/task3_rob_env_irs.py ADDED
@@ -0,0 +1,695 @@
+import os
+import cv2
+import numpy as np
+import gymnasium as gym
+from gymnasium import spaces
+from stable_baselines3 import PPO
+from stable_baselines3.common.vec_env import DummyVecEnv
+from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, EveryNTimesteps, CallbackList
+from datetime import datetime
+from robobo_interface import IRobobo, Position, Orientation
+from data_files import FIGRURES_DIR
+
+stream = True
+
+
+class RoboboEnv(gym.Env):
+    def __init__(self, rob: IRobobo, ep_steps=512, simulation=True):
+        super(RoboboEnv, self).__init__()
+        self.robot = rob
+        self.simulation = simulation
+        self.ep_steps = ep_steps
+
+        # Define action and observation space
+        self.action_space = spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32)
+
+        # Modified observation space with only the specified IR sensors
+        self.observation_space = spaces.Dict({
+            # Segments take values 0-3 (0 = none, 1 = red, 2 = green, 3 = both), so high=3
+            'image_segments': spaces.Box(low=0, high=3, shape=(3,), dtype=np.uint8),
+            'ir_sensors': spaces.Box(low=0, high=1, shape=(3,), dtype=np.float32)  # Normalized IR sensors
+        })
+
+        self.has_reset = False
+        self.current_step = 0
+        self.episode_reward = 0
+        self.best_completion_steps = float('inf')
+        self.consecutive_red_sightings = 0
+        self.previous_action = np.zeros(3)
+        self.stuck_count = 0  # Consecutive near-zero actions, used in _calculate_reward
+
+        # Curriculum variables
+        self.curriculum_stage = 4
+        # Curriculum stages (reordered)
+        self.stages = [
+            "find_red_box",
+            "smooth_movement",
+            "push_red_box",
+            "find_green_box",
+            "push_to_green_box"
+        ]
+        self.lost_red_box = False
+
+        # Adjust thresholds to match the new order (you may need to fine-tune these)
+        self.stage_threshold_rewards = [
+            4096,   # Stage 0: find_red_box
+            3072,   # Stage 1: smooth_movement
+            5120,   # Stage 2: push_red_box
+            6144,   # Stage 3: find_green_box
+            10640   # Stage 4: push_to_green_box
+        ]
+
+        self.stage_success_count = 0
+        self.recent_episode_rewards = []
+
+        self.consecutive_wall_detections = 0
+        self.max_wall_penalty = 10000000
+
+        self.visited_places = set()
+        self.grid_size = 0.1  # Size of each grid cell in meters
+        self.exploration_reward_scale = 10  # Scale factor for exploration reward
+
+        # Time tracking for final stage
+        self.start_time = None
+        self.best_completion_time = float('inf')
+
+        if stream:
+            cv2.namedWindow("Robot View", cv2.WINDOW_NORMAL)
+
+        # Initialize previous_front_distance
+        self.previous_front_distance = None
+
+    def reset(self, seed=None, options=None):
+        super().reset(seed=seed)
+        if self.simulation:
+            self.robot.stop_simulation()
+            self.robot.set_position(Position(-2.4000000000000026, 0.07699995934963236, 0.03970504179596901), Orientation(-1.57191039448275, -1.5144899542893442, -1.5719101704888712))
+            self.robot.play_simulation()
+
+        self.robot.set_phone_tilt(240, 50)  # tilted down
+        self.robot.sleep(2)
+        self.current_step = 0
+        self.episode_reward = 0
+        self.consecutive_red_sightings = 0
+        self.has_reset = True
+        self.previous_action = np.zeros(3)
+        self.consecutive_wall_detections = 0
+        self.stuck_count = 0
+
+        self.visited_places = set()
+
+        self.previous_front_distance = None
+
+        if self.curriculum_stage == len(self.stages) - 1:
+            self.start_time = datetime.now()
+
+        self.action_history = []
+        return self._get_obs(), {}
+
+    def step(self, action):
+        if not self.has_reset:
+            raise RuntimeError("Environment must be reset before stepping")
+
+        self.current_step += 1
+        self.action_history.append(action)
+        if len(self.action_history) > 5:
+            self.action_history.pop(0)
+
+        # Adjust action scaling for faster movements
+        action[2] = ((action[2] + 1) / 8) + 0.1  # Reduced upper limit of duration
+
+        # Convert normalized actions to robot commands
+        left_speed, right_speed, duration_scale = action * np.array([60, 60, 1000])  # Increased speed range, reduced duration
+
+        self.robot.move(left_speed, right_speed, duration_scale)
+        self.robot.sleep(action[2] / 4)
+        # print actions and sleep
+        # print("----------------------")
+        # print("Left Speed:", left_speed, "Right Speed:", right_speed, "Duration Scale:", duration_scale, "Action[2]:", action[2])
+        # print("----------------------")
+
+        obs = self._get_obs()
+        # print("Observation:")
+        # print(f"  Image Segments: {obs['image_segments']}")
+        # print(f"  IR Sensors: {obs['ir_sensors']}")
+
+        if stream:
+            self._stream_frame(obs['image_segments'])
+
+        # exploration_reward = self._calculate_exploration_reward()
+        # reward += exploration_reward
+
+        reward = self._calculate_reward(obs, action)
+        self.episode_reward += reward
+        print(f"Step: {self.current_step}, Reward: {reward}, Stage: {self.stages[self.curriculum_stage]}")
+
+        done = self._check_done(obs)
+
+        if done:
+            self._check_curriculum_progress()
+
+        self.previous_action = action
+
+        return obs, reward, done, False, {}
+
+    def _get_obs(self):
+        frame = self.robot.get_image_front()
+        height, width = frame.shape[:2]
+        crop_height = int(height * 0.6)
+        frame = frame[height - crop_height:height, :]
+        frame = cv2.resize(frame, (64, 64))
+
+        hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)
+
+        lower_red = np.array([0, 50, 50])
+        upper_red = np.array([10, 255, 255])
+        lower_green = np.array([40, 50, 50])
+        upper_green = np.array([80, 255, 255])
+
+        red_mask = cv2.inRange(hsv, lower_red, upper_red)
+        green_mask = cv2.inRange(hsv, lower_green, upper_green)
+
+        segment_width = 21  # 64 // 3
+        image_segments = np.zeros(3, dtype=np.uint8)
+
+        for i in range(3):
+            start = i * segment_width
+            end = (i + 1) * segment_width
+            red_area = np.sum(red_mask[:, start:end] > 0)
+            green_area = np.sum(green_mask[:, start:end] > 0)
+
+            if red_area > 100 and green_area > 100:
+                image_segments[i] = 3
+            elif red_area > 100:
+                image_segments[i] = 1
+            elif green_area > 100:
+                image_segments[i] = 2
+
+        all_ir_sensors = self.robot.read_irs()
+        ir_sensors = np.array([all_ir_sensors[4], all_ir_sensors[5], all_ir_sensors[7]], dtype=np.float32) / 1000.0
+
+        return {
+            'image_segments': image_segments,
+            'ir_sensors': ir_sensors
+        }
+
+    def _stream_frame(self, image_segments):
+        # Create a visual representation of the image segments
+        frame = np.zeros((64, 64, 3), dtype=np.uint8)
+        segment_width = frame.shape[1] // 3
+        for i, color in enumerate(image_segments):
+            if color == 1:  # Red
+                frame[:, i*segment_width:(i+1)*segment_width] = [0, 0, 255]
+            elif color == 2:  # Green
+                frame[:, i*segment_width:(i+1)*segment_width] = [0, 255, 0]
+            elif color == 3:  # Both red and green
+                frame[:, i*segment_width:(i+1)*segment_width] = [0, 255, 255]  # Yellow to represent both
+        cv2.imshow("Robot View", cv2.resize(frame, (320, 320)))
+        cv2.waitKey(1)
+
+    def close(self):
+        if stream:
+            cv2.destroyAllWindows()
+
+    def get_stage_threshold(self):
+        return self.stage_threshold_rewards[self.curriculum_stage]
+
+    def _calculate_exploration_reward(self):
+        # Get current position
+        position = self.robot.get_position()
+        x, y = position.x, position.y
+
+        # Discretize the position to grid coordinates
+        grid_x = int(x / self.grid_size)
+        grid_y = int(y / self.grid_size)
+
+        # Check if this grid cell has been visited before
+        grid_pos = (grid_x, grid_y)
+        if grid_pos not in self.visited_places:
+            self.visited_places.add(grid_pos)
+            return 10 * self.exploration_reward_scale**2
+        return -100
+
+    def _calculate_reward(self, obs, action):
+        image_segments = obs['image_segments']
+        ir_sensors = obs['ir_sensors']
+
+        reward = 0
+
+        # if wall_detected:  artifact for training till 30712
+        #     reward -= 10
+
+        # Common penalties for all stages
+        movement_penalty = np.sum(np.abs(action - self.previous_action))
+        reward -= movement_penalty * 0.75  # Reduced penalty for unnecessary movement
+
+        if action[0] == action[1]:
+            reward += 100
+        else:
+            reward -= 50
+
+        # Penalty for spinning
+        if np.abs(action[0] - action[1]) > 0.3:
+            reward -= 1001  # Reduced penalty for spinning
+
+        # New penalty for fast turns
+        turn_speed = abs(action[0] - action[1])  # Difference between left and right wheel speeds
+        fast_turn_threshold = 0.3  # Adjust this value as needed
+        if turn_speed > fast_turn_threshold:
+            fast_turn_penalty = -200 * (turn_speed - fast_turn_threshold)**2  # Quadratic penalty
+            reward += fast_turn_penalty
+
+        red_visible = 1 in image_segments
+        red_centered = image_segments[1] == 1
+
+        # Improved object detection
+        object_close = ir_sensors[0] > 0.1 and ir_sensors[1] < 0.01 and ir_sensors[2] < 0.01
+
+        # New wall avoidance logic with controlled penalties
+        front_distance = ir_sensors[0]
+        left_distance = ir_sensors[1]
+        right_distance = ir_sensors[2]
+
+        wall_penalty = 0
+
+        # Quadratic penalty for getting close to walls
+        wall_threshold = 0.1
+        for distance in [left_distance, right_distance]:
+            if distance > wall_threshold:
+                wall_penalty -= min(1000, 100 * ((distance - wall_threshold) ** 2))
+
+        # Additional penalty for being too close to walls on multiple sides
+        close_walls = sum(1 for d in [front_distance, left_distance, right_distance] if d > wall_threshold)
+        if close_walls > 1:
+            wall_penalty -= 100 * close_walls
+
+        # Encourage turning away from walls
+        if front_distance > 0.08 or max(left_distance, right_distance) > 0.08:
+            turn_encouragement = 50 * abs(action[0] - action[1])
+            wall_penalty += turn_encouragement
+
+        # Reward for moving away from walls
+        if self.previous_front_distance is not None:
+            if front_distance < self.previous_front_distance:
+                wall_penalty += 20
+
+        self.previous_front_distance = front_distance
+
+        # Reward for safe navigation in open space
+        if all(sensor < 0.05 for sensor in ir_sensors):
+            reward += 10
+
+        # Stuck detection with increased penalty
+        if np.all(np.abs(action) < 0.1):  # If the action is very small in all dimensions
+            self.stuck_count += 1
+            if self.stuck_count > 5:  # If stuck for more than 5 timesteps
+                reward -= 100  # Increased penalty for being stuck
+        else:
+            self.stuck_count = 0
+
+        # Red box detection rewards (scaled down compared to wall penalties)
+        if red_visible:
+            self.consecutive_red_sightings += 1
+            reward += min(12000, self.consecutive_red_sightings**1.1)  # Cap the reward
+            if red_centered:
+                reward += 1
+            if object_close:
+                reward += 3
+        else:
+            self.consecutive_red_sightings = 0
+            reward -= 10
+
+        # Check for red box inside green area at every stage
+        if self._is_red_inside_green(obs):
+            steps_factor = max(0, 1 - (self.current_step / self.ep_steps))
+            omega_reward = 100000 * (1 + steps_factor)  # Reduced large reward for completing the task
+            reward += omega_reward
+            print(f"Red box inside green area! Omega reward: {omega_reward}")
+            return reward  # End the episode immediately with a large reward
+
+        # Stage-specific rewards
+        if self.curriculum_stage == 0:  # find_red_box stage
+            if red_visible:
+                reward += self.consecutive_red_sightings
+                if red_centered:
+                    reward += 1
+                if object_close:
+                    reward += 2
+            else:
+                reward -= 0.5
+
+        elif self.curriculum_stage == 1:  # smooth_movement stage
+            # Reward for moving straight
+            if np.abs(action[0] - action[1]) < 0.1:
+                reward += 1
+
+            # Penalty for frequent direction changes
+            if len(self.action_history) >= 3:
+                last_three_actions = self.action_history[-3:]
+                if all(np.abs(a[0] - a[1]) > 0.2 for a in last_three_actions):
+                    reward -= 2  # Reduced penalty for 3 twists/turns in a row
+
+            # # Reward for detecting and approaching red box (ARTIFACTR2)
+            # if red_visible:
+            #     reward += 30
+            #     if red_centered:
+            #         reward += 20
+            #     if object_close:
+            #         reward += 50
+            # Keep the existing red box logic, but scale down rewards
+
+        elif self.curriculum_stage == 2:  # push_red_box stage
+            if red_visible and object_close:
+                reward += 5
+                if red_centered:
+                    reward += 2
+            elif red_visible or object_close:
+                reward += 2
+            else:
+                reward -= 1
+
+        elif self.curriculum_stage == 3:  # find_green_box stage
+            red_reward = self._get_color_reward(obs, 1)
+            green_reward = self._get_color_reward(obs, 2)
+            if red_reward > 0 and green_reward > 0:
+                reward += (red_reward + green_reward) * 1.5
+            elif green_reward > 0:
+                reward += green_reward
+            elif red_reward > 0:
+                reward += red_reward * 0.5
+            else:
+                reward -= 5
+
+        elif self.curriculum_stage == 4:  # push_to_green_box stage
+            red_reward = self._get_color_reward(obs, 1)
+            green_reward = self._get_color_reward(obs, 2)
+            both_colors_reward = 10 if 3 in obs['image_segments'] else 0
+
+            if red_reward > 0 and green_reward > 0:
+                red_positions = np.where(obs['image_segments'] == 1)[0]
+                green_positions = np.where(obs['image_segments'] == 2)[0]
+                if len(red_positions) > 0 and len(green_positions) > 0:
+                    red_index = red_positions[0]
+                    green_index = green_positions[0]
+                    distance = abs(red_index - green_index)
+                    proximity_reward = 25 / (distance + 1)
+                    reward += red_reward + green_reward + proximity_reward + both_colors_reward
+                else:
+                    reward += red_reward + green_reward + both_colors_reward
+            elif green_reward > 0:
+                reward += green_reward
+            elif red_reward > 0:
+                reward += red_reward * 0.5
+            else:
+                reward -= 5
+
+        reward += wall_penalty
+        # Encourage forward movement for all stages
+        # if action[0] > 0.1 and action[1] > 0.1:  # if action[0] > 0.25 ARTIFACT 5
+        #     reward += 5
+        # Add exploration reward
+        exploration_reward = self._calculate_exploration_reward()
+        reward += exploration_reward
+
+        # Apply omega penalty if episode is ending due to losing sight of red box
+        if self.lost_red_box:
+            omega_penalty = -100000  # Large negative reward
+            print(f"Lost sight of red box! Omega penalty: {omega_penalty}")
+            reward += omega_penalty
+            self.lost_red_box = False  # Reset the flag
+
+        return reward
+
+    def _is_red_inside_green(self, obs):
+        image_segments = obs['image_segments']
+        return 3 in image_segments and image_segments[1] == 3 and obs['ir_sensors'][0] > 0.1
+
+    def _check_done(self, obs):
+        if self.current_step >= self.ep_steps:
+            return True
+
+        if self._is_red_inside_green(obs):
+            return True
+
+        if self.consecutive_red_sightings == 0 and self.current_step > 100:
+            self.lost_red_box = True
+            return True
+        return False
+
+    def _get_color_reward(self, obs, color, multiplier=1):
+        if color in obs['image_segments']:
+            base_reward = 2 if color == 1 else 1  # Higher base reward for red
+            if obs['image_segments'][1] == color:  # Check if color is in center segment
+                return base_reward * 3 * multiplier  # Triple reward for center
+            else:
+                return base_reward * multiplier
+        return 0
+
+    def _check_curriculum_progress(self):
+        self.recent_episode_rewards.append(self.episode_reward)
+        if len(self.recent_episode_rewards) > 3:
+            self.recent_episode_rewards.pop(0)
+
+        if len(self.recent_episode_rewards) == 3:
+            avg_reward = sum(self.recent_episode_rewards) / 3
+            if avg_reward >= self.stage_threshold_rewards[self.curriculum_stage]:
+                self.stage_success_count += 1
+            else:
+                self.stage_success_count = 0
+
+            if self.stage_success_count >= 1:
+                self.increase_difficulty()
+                self.stage_success_count = 0
+                self.recent_episode_rewards = []
+
+        if self.curriculum_stage == len(self.stages) - 1:
+            if self._is_red_inside_green(self._get_obs()):
+                if self.current_step < self.best_completion_steps:
+                    self.best_completion_steps = self.current_step
+                    print(f"New best completion in {self.best_completion_steps} steps!")
+
+    def increase_difficulty(self):
+        if self.curriculum_stage < len(self.stages) - 1:
+            self.curriculum_stage += 1
+            print(f"Progressing to stage: {self.stages[self.curriculum_stage]}")
+        else:
+            print("Maximum stage reached")
+
+    def get_current_stage(self):
+        return self.stages[self.curriculum_stage]
+
+
+def run(rob: IRobobo):
+    train = True
+
+    load_model = True  # Set to True if you want to load a pre-trained model
+    steps = 512  # Reduced number of steps per episode
+    episodes = 2000  # Increased number of episodes
+    total_timesteps = steps * episodes
+
+    # Common setup
+    env = RoboboEnv(rob)
+    env = DummyVecEnv([lambda: env])
+
+    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    log_dir = os.path.join(FIGRURES_DIR, "models", f"PPO_{current_datetime}")
+    os.makedirs(log_dir, exist_ok=True)
+    if train:
+        # Callbacks setup
+        checkpoint_callback = CheckpointCallback(save_freq=total_timesteps // 1000, save_path=log_dir, name_prefix="ppo_model")
+        event_callback = EveryNTimesteps(n_steps=total_timesteps // 1000, callback=checkpoint_callback)
+        eval_callback = EvalCallback(env, best_model_save_path=log_dir,
+                                     log_path=log_dir, eval_freq=steps * 20,
+                                     deterministic=True, render=False)
+        callbacks = CallbackList([checkpoint_callback, event_callback, eval_callback])
+
+        if not load_model:
+            # Create new model with curriculum learning
+            # model = PPO("MultiInputPolicy", env, verbose=1, tensorboard_log=log_dir,
+            #             learning_rate=lambda f: f * 3e-4,  # Increased initial learning rate
+            #             n_steps=256,
+            #             batch_size=64,
+            #             n_epochs=10,
+            #             gamma=0.99,
+            #             ent_coef=0.01,
+            #             use_sde=True,
+            #             sde_sample_freq=4,
+            #             target_kl=0.02,
+            #             clip_range=lambda f: f * 0.2)  # Clip range annealing
+            # model = PPO("MultiInputPolicy", env, verbose=1, tensorboard_log=log_dir,
+            #             learning_rate=1e-3,  # Fixed, higher learning rate
+            #             n_steps=512,  # Increased steps per update
+            #             batch_size=64,
+            #             n_epochs=10,
+            #             gamma=0.99,
+            #             ent_coef=0.05,
+            #             use_sde=False,  # Disabled SDE
+            #             target_kl=0.02,
+            #             clip_range=0.3)
+            # model = PPO("MultiInputPolicy", env, verbose=1, tensorboard_log=log_dir,
+            #             learning_rate=5e-4,  # Reduced learning rate
+            #             n_steps=100,  # Keep increased steps per update
+            #             batch_size=64,
+            #             n_epochs=10,
+            #             gamma=0.99,
+            #             ent_coef=0.02,
+            #             use_sde=False,
+            #             target_kl=0.015,
+            #             clip_range=0.2)
+            model = PPO("MultiInputPolicy", env, verbose=1, tensorboard_log=log_dir,
+                        learning_rate=1e-3,
+                        batch_size=32,
+                        n_epochs=5,
+                        gamma=0.99,
+                        ent_coef=0.01,
+                        use_sde=False,
+                        target_kl=0.02,
+                        clip_range=0.2)
+        else:
+            # Load pre-trained model
+            # path = os.path.join(FIGRURES_DIR, "models", "PPO_2024-06-28_11-53-39", "ppo_model_2047_steps") #30718 20479 || ppo_model_10240_stepsGOOD+8185+ppo_model_6139_steps(c3) || back to PPO_2024-06-27_16-47-34 ppo_model_5116_steps(c3) +PPO_2024-06-27_19-43-11ppo_model_4093_steps+PPO_2024-06-27_20-59-07ppo_model_3570_steps+ppo_model_5116_stepsGOODPPO_2024-06-27_21-22-07+ppo_model_71611_steps
+            # model = PPO.load(path, env=env)
+            # model.learning_rate = 2e-2
+            # model.learn(total_timesteps=5000)  # Train for a short while with higher rate
+            # model.learning_rate = 1e-3  # Then reduce it
+            # model.learn(total_timesteps=20000)  # Continue training
+            # # Final lower learning rate phase
+            # model.learning_rate = 5e-4
+            # model.learn(total_timesteps=5000)
+            # model.n_steps = 256
+            # model.batch_size = 64
+            # model.n_epochs = 10
+            # model.ent_coef = 0.05
+
+            # model.gamma = 0.99
+
+            # model.rollout_buffer = model.rollout_buffer.__class__(
+            #     model.n_steps,
+            #     model.observation_space,
+            #     model.action_space,
+            #     device=model.device,
+            #     gamma=model.gamma,
+            #     gae_lambda=model.gae_lambda,
+            #     n_envs=model.n_envs,
+            # )
+            # model.rollout_buffer.reset()
+            # path = os.path.join(FIGRURES_DIR, "models", "PPO_2024-06-27_23-43-49", "ppo_model_5116_steps")
+            path = os.path.join(FIGRURES_DIR, "models", "PPO_2024-06-28_13-57-22", "ppo_model_10231_stepsWORKWORKWORK.zip")
+
+            model = PPO.load(path, env=env)
+
+            # Define a function for a very high clip range (effectively no clipping)
+            def high_clip(progress_remaining):
+                return 5.0
+
+            # Initial phase with high learning rate and effectively no clipping
+            model.learning_rate = 1e-3
+            model.clip_range = high_clip
+            model.learn(total_timesteps=5000)
+
+            # Second phase with slightly lower learning rate but still high
+            model.learning_rate = 1e-4
+            model.learn(total_timesteps=20000)
+
+            # Define a function for standard clipping
+            def standard_clip(progress_remaining):
+                return 0.2
+
+            # Final phase with lower learning rate and standard clipping
+            model.learning_rate = 5e-4
+            model.clip_range = standard_clip
+            model.learn(total_timesteps=5000)
+
+            # Set other parameters
+            model.n_steps = 512
+            model.batch_size = 64
+            model.n_epochs = 10
+            model.ent_coef = 0.05
+            model.gamma = 0.99
+
+            # Reset the rollout buffer with the final parameters
+            model.rollout_buffer = model.rollout_buffer.__class__(
+                model.n_steps,
+                model.observation_space,
+                model.action_space,
+                device=model.device,
+                gamma=model.gamma,
+                gae_lambda=model.gae_lambda,
+                n_envs=model.n_envs,
+            )
+            model.rollout_buffer.reset()
+
+        ## Curriculum learning
+        for i in range(5):  # 5 stages of curriculum
+            current_stage = env.env_method("get_current_stage")[0]
+            print(f"Curriculum stage {i+1}/5: {current_stage}")
+            print(f"Threshold reward for this stage: {env.env_method('get_stage_threshold')[0]}")
+            model.learn(total_timesteps=total_timesteps // 5, callback=callbacks)
+            # Progress to the next stage
+            env.env_method("increase_difficulty")
+
+        # Save the final model
+        model.save(os.path.join(log_dir, "final_model"))
+    else:
+        # Evaluation code
+        # Load the specific model you want to evaluate
+        path = os.path.join(FIGRURES_DIR, "models", "PPO_2024-06-28_13-57-22", "ppo_model_10231_stepsWORKWORKWORK.zip")
+        model = PPO.load(path, env=env)
+
+        # Evaluate the model
+        eval_episodes = 10
+        for episode in range(eval_episodes):
+            obs = env.reset()
+            done = False
+            total_reward = 0
+            step = 0
+
+            while not done:
+                action, _ = model.predict(obs, deterministic=True)
+                obs, reward, done, info = env.step(action)
+                reward = reward[0]  # Extract scalar value from numpy array
+                total_reward += reward
+                step += 1
+
+                # Print step information
+                print(f"Episode {episode+1}, Step {step}: Reward = {reward:.2f}, Total Reward = {total_reward:.2f}")
+
+                # Check if the episode is done
+                if done[0]:
+                    break
+
+                # Check if the episode is done due to reaching the maximum steps
+                if step >= env.get_attr('ep_steps')[0]:
+                    print(f"Episode {episode+1} reached maximum steps.")
+                    break
+
+            print(f"Episode {episode+1} finished: Total Reward = {total_reward:.2f}, Steps = {step}")
+
+        # Print the final stage reached
+        final_stage = env.env_method('get_current_stage')[0]  # env_method calls the method on the wrapped env
+        print(f"Final stage reached: {final_stage}")
+
+        print("Evaluation complete.")
+
+# Position: -2.4000000000000026 0.07699995934963236 0.03970504179596901
+# Orientation: -1.57191039448275 -1.5144899542893442 -1.5719101704888712
+
+
+# PPO_2024-06-27_23-43-49 ppo_model_71611_steps longest training + PPO_2024-06-28_09-09-00 ppo_model_10231_steps
+
+"""
++
+PPO_2024-06-28_10-47-03
+ppo_model_8185_steps
++
+PPO_2024-06-28_11-53-39
+ppo_model_2047_steps
+------------------------------------------------------------
+PPO_2024-06-27_23-43-49
+ppo_model_38875_steps
+ppo_model_10231_steps
+ppo_model_5116_steps
+
+"""