Upload 2 files
learning_machines_robobo/examples/full_project_setup/catkin_ws/src/learning_machines/src/learning_machines/task2_robobo_env_t3rework.py
ADDED
@@ -0,0 +1,131 @@
import os
import cv2
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, EveryNTimesteps, CallbackList
from datetime import datetime
from robobo_interface import IRobobo, Position, Orientation
from data_files import FIGRURES_DIR


class RoboboEnv(gym.Env):
    def __init__(self, rob: IRobobo, ep_steps=1024, simulation=True):
        super(RoboboEnv, self).__init__()
        self.robot = rob
        self.simulation = simulation
        self.ep_steps = ep_steps

        # Define action and observation space
        self.action_space = spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32)
        self.observation_space = spaces.Box(low=0, high=255, shape=(64, 64, 3), dtype=np.uint8)

        self.has_reset = False
        self.current_step = 0

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)  # Reset parent class
        if self.simulation:
            self.robot.stop_simulation()
            self.robot.set_position(Position(0, 0, 0), Orientation(0, 0, 0))
            self.robot.play_simulation()

        self.robot.set_phone_tilt(240, 50)  # tilted down
        self.robot.sleep(3)
        self.current_step = 0
        self.has_reset = True
        return self._get_obs(), {}

    def step(self, action):
        if not self.has_reset:
            raise RuntimeError("Environment must be reset before stepping")

        self.current_step += 1

        action[2] = (((action[2] + 1) / 2) * 0.75) + 0.15
        left_speed, right_speed, duration_scale = action * np.array([50, 50, 800])

        self.robot.move(left_speed, right_speed, duration_scale)
        self.robot.sleep(action[2])

        obs = self._get_obs()
        reward = self._calculate_reward(obs)
        print(self.current_step, reward)

        done = self.current_step >= self.ep_steps

        return obs, reward, done, False, {}

    def _get_obs(self):
        frame = self.robot.get_image_front()
        frame = cv2.resize(frame, (64, 64))
        return frame

    def _calculate_reward(self, obs):
        food_collected = self.robot.nr_food_collected()

        # Convert to HSV for easier color detection
        hsv = cv2.cvtColor(obs, cv2.COLOR_BGR2HSV)

        # Detect green area (representing food)
        lower_green = np.array([40, 50, 50])
        upper_green = np.array([80, 255, 255])
        green_mask = cv2.inRange(hsv, lower_green, upper_green)
        green_area = np.sum(green_mask > 0)

        # Calculate centroid of the green area
        green_centroid = np.mean(np.argwhere(green_mask > 0), axis=0) if green_area > 0 else np.array([32, 32])

        # Distance to the green centroid
        distance_to_green = np.linalg.norm(np.array([32, 32]) - green_centroid)

        # Reward for collecting food
        reward = food_collected * 10

        # Additional reward for getting closer to the green area
        if green_area > 0:
            reward += 5 / (distance_to_green + 1)  # Inverse of the distance to the green area for reward

        # Less reward if no food detected by base
        if not self.robot.base_detects_food():
            reward *= 0.5

        return reward


def run(rob: IRobobo):
    # load_model = False
    env = RoboboEnv(rob)
    env = DummyVecEnv([lambda: env])

    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    log_dir = os.path.join(FIGRURES_DIR, "models", f"PPO_{current_datetime}")
    os.makedirs(log_dir, exist_ok=True)

    steps = 1024
    episodes = 1000
    total_timesteps = steps * episodes

    checkpoint_callback = CheckpointCallback(save_freq=total_timesteps // 1000, save_path=log_dir, name_prefix="ppo_model")
    event_callback = EveryNTimesteps(n_steps=total_timesteps // 1000, callback=checkpoint_callback)

    model = PPO("CnnPolicy", env, verbose=1, tensorboard_log=log_dir,
                learning_rate=1e-4,
                n_steps=512,
                batch_size=128,
                n_epochs=5,
                gamma=0.99,
                ent_coef=0.01,
                use_sde=True,
                sde_sample_freq=4,
                target_kl=0.015)

    eval_callback = EvalCallback(env, best_model_save_path=log_dir,
                                 log_path=log_dir, eval_freq=steps * 11,
                                 deterministic=True, render=False)

    callbacks = CallbackList([checkpoint_callback, event_callback, eval_callback])
    model.learn(total_timesteps=total_timesteps, callback=callbacks)
    model.save(os.path.join(log_dir, "final_model"))
learning_machines_robobo/examples/full_project_setup/catkin_ws/src/learning_machines/src/learning_machines/task3_rob_env_irs.py
ADDED
@@ -0,0 +1,695 @@
import os
import cv2
import numpy as np
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import EvalCallback, CheckpointCallback, EveryNTimesteps, CallbackList
from datetime import datetime
from robobo_interface import IRobobo, Position, Orientation
from data_files import FIGRURES_DIR

stream = True


class RoboboEnv(gym.Env):
    def __init__(self, rob: IRobobo, ep_steps=512, simulation=True):
        super(RoboboEnv, self).__init__()
        self.robot = rob
        self.simulation = simulation
        self.ep_steps = ep_steps

        # Define action and observation space
        self.action_space = spaces.Box(low=-1, high=1, shape=(3,), dtype=np.float32)

        # Modified observation space with only the selected IR sensors.
        # Each image segment is 0 = empty, 1 = red, 2 = green, 3 = both,
        # so the upper bound must be 3 (a bound of 2 would not cover
        # the value 3 that _get_obs assigns).
        self.observation_space = spaces.Dict({
            'image_segments': spaces.Box(low=0, high=3, shape=(3,), dtype=np.uint8),
            'ir_sensors': spaces.Box(low=0, high=1, shape=(3,), dtype=np.float32)  # Normalized IR sensors
        })

        self.has_reset = False
        self.current_step = 0
        self.episode_reward = 0
        self.best_completion_steps = float('inf')
        self.consecutive_red_sightings = 0
        self.previous_action = np.zeros(3)
        self.stuck_count = 0  # used by the stuck-detection penalty in _calculate_reward; was previously uninitialized

        # Curriculum variables
        self.curriculum_stage = 4
        # Reordered curriculum stages
        self.stages = [
            "find_red_box",
            "smooth_movement",
            "push_red_box",
            "find_green_box",
            "push_to_green_box"
        ]
        self.lost_red_box = False

        # Adjust thresholds to match the new order (these may need fine-tuning)
        self.stage_threshold_rewards = [
            4096,   # Stage 0: find_red_box
            3072,   # Stage 1: smooth_movement
            5120,   # Stage 2: push_red_box
            6144,   # Stage 3: find_green_box
            10640   # Stage 4: push_to_green_box
        ]

        self.stage_success_count = 0
        self.recent_episode_rewards = []

        self.consecutive_wall_detections = 0
        self.max_wall_penalty = 10000000

        self.visited_places = set()
        self.grid_size = 0.1  # Size of each grid cell in meters
        self.exploration_reward_scale = 10  # Scale factor for exploration reward

        # Time tracking for final stage
        self.start_time = None
        self.best_completion_time = float('inf')

        if stream:
            cv2.namedWindow("Robot View", cv2.WINDOW_NORMAL)

        # Initialize previous_front_distance
        self.previous_front_distance = None

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        if self.simulation:
            self.robot.stop_simulation()
            self.robot.set_position(Position(-2.4000000000000026, 0.07699995934963236, 0.03970504179596901), Orientation(-1.57191039448275, -1.5144899542893442, -1.5719101704888712))
            self.robot.play_simulation()

        self.robot.set_phone_tilt(240, 50)  # tilted down
        self.robot.sleep(2)
        self.current_step = 0
        self.episode_reward = 0
        self.consecutive_red_sightings = 0
        self.has_reset = True
        self.previous_action = np.zeros(3)
        self.consecutive_wall_detections = 0
        self.stuck_count = 0

        self.visited_places = set()

        self.previous_front_distance = None

        if self.curriculum_stage == len(self.stages) - 1:
            self.start_time = datetime.now()

        self.action_history = []
        return self._get_obs(), {}

    def step(self, action):
        if not self.has_reset:
            raise RuntimeError("Environment must be reset before stepping")

        self.current_step += 1
        self.action_history.append(action)
        if len(self.action_history) > 5:
            self.action_history.pop(0)

        # Adjust action scaling for faster movements:
        # map the duration component from [-1, 1] to [0.1, 0.35] (reduced upper limit of duration)
        action[2] = ((action[2] + 1) / 8) + 0.1

        # Convert normalized actions to robot commands (increased speed range, reduced duration)
        left_speed, right_speed, duration_scale = action * np.array([60, 60, 1000])

        self.robot.move(left_speed, right_speed, duration_scale)
        self.robot.sleep(action[2] / 4)
        # print actions and sleep
        # print("----------------------")
        # print("Left Speed:", left_speed, "Right Speed:", right_speed, "Duration Scale:", duration_scale, "Action[2]:", action[2])
        # print("----------------------")

        obs = self._get_obs()
        # print("Observation:")
        # print(f"  Image Segments: {obs['image_segments']}")
        # print(f"  IR Sensors: {obs['ir_sensors']}")

        if stream:
            self._stream_frame(obs['image_segments'])

        # exploration_reward = self._calculate_exploration_reward()
        # reward += exploration_reward

        reward = self._calculate_reward(obs, action)
        self.episode_reward += reward
        print(f"Step: {self.current_step}, Reward: {reward}, Stage: {self.stages[self.curriculum_stage]}")

        done = self._check_done(obs)

        if done:
            self._check_curriculum_progress()

        self.previous_action = action

        return obs, reward, done, False, {}

    def _get_obs(self):
        frame = self.robot.get_image_front()
        height, width = frame.shape[:2]
        crop_height = int(height * 0.6)
        frame = frame[height - crop_height:height, :]
        frame = cv2.resize(frame, (64, 64))

        hsv = cv2.cvtColor(frame, cv2.COLOR_BGR2HSV)

        lower_red = np.array([0, 50, 50])
        upper_red = np.array([10, 255, 255])
        lower_green = np.array([40, 50, 50])
        upper_green = np.array([80, 255, 255])

        red_mask = cv2.inRange(hsv, lower_red, upper_red)
        green_mask = cv2.inRange(hsv, lower_green, upper_green)

        segment_width = 21  # 64 // 3
        image_segments = np.zeros(3, dtype=np.uint8)

        for i in range(3):
            start = i * segment_width
            end = (i + 1) * segment_width
            red_area = np.sum(red_mask[:, start:end] > 0)
            green_area = np.sum(green_mask[:, start:end] > 0)

            if red_area > 100 and green_area > 100:
                image_segments[i] = 3
            elif red_area > 100:
                image_segments[i] = 1
            elif green_area > 100:
                image_segments[i] = 2

        # Keep only three of the front-facing IR sensors; these readings grow
        # as an obstacle gets closer and are normalized to roughly [0, 1].
        all_ir_sensors = self.robot.read_irs()
        ir_sensors = np.array([all_ir_sensors[4], all_ir_sensors[5], all_ir_sensors[7]], dtype=np.float32) / 1000.0

        return {
            'image_segments': image_segments,
            'ir_sensors': ir_sensors
        }

    def _stream_frame(self, image_segments):
        # Create a visual representation of the image segments
        frame = np.zeros((64, 64, 3), dtype=np.uint8)
        segment_width = frame.shape[1] // 3
        for i, color in enumerate(image_segments):
            if color == 1:  # Red
                frame[:, i*segment_width:(i+1)*segment_width] = [0, 0, 255]
            elif color == 2:  # Green
                frame[:, i*segment_width:(i+1)*segment_width] = [0, 255, 0]
            elif color == 3:  # Both red and green
                frame[:, i*segment_width:(i+1)*segment_width] = [0, 255, 255]  # Yellow to represent both
        cv2.imshow("Robot View", cv2.resize(frame, (320, 320)))
        cv2.waitKey(1)

    def close(self):
        if stream:
            cv2.destroyAllWindows()

    def get_stage_threshold(self):
        return self.stage_threshold_rewards[self.curriculum_stage]

    def _calculate_exploration_reward(self):
        # Get current position
        position = self.robot.get_position()
        x, y = position.x, position.y

        # Discretize the position to grid coordinates
        grid_x = int(x / self.grid_size)
        grid_y = int(y / self.grid_size)

        # Check if this grid cell has been visited before
        grid_pos = (grid_x, grid_y)
        if grid_pos not in self.visited_places:
            self.visited_places.add(grid_pos)
            return 10 * self.exploration_reward_scale ** 2
        return -100

    def _calculate_reward(self, obs, action):
        image_segments = obs['image_segments']
        ir_sensors = obs['ir_sensors']

        reward = 0

        # if wall_detected:  (artifact for training till 30712)
        #     reward -= 10

        # Common penalties for all stages
        movement_penalty = np.sum(np.abs(action - self.previous_action))
        reward -= movement_penalty * 0.75  # Reduced penalty for unnecessary movement

        if action[0] == action[1]:
            reward += 100
        else:
            reward -= 50

        # Penalty for spinning
        if np.abs(action[0] - action[1]) > 0.3:
            reward -= 1001

        # New penalty for fast turns
        turn_speed = abs(action[0] - action[1])  # Difference between left and right wheel speeds
        fast_turn_threshold = 0.3  # Adjust this value as needed
        if turn_speed > fast_turn_threshold:
            fast_turn_penalty = -200 * (turn_speed - fast_turn_threshold) ** 2  # Quadratic penalty
            reward += fast_turn_penalty

        red_visible = 1 in image_segments
        red_centered = image_segments[1] == 1

        # Improved object detection
        object_close = ir_sensors[0] > 0.1 and ir_sensors[1] < 0.01 and ir_sensors[2] < 0.01

        # New wall avoidance logic with controlled penalties.
        # Note: these IR readings increase as an obstacle gets closer, so a
        # large "distance" value here actually means a nearby wall.
        front_distance = ir_sensors[0]
        left_distance = ir_sensors[1]
        right_distance = ir_sensors[2]

        wall_penalty = 0

        # Quadratic penalty for getting close to walls
        wall_threshold = 0.1
        for distance in [left_distance, right_distance]:
            if distance > wall_threshold:
                wall_penalty -= min(1000, 100 * ((distance - wall_threshold) ** 2))

        # Additional penalty for being too close to walls on multiple sides
        close_walls = sum(1 for d in [front_distance, left_distance, right_distance] if d > wall_threshold)
        if close_walls > 1:
            wall_penalty -= 100 * close_walls

        # Encourage turning away from walls
        if front_distance > 0.08 or max(left_distance, right_distance) > 0.08:
            turn_encouragement = 50 * abs(action[0] - action[1])
            wall_penalty += turn_encouragement

        # Reward for moving away from walls (a dropping front reading means the wall is receding)
        if self.previous_front_distance is not None:
            if front_distance < self.previous_front_distance:
                wall_penalty += 20

        self.previous_front_distance = front_distance

        # Reward for safe navigation in open space
        if all(sensor < 0.05 for sensor in ir_sensors):
            reward += 10

        # Stuck detection with increased penalty
        if np.all(np.abs(action) < 0.1):  # If the action is very small in all dimensions
            self.stuck_count += 1
            if self.stuck_count > 5:  # If stuck for more than 5 timesteps
                reward -= 100  # Increased penalty for being stuck
        else:
            self.stuck_count = 0

        # Red box detection rewards (scaled down compared to wall penalties)
        if red_visible:
            self.consecutive_red_sightings += 1
            reward += min(12000, self.consecutive_red_sightings ** 1.1)  # Cap the reward
            if red_centered:
                reward += 1
            if object_close:
                reward += 3
        else:
            self.consecutive_red_sightings = 0
            reward -= 10

        # Check for red box inside green area at every stage
        if self._is_red_inside_green(obs):
            steps_factor = max(0, 1 - (self.current_step / self.ep_steps))
            omega_reward = 100000 * (1 + steps_factor)  # Large reward for completing the task, scaled by speed
            reward += omega_reward
            print(f"Red box inside green area! Omega reward: {omega_reward}")
            return reward  # _check_done ends the episode on this same condition

        # Stage-specific rewards
        if self.curriculum_stage == 0:  # find_red_box stage
            if red_visible:
                reward += self.consecutive_red_sightings
                if red_centered:
                    reward += 1
                if object_close:
                    reward += 2
            else:
                reward -= 0.5

        elif self.curriculum_stage == 1:  # smooth_movement stage
            # Reward for moving straight
            if np.abs(action[0] - action[1]) < 0.1:
                reward += 1

            # Penalty for frequent direction changes
            if len(self.action_history) >= 3:
                last_three_actions = self.action_history[-3:]
                if all(np.abs(a[0] - a[1]) > 0.2 for a in last_three_actions):
                    reward -= 2  # Reduced penalty for 3 twists/turns in a row

            # # Reward for detecting and approaching red box (ARTIFACT R2)
            # if red_visible:
            #     reward += 30
            #     if red_centered:
            #         reward += 20
            #     if object_close:
            #         reward += 50
            # Keep the existing red box logic, but scale down rewards

        elif self.curriculum_stage == 2:  # push_red_box stage
            if red_visible and object_close:
                reward += 5
                if red_centered:
                    reward += 2
            elif red_visible or object_close:
                reward += 2
            else:
                reward -= 1

        elif self.curriculum_stage == 3:  # find_green_box stage
            red_reward = self._get_color_reward(obs, 1)
            green_reward = self._get_color_reward(obs, 2)
            if red_reward > 0 and green_reward > 0:
                reward += (red_reward + green_reward) * 1.5
            elif green_reward > 0:
                reward += green_reward
            elif red_reward > 0:
                reward += red_reward * 0.5
            else:
                reward -= 5

        elif self.curriculum_stage == 4:  # push_to_green_box stage
            red_reward = self._get_color_reward(obs, 1)
            green_reward = self._get_color_reward(obs, 2)
            both_colors_reward = 10 if 3 in obs['image_segments'] else 0

            if red_reward > 0 and green_reward > 0:
                red_positions = np.where(obs['image_segments'] == 1)[0]
                green_positions = np.where(obs['image_segments'] == 2)[0]
                if len(red_positions) > 0 and len(green_positions) > 0:
                    red_index = red_positions[0]
                    green_index = green_positions[0]
                    distance = abs(red_index - green_index)
                    proximity_reward = 25 / (distance + 1)
                    reward += red_reward + green_reward + proximity_reward + both_colors_reward
                else:
                    reward += red_reward + green_reward + both_colors_reward
            elif green_reward > 0:
                reward += green_reward
            elif red_reward > 0:
                reward += red_reward * 0.5
            else:
                reward -= 5

        reward += wall_penalty
        # Encourage forward movement for all stages
        # if action[0] > 0.1 and action[1] > 0.1:  # if action[0] > 0.25 (ARTIFACT 5)
        #     reward += 5

        # Add exploration reward
        exploration_reward = self._calculate_exploration_reward()
        reward += exploration_reward

        # Apply omega penalty if episode is ending due to losing sight of red box
        if self.lost_red_box:
            omega_penalty = -100000  # Large negative reward
            print(f"Lost sight of red box! Omega penalty: {omega_penalty}")
            reward += omega_penalty
            self.lost_red_box = False  # Reset the flag

        return reward

    def _is_red_inside_green(self, obs):
        image_segments = obs['image_segments']
        return 3 in image_segments and image_segments[1] == 3 and obs['ir_sensors'][0] > 0.1

    def _check_done(self, obs):
        if self.current_step >= self.ep_steps:
            return True

        if self._is_red_inside_green(obs):
            return True

        if self.consecutive_red_sightings == 0 and self.current_step > 100:
            self.lost_red_box = True
            return True
        return False

    def _get_color_reward(self, obs, color, multiplier=1):
        if color in obs['image_segments']:
            base_reward = 2 if color == 1 else 1  # Higher base reward for red
            if obs['image_segments'][1] == color:  # Check if color is in center segment
                return base_reward * 3 * multiplier  # Triple reward for center
            else:
                return base_reward * multiplier
        return 0

    def _check_curriculum_progress(self):
        self.recent_episode_rewards.append(self.episode_reward)
        if len(self.recent_episode_rewards) > 3:
            self.recent_episode_rewards.pop(0)

        if len(self.recent_episode_rewards) == 3:
            avg_reward = sum(self.recent_episode_rewards) / 3
            if avg_reward >= self.stage_threshold_rewards[self.curriculum_stage]:
                self.stage_success_count += 1
            else:
                self.stage_success_count = 0

        if self.stage_success_count >= 1:
            self.increase_difficulty()
            self.stage_success_count = 0
            self.recent_episode_rewards = []

        if self.curriculum_stage == len(self.stages) - 1:
            if self._is_red_inside_green(self._get_obs()):
                if self.current_step < self.best_completion_steps:
                    self.best_completion_steps = self.current_step
                    print(f"New best completion in {self.best_completion_steps} steps!")

    def increase_difficulty(self):
        if self.curriculum_stage < len(self.stages) - 1:
            self.curriculum_stage += 1
            print(f"Progressing to stage: {self.stages[self.curriculum_stage]}")
        else:
            print("Maximum stage reached")

    def get_current_stage(self):
        return self.stages[self.curriculum_stage]


def run(rob: IRobobo):
    train = True

    load_model = True  # Set to True if you want to load a pre-trained model
    steps = 512  # Reduced number of steps per episode
    episodes = 2000  # Increased number of episodes
    total_timesteps = steps * episodes

    # Common setup
    env = RoboboEnv(rob)
    env = DummyVecEnv([lambda: env])

    current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    log_dir = os.path.join(FIGRURES_DIR, "models", f"PPO_{current_datetime}")
    os.makedirs(log_dir, exist_ok=True)
    if train:
        # Callbacks setup
        checkpoint_callback = CheckpointCallback(save_freq=total_timesteps // 1000, save_path=log_dir, name_prefix="ppo_model")
        event_callback = EveryNTimesteps(n_steps=total_timesteps // 1000, callback=checkpoint_callback)
        eval_callback = EvalCallback(env, best_model_save_path=log_dir,
                                     log_path=log_dir, eval_freq=steps * 20,
                                     deterministic=True, render=False)
        callbacks = CallbackList([checkpoint_callback, event_callback, eval_callback])

        if not load_model:
            # Create new model with curriculum learning
            # (earlier hyper-parameter experiments kept for reference)
            # model = PPO("MultiInputPolicy", env, verbose=1, tensorboard_log=log_dir,
            #             learning_rate=lambda f: f * 3e-4,  # Increased initial learning rate
            #             n_steps=256,
            #             batch_size=64,
            #             n_epochs=10,
            #             gamma=0.99,
            #             ent_coef=0.01,
            #             use_sde=True,
            #             sde_sample_freq=4,
            #             target_kl=0.02,
            #             clip_range=lambda f: f * 0.2)  # Clip range annealing
            # model = PPO("MultiInputPolicy", env, verbose=1, tensorboard_log=log_dir,
            #             learning_rate=1e-3,  # Fixed, higher learning rate
            #             n_steps=512,  # Increased steps per update
            #             batch_size=64,
            #             n_epochs=10,
            #             gamma=0.99,
            #             ent_coef=0.05,
            #             use_sde=False,  # Disabled SDE
            #             target_kl=0.02,
            #             clip_range=0.3)
            # model = PPO("MultiInputPolicy", env, verbose=1, tensorboard_log=log_dir,
            #             learning_rate=5e-4,  # Reduced learning rate
            #             n_steps=100,  # Keep increased steps per update
            #             batch_size=64,
            #             n_epochs=10,
            #             gamma=0.99,
            #             ent_coef=0.02,
            #             use_sde=False,
            #             target_kl=0.015,
            #             clip_range=0.2)
            model = PPO("MultiInputPolicy", env, verbose=1, tensorboard_log=log_dir,
                        learning_rate=1e-3,
                        batch_size=32,
                        n_epochs=5,
                        gamma=0.99,
                        ent_coef=0.01,
                        use_sde=False,
                        target_kl=0.02,
                        clip_range=0.2)
        else:
            # Load pre-trained model
            # path = os.path.join(FIGRURES_DIR, "models", "PPO_2024-06-28_11-53-39", "ppo_model_2047_steps")  # 30718 20479 || ppo_model_10240_stepsGOOD+8185+ppo_model_6139_steps(c3) || back to PPO_2024-06-27_16-47-34 ppo_model_5116_steps(c3) +PPO_2024-06-27_19-43-11ppo_model_4093_steps+PPO_2024-06-27_20-59-07ppo_model_3570_steps+ppo_model_5116_stepsGOODPPO_2024-06-27_21-22-07+ppo_model_71611_steps
            # model = PPO.load(path, env=env)
            # model.learning_rate = 2e-2
            # model.learn(total_timesteps=5000)  # Train for a short while with higher rate
            # model.learning_rate = 1e-3  # Then reduce it
            # model.learn(total_timesteps=20000)  # Continue training
            # # Final lower learning rate phase
            # model.learning_rate = 5e-4
            # model.learn(total_timesteps=5000)
            # model.n_steps = 256
            # model.batch_size = 64
            # model.n_epochs = 10
            # model.ent_coef = 0.05

            # model.gamma = 0.99

            # model.rollout_buffer = model.rollout_buffer.__class__(
            #     model.n_steps,
            #     model.observation_space,
            #     model.action_space,
            #     device=model.device,
            #     gamma=model.gamma,
            #     gae_lambda=model.gae_lambda,
            #     n_envs=model.n_envs,
            # )
            # model.rollout_buffer.reset()
            # path = os.path.join(FIGRURES_DIR, "models", "PPO_2024-06-27_23-43-49", "ppo_model_5116_steps")
            path = os.path.join(FIGRURES_DIR, "models", "PPO_2024-06-28_13-57-22", "ppo_model_10231_stepsWORKWORKWORK.zip")

            model = PPO.load(path, env=env)

            # Define a function for a very high clip range (effectively no clipping)
            def high_clip(progress_remaining):
                return 5.0

            # Initial phase with high learning rate and effectively no clipping
            model.learning_rate = 1e-3
            model.clip_range = high_clip
            model.learn(total_timesteps=5000)

            # Second phase with slightly lower learning rate but still high
            model.learning_rate = 1e-4
            model.learn(total_timesteps=20000)

            # Define a function for standard clipping
            def standard_clip(progress_remaining):
                return 0.2

            # Final phase with lower learning rate and standard clipping
            model.learning_rate = 5e-4
            model.clip_range = standard_clip
            model.learn(total_timesteps=5000)

            # Set other parameters
            model.n_steps = 512
            model.batch_size = 64
            model.n_epochs = 10
            model.ent_coef = 0.05
            model.gamma = 0.99

            # Reset the rollout buffer with the final parameters
            model.rollout_buffer = model.rollout_buffer.__class__(
                model.n_steps,
                model.observation_space,
                model.action_space,
                device=model.device,
                gamma=model.gamma,
                gae_lambda=model.gae_lambda,
                n_envs=model.n_envs,
            )
            model.rollout_buffer.reset()

        # Curriculum learning
        for i in range(5):  # 5 stages of curriculum
            current_stage = env.env_method("get_current_stage")[0]
            print(f"Curriculum stage {i+1}/5: {current_stage}")
            print(f"Threshold reward for this stage: {env.env_method('get_stage_threshold')[0]}")
            model.learn(total_timesteps=total_timesteps // 5, callback=callbacks)
            # Progress to the next stage
            env.env_method("increase_difficulty")

        # Save the final model
        model.save(os.path.join(log_dir, "final_model"))
    else:
        # Evaluation code
        # Load the specific model you want to evaluate
        path = os.path.join(FIGRURES_DIR, "models", "PPO_2024-06-28_13-57-22", "ppo_model_10231_stepsWORKWORKWORK.zip")
        model = PPO.load(path, env=env)

        # Evaluate the model
        eval_episodes = 10
        for episode in range(eval_episodes):
            obs = env.reset()
            done = False
            total_reward = 0
            step = 0

            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, info = env.step(action)
                reward = reward[0]  # Extract scalar value from numpy array
                total_reward += reward
                step += 1

                # Print step information
                print(f"Episode {episode+1}, Step {step}: Reward = {reward:.2f}, Total Reward = {total_reward:.2f}")

                # Check if the episode is done
                if done[0]:
                    break

                # Check if the episode is done due to reaching the maximum steps
                if step >= env.get_attr('ep_steps')[0]:
                    print(f"Episode {episode+1} reached maximum steps.")
                    break

            print(f"Episode {episode+1} finished: Total Reward = {total_reward:.2f}, Steps = {step}")

        # Print the final stage reached (env_method calls the getter;
        # get_attr would return the bound method instead of its result)
        final_stage = env.env_method('get_current_stage')[0]
        print(f"Final stage reached: {final_stage}")

        print("Evaluation complete.")

# Position: -2.4000000000000026 0.07699995934963236 0.03970504179596901
# Orientation: -1.57191039448275 -1.5144899542893442 -1.5719101704888712


# PPO_2024-06-27_23-43-49 ppo_model_71611_steps longest training + PPO_2024-06-28_09-09-00 ppo_model_10231_steps

"""
+
PPO_2024-06-28_10-47-03
ppo_model_8185_steps
+
PPO_2024-06-28_11-53-39
ppo_model_2047_steps
------------------------------------------------------------
PPO_2024-06-27_23-43-49
ppo_model_38875_steps
ppo_model_10231_steps
ppo_model_5116_steps

"""
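Since the three-segment red/green classification in `_get_obs` drives most of the reward shaping above, an offline check of its thresholds is useful. The sketch below (illustrative, not part of this commit) reimplements that classification on a synthetic 64x64 frame so the HSV ranges and the >100-pixel area cutoff can be sanity-checked without a robot or simulator.

# Offline sanity check for the segment classification used in _get_obs.
import cv2
import numpy as np

def classify_segments(frame_bgr):
    # Same thresholds and segment layout as _get_obs above.
    hsv = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2HSV)
    red_mask = cv2.inRange(hsv, np.array([0, 50, 50]), np.array([10, 255, 255]))
    green_mask = cv2.inRange(hsv, np.array([40, 50, 50]), np.array([80, 255, 255]))
    segments = np.zeros(3, dtype=np.uint8)
    segment_width = 21  # 64 // 3
    for i in range(3):
        start, end = i * segment_width, (i + 1) * segment_width
        red_area = np.sum(red_mask[:, start:end] > 0)
        green_area = np.sum(green_mask[:, start:end] > 0)
        if red_area > 100 and green_area > 100:
            segments[i] = 3
        elif red_area > 100:
            segments[i] = 1
        elif green_area > 100:
            segments[i] = 2
    return segments

# Synthetic frame: red block on the left third, green block on the right third.
frame = np.zeros((64, 64, 3), dtype=np.uint8)
frame[:, :21] = (0, 0, 255)   # pure red in BGR
frame[:, 43:] = (0, 255, 0)   # pure green in BGR
print(classify_segments(frame))  # expected: [1 0 2]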