import d4rl  # noqa: F401 -- imported for its side effect of registering the D4RL environments with gym
import gym
import tqdm

from diffusers.experimental import ValueGuidedRLPipeline


# Sampling and guidance hyperparameters for the value-guided planner.
config = {
    "n_samples": 64,
    "horizon": 32,
    "num_inference_steps": 20,
    "n_guide_steps": 2,  # can be set to 0 for faster sampling (the value network is then not used)
    "scale_grad_by_std": True,
    "scale": 0.1,
    "eta": 0.0,
    "t_grad_cutoff": 2,
    "device": "cpu",
}
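# NOTE: `config` is kept here for reference; the pipeline call in the loop below only
# forwards `planning_horizon`. If the installed diffusers version accepts the guidance
# arguments as keywords, the call could look roughly like this hypothetical sketch
# (check the ValueGuidedRLPipeline.__call__ signature before relying on it):
#
#     denorm_actions = pipeline(
#         obs,
#         planning_horizon=config["horizon"],
#         n_guide_steps=config["n_guide_steps"],
#         scale=config["scale"],
#     )

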
if __name__ == "__main__":
    env_name = "hopper-medium-v2"
    env = gym.make(env_name)

    # Load the pretrained value-guided planning pipeline for this environment.
    pipeline = ValueGuidedRLPipeline.from_pretrained(
        "bglick13/hopper-medium-v2-value-function-hor32",
        env=env,
    )

    env.seed(0)
    obs = env.reset()
    total_reward = 0
    total_score = 0
    T = 1000
    rollout = [obs.copy()]
    try:
        for t in tqdm.tqdm(range(T)):
            # plan with the pipeline and get a denormalized action to execute
            denorm_actions = pipeline(obs, planning_horizon=32)

            # execute the action in the environment
            next_observation, reward, terminal, _ = env.step(denorm_actions)
            # d4rl normalized score of the return accumulated so far
            score = env.get_normalized_score(total_reward)

            # update the return and the running score
            total_reward += reward
            total_score += score
            print(
                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
                f" {total_score}"
            )

            # save observations for rendering
            rollout.append(next_observation.copy())

            obs = next_observation
    except KeyboardInterrupt:
        pass

    print(f"Total reward: {total_reward}")