import d4rl  # noqa: F401 -- importing d4rl registers its offline-RL environments with gym
import gym
import tqdm

from diffusers.experimental import ValueGuidedRLPipeline


# Planner and guidance hyperparameters. Note that the rollout below passes only
# planning_horizon to the pipeline and keeps the defaults for everything else.
config = {
    "n_samples": 64,
    "horizon": 32,
    "num_inference_steps": 20,
    "n_guide_steps": 2,  # set to 0 for faster sampling; the value network is then unused
    "scale_grad_by_std": True,
    "scale": 0.1,
    "eta": 0.0,
    "t_grad_cutoff": 2,
    "device": "cpu",
}


if __name__ == "__main__":
    env_name = "hopper-medium-v2"
    env = gym.make(env_name)

    # load the value-guided diffusion planner trained on hopper-medium-v2
    pipeline = ValueGuidedRLPipeline.from_pretrained(
        "bglick13/hopper-medium-v2-value-function-hor32",
        env=env,
    )
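    # The checkpoint bundles the diffusion model, the value function used for
    # guidance, and the noise scheduler; `env` is passed so the pipeline can read
    # normalization statistics from the D4RL dataset (a summary of the pipeline's
    # behavior, not part of the original script).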

    env.seed(0)
    obs = env.reset()
    total_reward = 0
    total_score = 0
    T = 1000
    rollout = [obs.copy()]
    try:
        for t in tqdm.tqdm(range(T)):
            # call the policy: plan from the current observation and return the
            # first denormalized action of the best sampled trajectory
            denorm_actions = pipeline(obs, planning_horizon=32)
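            # A sketch of the same call with the config dict applied explicitly,
            # assuming the pipeline's __call__ accepts these keyword arguments
            # (kept commented out -- the call above uses the defaults):
            #     denorm_actions = pipeline(
            #         obs,
            #         planning_horizon=config["horizon"],
            #         n_guide_steps=config["n_guide_steps"],
            #         scale=config["scale"],
            #     )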

            # execute the action in the environment
            next_observation, reward, terminal, _ = env.step(denorm_actions)
            # D4RL-normalized score of the return accumulated so far
            score = env.get_normalized_score(total_reward)
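            # `terminal` is captured but never used here; a common variant (an
            # assumption, not in the original script) would stop the episode early:
            #     if terminal:
            #         break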

            # update the running return and score
            total_reward += reward
            total_score += score
            print(
                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
                f" {total_score}"
            )

            # save observations for rendering
            rollout.append(next_observation.copy())

            obs = next_observation
    except KeyboardInterrupt:
        pass

    print(f"Total reward: {total_reward}")
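    # The observations collected in `rollout` are never consumed above. A minimal
    # sketch of one way to persist them for later rendering or analysis (the
    # filename is an assumption, not part of the original script):
    import numpy as np

    np.save("hopper-medium-v2-rollout.npy", np.asarray(rollout))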