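"""Value-guided planning with the experimental ValueGuidedRLPipeline from diffusers.

Rolls out a pretrained value-function diffusion planner on the D4RL
hopper-medium-v2 locomotion task and reports the accumulated reward.
"""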
import d4rl  # noqa
import gym
import tqdm

from diffusers.experimental import ValueGuidedRLPipeline
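
# Planner / sampling hyperparameters, listed here for reference; note that the
# pipeline call below only passes `planning_horizon` explicitly.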
config = {
    "n_samples": 64,
    "horizon": 32,
    "num_inference_steps": 20,
    "n_guide_steps": 2,  # can set to 0 for faster sampling, does not use value network
    "scale_grad_by_std": True,
    "scale": 0.1,
    "eta": 0.0,
    "t_grad_cutoff": 2,
    "device": "cpu",
}

if __name__ == "__main__":
    env_name = "hopper-medium-v2"
    env = gym.make(env_name)
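
    # Load the pretrained value-guided diffusion planner for hopper-medium-v2
    # (trained with a planning horizon of 32).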
    pipeline = ValueGuidedRLPipeline.from_pretrained(
        "bglick13/hopper-medium-v2-value-function-hor32",
        env=env,
    )
    env.seed(0)
    obs = env.reset()
    total_reward = 0
    total_score = 0
    T = 1000
    rollout = [obs.copy()]

    try:
        for t in tqdm.tqdm(range(T)):
            # call the policy
            denorm_actions = pipeline(obs, planning_horizon=32)

            # execute action in environment
            next_observation, reward, terminal, _ = env.step(denorm_actions)
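            # D4RL-normalized score of the return accumulated so far
            # (0 corresponds to a random policy, 1 to an expert policy)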
            score = env.get_normalized_score(total_reward)

            # update return
            total_reward += reward
            total_score += score
            print(
                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
                f" {total_score}"
            )

            # save observations for rendering
            rollout.append(next_observation.copy())

            obs = next_observation
    except KeyboardInterrupt:
        pass

    print(f"Total reward: {total_reward}")