|
|
| import dill |
| import h5py |
| import numpy as np |
| import torch |
|
|
def get_text_tokens(cfg, tokenizer, text_model, goal, model=None):
    """
    Encode the language goal `goal` into tokens/embeddings.

    Parameters
    ----------
    cfg : config object (unused when `model` is provided).
    tokenizer : tokenizer forwarded to `model.encode_text_goal`.
    text_model : text encoder forwarded to `model.encode_text_goal`.
    goal : str, the language instruction to encode.
    model : policy model exposing `encode_text_goal`; when given, encoding is
        delegated to it so callers don't need a buffer.

    Returns
    -------
    Whatever `model.encode_text_goal` returns.

    Raises
    ------
    RuntimeError
        If `model` is None: standalone encoding requires a buffer, so we fail
        fast instead of computing a result that was previously discarded by
        an unconditional raise (the old standalone T5/character paths were
        dead code).
    """
    if model is not None:
        return model.encode_text_goal(goal, tokenizer=tokenizer, text_model=text_model)

    # Fail fast: there is no supported model-free encoding path.
    raise RuntimeError("Text encoding without model requires a buffer; pass model into get_text_tokens")
|
|
def get_blocked_mask(cfg, targets=None, T=0):
    """
    Build a 1-D token mask of length 1 + c*obs_stacking + T + c, where
    c = 192 is the number of tokens per encoded image.

    Layout: [1 text/goal token | c*obs_stacking observation tokens |
             T target tokens | c goal-image tokens].

    When `targets` is provided, one of the two trailing segments is randomly
    zeroed out (roughly 1/3 chance each) for masked training; with
    `targets=None` the mask is all ones.

    Returns
    -------
    torch.Tensor of shape (1 + c*obs_stacking + T + c,) on cfg.device.
    """
    c = 192  # tokens per encoded image
    obs_len = c * cfg.policy.obs_stacking
    mask = torch.ones((1 + obs_len + T + c,), device=cfg.device)
    if targets is not None:
        if torch.rand(1)[0] > 0.66:
            # Mask out the target-token segment.
            mask[1 + obs_len: 1 + obs_len + T] = 0
        elif torch.rand(1)[0] > 0.33:
            # Mask out the goal-image segment.
            mask[1 + obs_len + T: 1 + obs_len + T + c] = 0
    # Fix: the previous implementation built the mask but never returned it,
    # so callers received None.
    return mask
|
|
def eval_model_in_sim(cfg, model, device, log_dir, env, env_unwrapped,
                      wandb, iter_, tokenizer=None, text_model=None):
    """
    Roll the policy out in a SimplerEnv/ManiSkill2 environment and log results.

    For each of cfg.sim.eval_episodes episodes: resets the env, encodes the
    language instruction once, then repeatedly feeds stacked RGB frames,
    end-effector pose and the previous action to `model.forward`, decoding and
    executing cfg.policy.action_stacking low-level actions per model call.
    Saves an mp4 of the rollout and logs mean reward to wandb unless
    cfg.testing is set.

    Parameters
    ----------
    cfg : hydra config; reads cfg.sim.eval_episodes, cfg.policy.*, cfg.action_dim, cfg.testing.
    model : policy exposing preprocess_state / preprocess_goal_image /
        forward / decode_action / encode_text_goal.
    device : torch device for model inputs.
    log_dir : directory where the episode video is written.
    env, env_unwrapped : the SimplerEnv environment and its unwrapped core.
    wandb : wandb module/run (only used when cfg.testing is falsy).
    iter_ : evaluation iteration index (video filename).
    tokenizer, text_model : optional T5 pieces forwarded to get_text_tokens.

    Returns
    -------
    episode_stats dict of the last episode (rewards are averaged across all
    steps of all episodes seen so far).
    """
    from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
    print("Evaluating model in sim environment")
    from collections import deque
    from einops import rearrange

    rewards = []
    for j in range(cfg.sim.eval_episodes):
        obs, reset_info = env.reset()
        obs_ = get_image_from_maniskill2_obs_dict(env_unwrapped, obs)[:, :, :3]  # keep RGB only
        obs_hist = deque(maxlen=cfg.policy.obs_stacking)
        last_action = np.zeros(cfg.action_dim)
        # Seed the frame history with copies of the first observation.
        for _ in range(cfg.policy.obs_stacking):
            obs_hist.append(obs_)
        instruction = env_unwrapped.get_language_instruction()

        print("Instruction", instruction)
        frames = []
        done, truncated, timeLimit, t = False, False, 100, 0
        txt_goal = get_text_tokens(cfg, tokenizer, text_model, instruction, model=model)

        # NOTE(review): obs_hist is never updated inside this loop, so the
        # policy always sees the initial frame stack — confirm whether new
        # frames should be appended each step.
        while not (done or truncated or (t > timeLimit)):

            # Stack history along channels: (h, w, 3*obs_stacking).
            image = np.stack(obs_hist, axis=-1)
            image = rearrange(image, 'h w c t -> h w (c t)')

            obs_state = torch.tensor(model.preprocess_state(image), dtype=torch.float32)
            goal_state = torch.tensor(model.preprocess_goal_image(image[:, :, :3]), dtype=torch.float32)

            last_action_tensor = None
            if last_action is not None:
                last_action_tensor = torch.tensor(last_action[:cfg.action_dim], dtype=torch.float32).unsqueeze(0).to(device)

            # obs_state/goal_state are already float32 tensors; re-wrapping them
            # in torch.tensor() made an extra copy and raised a UserWarning.
            action, loss = model.forward(obs_state.unsqueeze(0).to(device),
                                         torch.as_tensor(txt_goal).to(device),
                                         goal_state.unsqueeze(0).to(device),
                                         mask_=True,
                                         pose=torch.tensor([[obs["extra"]["tcp_pose"]]], dtype=torch.float32).to(device),
                                         last_action=last_action_tensor,
                                         )

            action = model.decode_action(action[0]).cpu().detach().numpy()
            last_action = action.copy()

            # The model emits action_stacking chunks of cfg.action_dim each.
            for step_ in range(cfg.policy.action_stacking):
                act_ = action[cfg.action_dim * step_:(cfg.action_dim * (step_ + 1))]
                obs, reward, done, truncated, info = env.step(act_)
                image = get_image_from_maniskill2_obs_dict(env_unwrapped, obs)
                image = image[:, :, :3]

                frames.append(image)
                # NOTE(review): both terms use "eof_to_obj1_diff", so the
                # distance is counted twice — the second was probably meant to
                # be a different key. Kept as-is to preserve behavior; confirm
                # against the env's info dict.
                reward = -(np.linalg.norm(info["eof_to_obj1_diff"]) + np.linalg.norm(info["eof_to_obj1_diff"]))
                rewards.append(reward)
                t = t + 1
                if done or truncated:
                    break

        episode_stats = info.get('episode_stats', {})
        episode_stats['rewards'] = np.mean(rewards)

        print(f"avg reward {np.mean(episode_stats['rewards']):.8f}")
        if not cfg.testing:
            wandb.log({"avg reward": np.mean(rewards)})

        import os
        # One video per evaluation call; later episodes overwrite earlier ones.
        path_ = os.path.join(log_dir, f"simple-env-{iter_}.mp4")
        import imageio
        imageio.mimsave(path_, frames, fps=20)
        episode_stats['video_url'] = path_

        if not cfg.testing:
            try:
                wandb.log({"example": wandb.Video(path_)})
            except Exception as e:
                print(f"Warning: failed to log video to wandb: {e}")

    return episode_stats
|
|
| import gymnasium as gym |
| |
class DictWrapper(gym.ObservationWrapper):

    """
    A wrapper that grabs the observation from a specific key in the dictionary.

    Also adapts the wrapped env's old-style gym API (4-tuple ``step``, bare
    ``reset``) to the gymnasium 5-tuple / 2-tuple API, and flips the extracted
    image vertically (``[::-1]``) in ``step``/``reset``.

    NOTE(review): ``super().__init__`` is deliberately not called — presumably
    because the wrapped robosuite/LIBERO env is not a gymnasium ``Env`` and
    would fail gymnasium's wrapper type checks; confirm before "fixing".
    """
    def __init__(self, env, obs_key=""):

        # Bypasses gym.ObservationWrapper.__init__ on purpose (see class note);
        # gym.Wrapper.__getattr__ still delegates to self.env.
        self.env = env
        # Advertise the extracted image space: 256x256 RGB uint8.
        self.observation_space = gym.spaces.Box(
            low=0,
            high=255,
            shape=(256,256,3),
            dtype=np.uint8)
        self._obs_key = obs_key  # dict key to pull the image observation from


    def observation(self, observation):
        """
        This method is called by the gym.ObservationWrapper after the environment's
        step or reset methods return an observation.

        NOTE(review): unlike step()/reset() below, this path does not flip the
        image vertically; it is effectively bypassed since both are overridden.
        """

        return observation[self._obs_key]

    def step(self, action):
        """
        Step the environment and return the observation from the specified key.

        Returns a gymnasium-style 5-tuple (obs, reward, terminated, truncated,
        info); the image is flipped vertically and the full raw observation
        dict is passed back in the info slot. `truncated` is always False.
        """
        obs, reward, done, info = self.env.step(action)
        return obs[self._obs_key][::-1, :, :], reward, done, False, obs


    def reset(self, **kwargs):
        """
        Reset the environment and return the observation from the specified key.

        Returns (flipped image, raw observation dict) in gymnasium
        (obs, info) style. `kwargs` are accepted but not forwarded.
        """
        obs = self.env.reset()
        return obs[self._obs_key][::-1, :, :], obs
|
|
def eval_libero(model, device, cfg, iter_=0, log_dir="./",
                tokenizer=None, text_model=None, wandb=None):
    """
    Evaluate the policy on a LIBERO task suite.

    For each task id in cfg.sim.eval_tasks: builds a DenseRewardEnv from the
    task's BDDL file, optionally restores initial states (and goal images)
    from a Hugging Face dataset or an HDF5 file (falling back to the suite's
    defaults), rolls the policy out, saves an mp4 per episode, and logs the
    mean reward to wandb unless cfg.testing is set.

    Parameters
    ----------
    model : policy exposing preprocess_state / preprocess_goal_image /
        encode_pose / encode_action / forward / decode_action.
    device : torch device for model inputs.
    cfg : hydra config; reads cfg.sim.*, cfg.policy.*, cfg.action_dim, cfg.testing.
    iter_ : evaluation iteration index (used in video filenames).
    log_dir : directory where episode videos are written.
    tokenizer, text_model : optional T5 pieces forwarded to get_text_tokens.
    wandb : wandb module/run (only used when cfg.testing is falsy).

    Returns
    -------
    episode_stats dict of the LAST episode evaluated; earlier episodes are
    only logged, not aggregated into the return value.
    """
    from libero.libero import benchmark
    from libero.libero.envs import OffScreenRenderEnv, DenseRewardEnv
    import os
    from libero.libero.utils import get_libero_path
    from gymnasium.wrappers import FrameStackObservation
    from einops import rearrange

    benchmark_dict = benchmark.get_benchmark_dict()
    task_suite_name = cfg.sim.task_set
    task_suite = benchmark_dict[task_suite_name]()

    # Optionally load curated initial states: HF dataset takes precedence,
    # then an HDF5 file, else the suite's built-in init states are used.
    init_states_dataset = None
    if hasattr(cfg.sim, 'libero_init_state_hf_repo') and cfg.sim.libero_init_state_hf_repo:
        print(f"Loading initial states from Hugging Face: {cfg.sim.libero_init_state_hf_repo}")
        from datasets import load_dataset
        init_states_dataset = load_dataset(cfg.sim.libero_init_state_hf_repo, split='train')
        print(f"Loaded dataset with {len(init_states_dataset)} entries")
    elif hasattr(cfg.sim, 'libero_init_state_file') and cfg.sim.libero_init_state_file:
        print(f"Loading initial states from HDF5: {cfg.sim.libero_init_state_file}")
        # NOTE(review): relies on the module-level `import hydra` near the
        # bottom of this file having executed before this call (it has, at
        # import time) — confirm when refactoring.
        init_states_dataset = h5py.File(hydra.utils.get_original_cwd()+cfg.sim.libero_init_state_file, 'r')

    tasks = cfg.sim.eval_tasks
    for task_id in tasks:
        task = task_suite.get_task(task_id)
        task_name = task.name
        instruction = task.language
        task_bddl_file = os.path.join(get_libero_path("bddl_files"), task.problem_folder, task.bddl_file)
        print(f"[info] retrieving task {task_id} from suite {task_suite_name}, the " + \
              f"language instruction is {instruction}, and the bddl file is {task_bddl_file}")

        env_args = {
            "bddl_file_name": task_bddl_file,
            "camera_heights": 256,
            "camera_widths": 256
        }
        env = DenseRewardEnv(**env_args)
        env.seed(0)

        # Datasets key tasks by the instruction with underscores for spaces.
        task_description = instruction.replace(" ", "_")
        task_demos = None
        if init_states_dataset is not None:
            if isinstance(init_states_dataset, h5py.File):
                # HDF5 layout: /<task_description>/demo_<i>/{init_state, goal_img}
                if task_description in init_states_dataset:
                    task_grp = init_states_dataset[task_description]
                    num_init_states = len(task_grp.keys())
                    print(f"Loaded {num_init_states} initial states from HDF5 for task: {task_description}")
                else:
                    task_grp = None
                    init_states = task_suite.get_task_init_states(task_id)
                    num_init_states = len(init_states)
                    print(f"Using default initial states for task: {task_description}")
            else:
                # HF dataset: filter rows belonging to this task.
                task_demos = [item for item in init_states_dataset if item.get('task_description') == task_description]
                num_init_states = len(task_demos)
                if num_init_states > 0:
                    print(f"Loaded {num_init_states} initial states from HF dataset for task: {task_description}")
                else:
                    init_states = task_suite.get_task_init_states(task_id)
                    num_init_states = len(init_states)
                    print(f"Using default initial states for task: {task_description}")
        else:
            init_states = task_suite.get_task_init_states(task_id)
            num_init_states = len(init_states)
            print(f"Using default initial states for task: {task_description}")

        # Evaluate at most 2 initial states per task.
        for init_state_id in range(min(2, num_init_states)):

            # Resolve (init_state, goal_img) for this episode from whichever
            # source was selected above.
            if init_states_dataset is not None:
                if isinstance(init_states_dataset, h5py.File):
                    # NOTE(review): if task_grp exists but demo_<i> is missing,
                    # the fallback reads `init_states`, which was never loaded
                    # on this branch — potential NameError; confirm.
                    if task_grp is not None:
                        demo_key = f"demo_{init_state_id}"
                        if demo_key in task_grp:
                            init_state = task_grp[demo_key]['init_state'][()]
                            goal_img = task_grp[demo_key]['goal_img'][()] if 'goal_img' in task_grp[demo_key] else None
                            print(f"Loaded init_state and goal_img from HDF5 for {demo_key}")
                        else:
                            init_state = init_states[init_state_id]
                            goal_img = None
                    else:
                        init_state = init_states[init_state_id]
                        goal_img = None
                else:
                    # HF dataset path.
                    if task_demos and init_state_id < len(task_demos):
                        demo = task_demos[init_state_id]
                        init_state = np.array(demo['init_state'])
                        goal_img = np.array(demo['goal_img']) if 'goal_img' in demo and demo['goal_img'] is not None else None
                        print(f"Loaded init_state and goal_img from HF dataset for demo {init_state_id}")
                    else:
                        init_state = init_states[init_state_id]
                        goal_img = None
            else:
                init_state = init_states[init_state_id]
                goal_img = None

            # Reset, restore the initial state, then wrap with frame stacking
            # over the agentview camera image.
            env.reset()
            env.set_init_state(init_state)
            env_ = FrameStackObservation(DictWrapper(env, obs_key="agentview_image"), cfg.policy.obs_stacking)
            obs, info = env_.reset()

            # NOTE(review): the returned mask is unused below (and
            # get_blocked_mask may not return one) — confirm intent.
            mask = get_blocked_mask(cfg, targets=None, T=0)

            txt_goal = get_text_tokens(cfg, tokenizer, text_model, instruction, model=model)

            if goal_img is not None:
                image_goal = goal_img
                print(f"Using goal image from HDF5, shape: {image_goal.shape}")
            else:
                # Fall back to the first stacked frame as the goal image.
                image_goal = obs.reshape((256, 256, 3*cfg.policy.obs_stacking))[:,:,:3]
                print("Using first observation as goal image")
            frames = []
            rewards = []
            infos = []
            last_action = np.zeros(cfg.action_dim)
            # wait_steps=00 (i.e. 0): no-op warmup steps before the policy acts.
            done, truncated, timeLimit, t, wait_steps = False, False, 400, 0, 00

            while not (done or truncated or (t > (timeLimit + wait_steps))):

                # Warmup phase: send zero actions until wait_steps have passed.
                if t < wait_steps:
                    obs, reward, done, truncated, info = env_.step([0,0,0,0,0,0,0])
                    t += 1
                    continue

                # (t, h, w, c) frame stack -> (h, w, t*c) channel-stacked image.
                obs = rearrange(obs, 't h w c -> h w (t c)', c=3, t=cfg.policy.obs_stacking)

                obs_state = model.preprocess_state(obs)
                goal_state = model.preprocess_goal_image(image_goal)
                # Pose = eef position + first 3 quaternion components + gripper joint.
                pose_ = model.encode_pose(torch.tensor([[np.concatenate(
                    (info["robot0_eef_pos"],
                     info["robot0_eef_quat"][:3],
                     [(info["robot0_gripper_qpos"][0])]), axis=-1)]],
                    dtype=torch.float32)).to(device)

                last_action_tensor = None
                if last_action is not None:
                    last_action_tensor = model.encode_action(torch.tensor([last_action[:cfg.action_dim]], dtype=torch.float32)).to(device)

                action, loss = model.forward(torch.tensor(np.array([obs_state])).to(device)
                                             ,torch.tensor(txt_goal).to(device)
                                             ,torch.tensor(np.array([goal_state])).to(device),
                                             mask_=True,
                                             pose=pose_,
                                             last_action=last_action_tensor,
                                             )

                action = model.decode_action(action[0]).cpu().detach().numpy()
                last_action = action.copy()

                # Execute the action_stacking decoded chunks one at a time.
                for step_ in range(cfg.policy.action_stacking):
                    act_ = action[cfg.action_dim*step_:(cfg.action_dim*(step_+1))]

                    obs, reward, done, truncated, info = env_.step(act_)

                    # Most recent frame of the stack, for the video.
                    image = obs[0]
                    frames.append(image)

                    rewards.append(reward)
                    infos.append(info)
                    t=t+1

                    if done or truncated:
                        print("Episode finished with success after {} timesteps".format(step_))
                        break
                # NOTE(review): largely redundant with the while-condition;
                # exits the outer loop immediately on success.
                if done:
                    print("Episode finished with success after {} timesteps".format(step_))
                    break

            import os
            path_ = os.path.join(log_dir, f"libero-{iter_}-task-id-{task_id}-init-id-{init_state_id}.mp4")
            import imageio
            imageio.mimsave(path_, frames, fps=20)
            episode_stats = info.get('episode_stats', {})
            episode_stats['rewards'] = np.mean(rewards)
            episode_stats['video_url'] = path_
            print(f"avg reward {np.mean(rewards):.8f}")
            if not cfg.testing:
                wandb.log({"avg reward_"+str(task_id): np.mean(rewards)})
            if not cfg.testing:
                wandb.log({"example": wandb.Video(path_)})
        env.close()

    # Close the HDF5 handle if we opened one.
    if init_states_dataset is not None and isinstance(init_states_dataset, h5py.File):
        init_states_dataset.close()
        print("Closed HDF5 file")

    return episode_stats
|
|
| import hydra |
| from omegaconf import DictConfig |
|
|
@hydra.main(config_path="./conf", config_name="64pix-pose")
def my_main(cfg: DictConfig):
    """
    Hydra entry point: load a trained GRP model (or a dataset-replay mock),
    optionally build the T5 text encoder, and run the evaluations selected
    by cfg.simEval ("libero" and/or "simple_env").
    """
    import torch

    # Hydra's per-run output directory; videos and logs go here.
    log_dir = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
    cfg.dataset.load_dataset = "skip"

    model_dir = hydra.utils.get_original_cwd()+"/mini-grp/miniGRP.pth"
    print ("Loading model from:", model_dir)
    if "dataset" == cfg.model.type:

        # "dataset" mode: replay actions from a buffer instead of a policy.
        from mini_shuffel_buffer import CircularBuffer
        from mock_grp_model import ReplayModel
        cfg.dataset.load_dataset = True
        model_ = ReplayModel(cfg)
        dataset_buffer = CircularBuffer(cfg.dataset.buffer_size, cfg, model=model_)
        model_.set_dataset(dataset_buffer)
    else:
        # GRP is imported so dill/torch.load can resolve the pickled class —
        # do not remove even though the name looks unused.
        from grp_model import GRP
        model_ = torch.load(model_dir, pickle_module=dill)

    # Build the T5 tokenizer/encoder only when the dataset was encoded with T5.
    tokenizer = None
    text_model = None
    if cfg.dataset.encode_with_t5:
        from transformers import T5Tokenizer, T5ForConditionalGeneration
        tokenizer = T5Tokenizer.from_pretrained(cfg.dataset.t5_version)
        text_model = T5ForConditionalGeneration.from_pretrained(cfg.dataset.t5_version)

    if "libero" in cfg.simEval:
        results = eval_libero(model_.to(cfg.device), device=cfg.device, cfg=cfg,
                              iter_=0, tokenizer=tokenizer, text_model=text_model, wandb=None,
                              log_dir=log_dir)
    if "simple_env" in cfg.simEval:
        import simpler_env
        task_name = "widowx_carrot_on_plate"
        # NOTE(review): `env` is never defined before this point, so this
        # guard is always False here — dead defensive code; confirm.
        if 'env' in locals():
            print("Closing existing env")
            env.close()
            del env
        env = simpler_env.make(task_name)
        # Unwrap three wrapper layers to reach the core ManiSkill2 env.
        env_unwrapped = env.env.env.env
        results = eval_model_in_sim(cfg, model_.to(cfg.device), device=cfg.device, log_dir=log_dir,
                                    env=env, env_unwrapped=env_unwrapped,
                                    wandb=None, iter_=0, tokenizer=tokenizer, text_model=text_model)
    # NOTE(review): `results` is unbound if cfg.simEval selects neither
    # evaluation — this would raise NameError; confirm intended configs.
    print("results:", results)
|
|
| |
|
|
|
|
if __name__ == "__main__":
    # Script entry point: hydra parses the CLI/config and invokes my_main.
    results = my_main()
    print("results:", results)