from collections import namedtuple
from easydict import EasyDict
import gym
import numpy as np
from overcooked_ai_py.mdp.actions import Action
from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv
from ding.envs import BaseEnv
from ding.utils import ENV_REGISTRY, deep_merge_dicts
from ding.envs import BaseEnv
from ding.utils import ENV_REGISTRY, deep_merge_dicts
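# Timestep container returned by `step`, mirroring DI-engine's BaseEnvTimestep.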
OvercookEnvTimestep = namedtuple('OvercookEnvTimestep', ['obs', 'reward', 'done', 'info'])
# Action.ALL_ACTIONS is ordered as:
# [Direction.NORTH, Direction.SOUTH, Direction.EAST, Direction.WEST, Action.STAY, Action.INTERACT]
@ENV_REGISTRY.register('overcooked')
class OvercookEnv(BaseEnv):
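    """
    DI-engine wrapper of the two-agent Overcooked environment.

    Observations are the channel-first lossless state encodings of both agents,
    stacked along a new leading axis, or concatenated along the channel axis
    when ``concat_obs`` is set. When ``action_mask`` is enabled, observations
    are dicts with ``agent_state`` and ``action_mask`` fields. The controlled
    agent's index is re-drawn uniformly from {0, 1} on every reset.
    """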
config = EasyDict(
dict(
env_name="cramped_room",
horizon=400,
concat_obs=False,
action_mask=True,
shape_reward=True,
)
)
def __init__(self, cfg) -> None:
self._cfg = deep_merge_dicts(self.config, cfg)
self._env_name = self._cfg.env_name
self._horizon = self._cfg.horizon
self._concat_obs = self._cfg.concat_obs
self._action_mask = self._cfg.action_mask
self._shape_reward = self._cfg.shape_reward
self.mdp = OvercookedGridworld.from_layout_name(self._env_name)
self.base_env = OvercookedEnv.from_mdp(self.mdp, horizon=self._horizon, info_level=0)
        # The Overcooked observation encoding currently only supports the 2-agent game.
self.agent_num = 2
self.action_dim = len(Action.ALL_ACTIONS)
self.action_space = gym.spaces.Discrete(len(Action.ALL_ACTIONS))
# set up obs shape
        self.featurize_fn = lambda mdp, state: mdp.lossless_state_encoding(state)
dummy_mdp = self.base_env.mdp
dummy_state = dummy_mdp.get_standard_start_state()
obs_shape = self.featurize_fn(dummy_mdp, dummy_state)[0].shape # (5, 4, 26)
        obs_shape = (obs_shape[-1], *obs_shape[:-1])  # permute to channel-first: (C, H, W)
if self._concat_obs:
obs_shape = (obs_shape[0] * 2, *obs_shape[1:])
else:
obs_shape = (2, ) + obs_shape
self.observation_space = gym.spaces.Box(low=0, high=1, shape=obs_shape, dtype=np.int64)
if self._action_mask:
self.observation_space = gym.spaces.Dict(
{
'agent_state': self.observation_space,
'action_mask': gym.spaces.Box(
low=0, high=1, shape=(self.agent_num, self.action_dim), dtype=np.int64
)
}
)
self.reward_space = gym.spaces.Box(low=0, high=100, shape=(1, ), dtype=np.float32)
def seed(self, seed: int, dynamic_seed: bool = True) -> None:
self._seed = seed
self._dynamic_seed = dynamic_seed
np.random.seed(self._seed)
def close(self) -> None:
        # Note: the underlying env instance only has an empty close method, so we just pass.
        pass
def random_action(self):
return [self.action_space.sample() for _ in range(self.agent_num)]
def step(self, action):
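        # `action` holds one action index per agent; the pair is reordered into
        # the underlying env's joint action according to the side the controlled
        # agent was assigned on reset.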
assert all(self.action_space.contains(a) for a in action), "%r (%s) invalid" % (action, type(action))
agent_action, other_agent_action = [Action.INDEX_TO_ACTION[a] for a in action]
if self.agent_idx == 0:
joint_action = (agent_action, other_agent_action)
else:
joint_action = (other_agent_action, agent_action)
next_state, reward, done, env_info = self.base_env.step(joint_action)
reward = np.array([float(reward)])
self._eval_episode_return += reward
if self._shape_reward:
self._eval_episode_return += sum(env_info['shaped_r_by_agent'])
reward += sum(env_info['shaped_r_by_agent'])
ob_p0, ob_p1 = self.featurize_fn(self.mdp, next_state)
ob_p0, ob_p1 = self.obs_preprocess(ob_p0), self.obs_preprocess(ob_p1)
if self.agent_idx == 0:
both_agents_ob = [ob_p0, ob_p1]
else:
both_agents_ob = [ob_p1, ob_p0]
if self._concat_obs:
both_agents_ob = np.concatenate(both_agents_ob)
else:
both_agents_ob = np.stack(both_agents_ob)
env_info["policy_agent_idx"] = self.agent_idx
env_info["eval_episode_return"] = self._eval_episode_return
env_info["other_agent_env_idx"] = 1 - self.agent_idx
action_mask = self.get_action_mask()
if self._action_mask:
obs = {
"agent_state": both_agents_ob,
# "overcooked_state": self.base_env.state,
"action_mask": action_mask
}
else:
obs = both_agents_ob
return OvercookEnvTimestep(obs, reward, done, env_info)
def obs_preprocess(self, obs):
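        # Permute the encoding from (H, W, C) to channel-first (C, H, W).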
obs = obs.transpose(2, 0, 1)
return obs
def reset(self):
self.base_env.reset()
self._eval_episode_return = 0
self.mdp = self.base_env.mdp
        # Randomly choose which side the controlled agent plays on this episode.
self.agent_idx = np.random.choice([0, 1])
ob_p0, ob_p1 = self.featurize_fn(self.mdp, self.base_env.state)
ob_p0, ob_p1 = self.obs_preprocess(ob_p0), self.obs_preprocess(ob_p1)
if self.agent_idx == 0:
both_agents_ob = [ob_p0, ob_p1]
else:
both_agents_ob = [ob_p1, ob_p0]
if self._concat_obs:
both_agents_ob = np.concatenate(both_agents_ob)
else:
both_agents_ob = np.stack(both_agents_ob)
action_mask = self.get_action_mask()
if self._action_mask:
obs = {"agent_state": both_agents_ob, "action_mask": action_mask}
else:
obs = both_agents_ob
return obs
def get_available_actions(self):
return self.mdp.get_actions(self.base_env.state)
def get_action_mask(self):
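        # Build a binary (agent_num, action_dim) mask: entry (i, j) is 1 iff
        # action j is currently available to agent i.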
available_actions = self.get_available_actions()
        action_masks = np.zeros((self.agent_num, self.action_dim), dtype=np.int64)
for i in range(self.action_dim):
if Action.INDEX_TO_ACTION[i] in available_actions[0]:
action_masks[0][i] = 1
if Action.INDEX_TO_ACTION[i] in available_actions[1]:
action_masks[1][i] = 1
return action_masks
def __repr__(self):
return "DI-engine Overcooked Env"
@ENV_REGISTRY.register('overcooked_game')
class OvercookGameEnv(BaseEnv):
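    """
    Variant of ``OvercookEnv`` registered as ``overcooked_game``: ``action_mask``
    defaults to False and the controlled agent index is fixed to 0 on reset
    instead of being drawn at random.
    """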
config = EasyDict(
dict(
env_name="cramped_room",
horizon=400,
concat_obs=False,
action_mask=False,
shape_reward=True,
)
)
def __init__(self, cfg) -> None:
self._cfg = deep_merge_dicts(self.config, cfg)
self._env_name = self._cfg.env_name
self._horizon = self._cfg.horizon
self._concat_obs = self._cfg.concat_obs
self._action_mask = self._cfg.action_mask
self._shape_reward = self._cfg.shape_reward
self.mdp = OvercookedGridworld.from_layout_name(self._env_name)
self.base_env = OvercookedEnv.from_mdp(self.mdp, horizon=self._horizon, info_level=0)
        # The Overcooked observation encoding currently only supports the 2-agent game.
self.agent_num = 2
self.action_dim = len(Action.ALL_ACTIONS)
self.action_space = gym.spaces.Discrete(len(Action.ALL_ACTIONS))
# set up obs shape
        self.featurize_fn = lambda mdp, state: mdp.lossless_state_encoding(state)
dummy_mdp = self.base_env.mdp
dummy_state = dummy_mdp.get_standard_start_state()
obs_shape = self.featurize_fn(dummy_mdp, dummy_state)[0].shape # (5, 4, 26)
        obs_shape = (obs_shape[-1], *obs_shape[:-1])  # permute to channel-first: (C, H, W)
if self._concat_obs:
obs_shape = (obs_shape[0] * 2, *obs_shape[1:])
else:
obs_shape = (2, ) + obs_shape
self.observation_space = gym.spaces.Box(low=0, high=1, shape=obs_shape, dtype=np.int64)
if self._action_mask:
self.observation_space = gym.spaces.Dict(
{
'agent_state': self.observation_space,
'action_mask': gym.spaces.Box(
low=0, high=1, shape=(self.agent_num, self.action_dim), dtype=np.int64
)
}
)
self.reward_space = gym.spaces.Box(low=0, high=100, shape=(1, ), dtype=np.float32)
def seed(self, seed: int, dynamic_seed: bool = True) -> None:
self._seed = seed
self._dynamic_seed = dynamic_seed
np.random.seed(self._seed)
def close(self) -> None:
        # Note: the underlying env instance only has an empty close method, so we just pass.
        pass
def random_action(self):
return [self.action_space.sample() for _ in range(self.agent_num)]
def step(self, action):
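        # Same stepping logic as OvercookEnv.step: reorder the per-agent action
        # indices into a joint action according to self.agent_idx.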
assert all(self.action_space.contains(a) for a in action), "%r (%s) invalid" % (action, type(action))
agent_action, other_agent_action = [Action.INDEX_TO_ACTION[a] for a in action]
if self.agent_idx == 0:
joint_action = (agent_action, other_agent_action)
else:
joint_action = (other_agent_action, agent_action)
next_state, reward, done, env_info = self.base_env.step(joint_action)
reward = np.array([float(reward)])
self._eval_episode_return += reward
if self._shape_reward:
self._eval_episode_return += sum(env_info['shaped_r_by_agent'])
reward += sum(env_info['shaped_r_by_agent'])
ob_p0, ob_p1 = self.featurize_fn(self.mdp, next_state)
ob_p0, ob_p1 = self.obs_preprocess(ob_p0), self.obs_preprocess(ob_p1)
if self.agent_idx == 0:
both_agents_ob = [ob_p0, ob_p1]
else:
both_agents_ob = [ob_p1, ob_p0]
if self._concat_obs:
both_agents_ob = np.concatenate(both_agents_ob)
else:
both_agents_ob = np.stack(both_agents_ob)
env_info["policy_agent_idx"] = self.agent_idx
env_info["eval_episode_return"] = self._eval_episode_return
env_info["other_agent_env_idx"] = 1 - self.agent_idx
action_mask = self.get_action_mask()
if self._action_mask:
obs = {"agent_state": both_agents_ob, "action_mask": action_mask}
else:
obs = both_agents_ob
return OvercookEnvTimestep(obs, reward, done, env_info)
def obs_preprocess(self, obs):
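        # Permute the observation from (H, W, C) to channel-first (C, H, W).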
obs = obs.transpose(2, 0, 1)
return obs
def reset(self):
self.base_env.reset()
self._eval_episode_return = 0
self.mdp = self.base_env.mdp
        # Fix the controlled agent index to 0 (no random side swap in this env).
        self.agent_idx = 0
ob_p0, ob_p1 = self.featurize_fn(self.mdp, self.base_env.state)
ob_p0, ob_p1 = self.obs_preprocess(ob_p0), self.obs_preprocess(ob_p1)
if self.agent_idx == 0:
both_agents_ob = [ob_p0, ob_p1]
else:
both_agents_ob = [ob_p1, ob_p0]
if self._concat_obs:
both_agents_ob = np.concatenate(both_agents_ob)
else:
both_agents_ob = np.stack(both_agents_ob)
action_mask = self.get_action_mask()
if self._action_mask:
obs = {"agent_state": both_agents_ob, "action_mask": action_mask}
else:
obs = both_agents_ob
return obs
def get_available_actions(self):
return self.mdp.get_actions(self.base_env.state)
def get_action_mask(self):
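        # Binary (agent_num, action_dim) mask of the currently available actions.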
available_actions = self.get_available_actions()
        action_masks = np.zeros((self.agent_num, self.action_dim), dtype=np.int64)
for i in range(self.action_dim):
if Action.INDEX_TO_ACTION[i] in available_actions[0]:
action_masks[0][i] = 1
if Action.INDEX_TO_ACTION[i] in available_actions[1]:
action_masks[1][i] = 1
return action_masks
def __repr__(self):
return "DI-engine Overcooked GameEnv"