from gym_minigrid.minigrid import COLOR_TO_IDX, OBJECT_TO_IDX
def generate_text_obs(obs, info):
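    """
    Build an LLM prompt string from an environment observation: the textual
    scene description produced by obs_to_text(), followed by the utterance
    history (if any conversation has happened yet).
    """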
text_observation = obs_to_text(info)
llm_prompt = "Obs : "
llm_prompt += "".join(text_observation)
# add utterances
if obs["utterance_history"] != "Conversation: \n":
        utt_hist = obs["utterance_history"]
        utt_hist = utt_hist.replace("Conversation: \n", "")
llm_prompt += utt_hist
return llm_prompt
def obs_to_text(info):
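    """
    Convert the symbolic grid observation carried in `info` into a list of
    natural-language sentences describing what the agent carries, nearby
    walls, and every visible object or NPC relative to the agent.
    """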
image, vis_mask = info["image"], info["vis_mask"]
carrying = info["carrying"]
agent_pos_vx, agent_pos_vy = info["agent_pos_vx"], info["agent_pos_vy"]
npc_actions_dict = info["npc_actions_dict"]
# (OBJECT_TO_IDX[self.type], COLOR_TO_IDX[self.color], state)
# State, 0: open, 1: closed, 2: locked
IDX_TO_COLOR = dict(zip(COLOR_TO_IDX.values(), COLOR_TO_IDX.keys()))
IDX_TO_OBJECT = dict(zip(OBJECT_TO_IDX.values(), OBJECT_TO_IDX.keys()))
list_textual_descriptions = []
if carrying is not None:
list_textual_descriptions.append("You carry a {} {}".format(carrying.color, carrying.type))
# agent_pos_vx, agent_pos_vy = self.get_view_coords(self.agent_pos[0], self.agent_pos[1])
view_field_dictionary = dict()
for i in range(image.shape[0]):
for j in range(image.shape[1]):
            # skip "unseen" (0), "empty" (1), and "wall" (2) cells
            if image[i][j][0] not in (0, 1, 2):
                if i not in view_field_dictionary:
                    view_field_dictionary[i] = dict()
                view_field_dictionary[i][j] = image[i][j]
    # Find the walls, if any.
    # A wall is described only if there are no objects between the agent
    # and the wall in a straight line.
    add_wall_descr = False  # wall descriptions are currently disabled
    if add_wall_descr:
        # Find wall in front
        j = agent_pos_vy - 1
        object_seen = False
        while j >= 0 and not object_seen:
            cell = image[agent_pos_vx][j][0]
            if cell != 0 and cell != 1:  # stop at the first non-empty cell
                if cell == 2:  # it is a wall
                    list_textual_descriptions.append(
                        f"A wall is {agent_pos_vy - j} steps in front of you. \n")
                object_seen = True
            j -= 1

        # Find wall on the left
        i = agent_pos_vx - 1
        object_seen = False
        while i >= 0 and not object_seen:
            cell = image[i][agent_pos_vy][0]
            if cell != 0 and cell != 1:
                if cell == 2:
                    list_textual_descriptions.append(
                        f"A wall is {agent_pos_vx - i} steps to the left. \n")
                object_seen = True
            i -= 1

        # Find wall on the right
        i = agent_pos_vx + 1
        object_seen = False
        while i < image.shape[0] and not object_seen:
            cell = image[i][agent_pos_vy][0]
            if cell != 0 and cell != 1:
                if cell == 2:
                    list_textual_descriptions.append(
                        f"A wall is {i - agent_pos_vx} steps to the right. \n")
                object_seen = True
            i += 1
# list_textual_descriptions.append("You see the following objects: ")
    # describe the position of each seen object relative to the agent
for i in view_field_dictionary.keys():
for j in view_field_dictionary[i].keys():
if i != agent_pos_vx or j != agent_pos_vy:
object = view_field_dictionary[i][j]
# # don't show npc
# if IDX_TO_OBJECT[object[0]] == "npc":
# continue
front_dist = agent_pos_vy - j
left_right_dist = i - agent_pos_vx
loc_descr = ""
if front_dist == 1 and left_right_dist == 0:
loc_descr += "Right in front of you "
elif left_right_dist == 1 and front_dist == 0:
loc_descr += "Just to the right of you"
elif left_right_dist == -1 and front_dist == 0:
loc_descr += "Just to the left of you"
else:
                    front_suff = "s" if front_dist > 1 else ""
                    front_str = f"{front_dist} step{front_suff} in front of you " if front_dist > 0 else ""
                    loc_descr += front_str

                    suff = "s" if abs(left_right_dist) > 1 else ""
                    and_ = "and" if loc_descr != "" else ""

                    if left_right_dist < 0:
                        loc_descr += f"{and_} {-left_right_dist} step{suff} to the left"
                    elif left_right_dist > 0:
                        loc_descr += f"{and_} {left_right_dist} step{suff} to the right"
loc_descr += f" there is a "
obj_type = IDX_TO_OBJECT[object[0]]
if obj_type == "npc":
IDX_TO_STATE = {0: 'friendly', 1: 'antagonistic'}
description = f"{IDX_TO_STATE[object[2]]} {IDX_TO_COLOR[object[1]]} peer. "
                    # gaze and pointing directions share the same encoding
                    rel_dir = {
                        0: "towards you",
                        1: "to the left of you",
                        2: "in the same direction as you",
                        3: "to the right of you",
                    }

                    # gaze
                    description += f"It is looking {rel_dir[object[3]]}. "

                    # point (255 encodes "not pointing")
                    if object[4] != 255:
                        description += f"It is pointing {rel_dir[object[4]]}. "
# last action
last_action = {v: k for k, v in npc_actions_dict.items()}[object[5]]
last_action = {
"go_forward": "foward",
"rotate_left": "turn left",
"rotate_right": "turn right",
"toggle_action": "toggle",
"point_stop_point": "stop pointing",
"point_E": "",
"point_S": "",
"point_W": "",
"point_N": "",
"stop_point": "stop pointing",
"no_op": ""
}[last_action]
                    if last_action not in ["no_op", ""]:
                        description += f"Its last action is {last_action}. "
elif obj_type in ["switch", "apple", "generatorplatform", "marble", "marbletee", "fence"]:
# todo: this assumes that Switch.no_light == True
description = f"{IDX_TO_COLOR[object[1]]} {IDX_TO_OBJECT[object[0]]} "
assert object[2:].mean() == 0
elif obj_type == "lockablebox":
IDX_TO_STATE = {0: 'open', 1: 'closed', 2: 'locked'}
description = f"{IDX_TO_STATE[object[2]]} {IDX_TO_COLOR[object[1]]} {IDX_TO_OBJECT[object[0]]} "
assert object[3:].mean() == 0
elif obj_type == "applegenerator":
IDX_TO_STATE = {1: 'square', 2: 'round'}
description = f"{IDX_TO_STATE[object[2]]} {IDX_TO_COLOR[object[1]]} {IDX_TO_OBJECT[object[0]]} "
assert object[3:].mean() == 0
elif obj_type == "remotedoor":
IDX_TO_STATE = {0: 'open', 1: 'closed'}
description = f"{IDX_TO_STATE[object[2]]} {IDX_TO_COLOR[object[1]]} {IDX_TO_OBJECT[object[0]]} "
assert object[3:].mean() == 0
elif obj_type == "door":
IDX_TO_STATE = {0: 'open', 1: 'closed', 2: 'locked'}
description = f"{IDX_TO_STATE[object[2]]} {IDX_TO_COLOR[object[1]]} {IDX_TO_OBJECT[object[0]]} "
assert object[3:].mean() == 0
elif obj_type == "lever":
IDX_TO_STATE = {1: 'activated', 0: 'unactivated'}
if object[3] == 255:
countdown_txt = ""
else:
countdown_txt = f"with {object[3]} timesteps left. "
description = f"{IDX_TO_STATE[object[2]]} {IDX_TO_COLOR[object[1]]} {IDX_TO_OBJECT[object[0]]} {countdown_txt}"
assert object[4:].mean() == 0
else:
raise ValueError(f"Undefined object type {obj_type}")
                full_descr = loc_descr + description + "\n"
                list_textual_descriptions.append(full_descr)
    if not list_textual_descriptions:
list_textual_descriptions.append("\n")
return list_textual_descriptions
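

# Minimal usage sketch. The `obs`/`info` dicts below are hypothetical
# stand-ins for what a gym-minigrid-based social environment would return
# from env.step(); the field names simply mirror the accesses made above.
if __name__ == "__main__":
    import numpy as np

    # 7x7 egocentric view, 6 encoding channels per cell; start with all cells "empty"
    image = np.zeros((7, 7, 6), dtype=np.uint8)
    image[:, :, 0] = OBJECT_TO_IDX["empty"]
    # place a closed red door three cells ahead and one to the left of the agent
    image[2, 3] = [OBJECT_TO_IDX["door"], COLOR_TO_IDX["red"], 1, 0, 0, 0]

    info = {
        "image": image,
        "vis_mask": np.ones((7, 7), dtype=bool),
        "carrying": None,
        "agent_pos_vx": 3,  # agent column in the view
        "agent_pos_vy": 6,  # agent row in the view (bottom row)
        "npc_actions_dict": {},  # only consulted for NPC cells; none here
    }
    obs = {"utterance_history": "Conversation: \n"}  # no conversation yet

    print(generate_text_obs(obs, info))
    # -> Obs : 3 steps in front of you and 1 step to the left there is a closed red door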