pivot-iterative-visual-optimization committed on
Commit 5c80958 · verified · 1 Parent(s): afdb372

Upload 6 files

Files changed (6)
  1. app.py +182 -0
  2. requirements.txt +6 -0
  3. vip.py +462 -0
  4. vip_runner.py +163 -0
  5. vip_utils.py +130 -0
  6. vlms.py +33 -0
app.py ADDED
@@ -0,0 +1,182 @@
1
+ """Visual Iterative Prompting Demo."""
2
+
3
+ import gradio as gr
4
+ import numpy as np
5
+ from vip_runner import vip_runner
6
+ from vlms import GPT4V
7
+
8
+ # Adjust radius of annotations based on size of the image
9
+ radius_per_pixel = 0.05
10
+
11
+
12
+ def run_vip(
13
+ im,
14
+ query,
15
+ n_samples_init,
16
+ n_samples_opt,
17
+ n_iters,
18
+ n_recursion,
19
+ openai_api_key,
20
+ progress=gr.Progress(track_tqdm=True),
21
+ ):
22
+
23
+ if not openai_api_key:
24
+ return [], 'Must provide OpenAI API Key'
25
+ if im is None:
26
+ return [], 'Must specify image'
27
+ if not query:
28
+ return [], 'Must specify description'
29
+
30
+ img_size = np.min(im.shape[:2])
31
+ print(int(img_size * radius_per_pixel))
32
+ # add some action spec
33
+ style = {
34
+ 'num_samples': 12,
35
+ 'circle_alpha': 0.6,
36
+ 'alpha': 0.8,
37
+ 'arrow_alpha': 0.0,
38
+ 'radius': int(img_size * radius_per_pixel),
39
+ 'thickness': 2,
40
+ 'fontsize': int(img_size * radius_per_pixel),
41
+ 'rgb_scale': 255,
42
+ 'focal_offset': 1, # camera distance / std of action in z
43
+ }
44
+
45
+ action_spec = {
46
+ 'loc': [0, 0, 0],
47
+ 'scale': [0.0, 100, 100],
48
+ 'min_scale': [0.0, 30, 30],
49
+ 'min': [0, -300.0, -300],
50
+ 'max': [0, 300, 300],
51
+ 'action_to_coord': 250,
52
+ 'robot': 'meta',
53
+ }
54
+
55
+ vlm = GPT4V(openai_api_key=openai_api_key)
56
+ ims, center, _ = vip_runner(
57
+ vlm,
58
+ im,
59
+ query,
60
+ style,
61
+ action_spec,
62
+ n_samples_init=n_samples_init,
63
+ n_samples_opt=n_samples_opt,
64
+ n_iters=n_iters,
65
+ recursion_level=n_recursion,
66
+ )
67
+ return ims, f'Final selected coordinate: {np.round(center, decimals=0)}'
68
+
69
+
70
+ examples = [
71
+ {
72
+ 'im_path': 'ims/aloha.png',
73
+ 'desc': 'a point between the fork and the cup',
74
+ },
75
+ {
76
+ 'im_path': 'ims/robot.png',
77
+ 'desc': 'the toy in the middle of the table',
78
+ },
79
+ {
80
+ 'im_path': 'ims/parking.jpg',
81
+ 'desc': 'a place to park if I am handicapped',
82
+ },
83
+ {
84
+ 'im_path': 'ims/tools.png',
85
+ 'desc': 'what should I use to pull a nail',
86
+ },
87
+ ]
88
+
89
+
90
+ with gr.Blocks() as demo:
91
+ gr.Markdown("""
92
+ # Visual Iterative Prompting Demo
93
+ The demo below showcases the Visual Iterative Prompting (VIP) algorithm.
94
+ Given an image and a description of an object or region,
95
+ VIP leverages a Vision-Language Model (VLM) to iteratively search for the point in the image that best corresponds to the description.
96
+ This is done through visual prompting, where instead of reasoning with text, the VLM reasons over images annotated with sampled points,
97
+ in order to pick the best points.
98
+ In each iteration, we take the points previously selected by the VLM, resample new points around their mean, and repeat the process.
99
+
100
+ To get started, you can use the provided example image and query pairs, or
101
+ upload your own images.
102
+ This demo uses GPT-4V, so it requires an OpenAI API key.
103
+
104
+ To use the provided example images, you can right click on the image -> copy image, then click the clipboard icon in the Input Image box.
105
+
106
+ Hyperparameters to set:
107
+ * N Samples for Initialization - how many initial points are sampled for the first VIP iteration.
108
+ * N Samples for Optimization - how many points are sampled for subsequent iterations.
109
+ * N Iterations - how many optimization iterations to perform.
110
+ * N Ensemble Recursions - how many recursive ensemble rounds to run (0 disables recursion).
111
+
112
+ Note that each iteration takes roughly 10 seconds, and each additional ensemble recursion multiplies the total number of iterations.
113
+
114
+ After VIP finishes, the image gallery below will visualize VIP results throughout all the iterations.
115
+ There are two images for each iteration - the first shows all the sampled points, and the second shows the ones VIP picked.
116
+ The Info textbox will show the final selected pixel coordinate that VIP converged to.
117
+ """.strip())
118
+
119
+ gr.Markdown(
120
+ '## Example Images and Queries\n Drag images into the image box below'
121
+ )
122
+ with gr.Row(equal_height=True):
123
+ for example in examples:
124
+ gr.Image(value=example['im_path'], label=example['desc'])
125
+
126
+ gr.Markdown('## New Query')
127
+ with gr.Row():
128
+ with gr.Column():
129
+ inp_im = gr.Image(label='Input Image', type='numpy', show_label=True)
130
+ inp_query = gr.Textbox(label='Description', lines=1)
131
+
132
+ with gr.Column():
133
+ inp_openai_api_key = gr.Textbox(
134
+ label='OpenAI API Key (not saved)', lines=1
135
+ )
136
+ with gr.Group():
137
+ inp_n_samples_init = gr.Slider(
138
+ label='N Samples for Initialization',
139
+ minimum=10,
140
+ maximum=40,
141
+ value=25,
142
+ step=1,
143
+ )
144
+ inp_n_samples_opt = gr.Slider(
145
+ label='N Samples for Optimization',
146
+ minimum=3,
147
+ maximum=20,
148
+ value=10,
149
+ step=1,
150
+ )
151
+ inp_n_iters = gr.Slider(
152
+ label='N Iterations', minimum=1, maximum=5, value=3, step=1
153
+ )
154
+ inp_n_recursions = gr.Slider(
155
+ label='N Ensemble Recursions', minimum=0, maximum=3, value=0, step=1
156
+ )
157
+ btn_run = gr.Button('Run')
158
+
159
+ with gr.Group():
160
+ out_ims = gr.Gallery(
161
+ label='Images with Sampled and Chosen Points',
162
+ columns=4,
163
+ rows=1,
164
+ interactive=False,
165
+ )
166
+ out_info = gr.Textbox(label='Info', lines=1)
167
+
168
+ btn_run.click(
169
+ run_vip,
170
+ inputs=[
171
+ inp_im,
172
+ inp_query,
173
+ inp_n_samples_init,
174
+ inp_n_samples_opt,
175
+ inp_n_iters,
176
+ inp_n_recursions,
177
+ inp_openai_api_key,
178
+ ],
179
+ outputs=[out_ims, out_info],
180
+ )
181
+
182
+ demo.launch()
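For reference, a rough headless sketch of the same pipeline without the Gradio UI (the image path and API key are placeholders; `style` and `action_spec` mirror `run_vip` above):

```python
import cv2
import numpy as np
from vip_runner import vip_runner
from vlms import GPT4V

im = cv2.cvtColor(cv2.imread('ims/robot.png'), cv2.COLOR_BGR2RGB)  # placeholder image path
img_size = np.min(im.shape[:2])
style = {
    'num_samples': 12, 'circle_alpha': 0.6, 'alpha': 0.8, 'arrow_alpha': 0.0,
    'radius': int(img_size * 0.05), 'thickness': 2, 'fontsize': int(img_size * 0.05),
    'rgb_scale': 255, 'focal_offset': 1,
}
action_spec = {
    'loc': [0, 0, 0], 'scale': [0.0, 100, 100], 'min_scale': [0.0, 30, 30],
    'min': [0, -300.0, -300], 'max': [0, 300, 300], 'action_to_coord': 250, 'robot': 'meta',
}
vlm = GPT4V(openai_api_key='sk-...')  # placeholder key
ims, center, _ = vip_runner(vlm, im, 'the toy in the middle of the table',
                            style, action_spec, n_samples_init=25, n_samples_opt=10, n_iters=3)
print('Final selected coordinate:', center)
```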
requirements.txt ADDED
@@ -0,0 +1,6 @@
1
+ numpy
2
+ matplotlib
3
+ opencv-python
4
+ openai
5
+ gradio
6
+ scipy
vip.py ADDED
@@ -0,0 +1,462 @@
1
+ # pylint: disable=line-too-long
2
+ """Visual Iterative Prompting functions.
3
+
4
+ Copied from experimental/users/ichter/vip/vip.py
5
+
6
+ Code to implement visual iterative prompting, an approach for querying VLMs.
7
+ See go/visual-iterative-prompting for more information.
8
+
9
+ These are used within Colabs such as:
10
+ *
11
+ https://colab.corp.google.com/drive/1GnO-1urDCETWo3M3PpQKQ8TqT1Ql_jiS#scrollTo=5dUSoiz6Hplv
12
+ *
13
+ https://colab.corp.google.com/drive/14AYsa4W68NnsaREFTUX7lTkSxpD5eHCO#scrollTo=qA2A_oTcGTzN
14
+ *
15
+ https://colab.corp.google.com/drive/11H-WtHNYzBkr_lQpaa4ASeYy0HD29EXe#scrollTo=HapF0UIxdJM6
16
+ """
17
+
18
+ import copy
19
+ import dataclasses
20
+ import enum
21
+ import io
22
+ from typing import Optional, Tuple
23
+ import cv2
24
+ import matplotlib.pyplot as plt
25
+ import numpy as np
26
+ import scipy.stats
27
+ import vip_utils
28
+
29
+
30
+ @enum.unique
31
+ class SupportedEmbodiments(str, enum.Enum):
32
+ """Embodiments supported by VIP."""
33
+
34
+ META_MANIPULATION = 'meta_manipulation'
35
+ ALOHA_MANIPULATION = 'aloha_manipulation'
36
+ META_NAVIGATION = 'meta_navigation'
37
+
38
+
39
+ @dataclasses.dataclass()
40
+ class Coordinate:
41
+ """Coordinate with necessary information for visualizing annotation."""
42
+
43
+ # 2D image coordinates for the target annotation
44
+ xy: Tuple[int, int]
45
+ # Color and style of the coord.
46
+ color: Optional[float] = None
47
+ radius: Optional[int] = None
48
+
49
+
50
+ @dataclasses.dataclass()
51
+ class Sample:
52
+ """Single Sample mapping actions to Coordinates."""
53
+
54
+ # 2D or 3D action
55
+ action: np.ndarray
56
+ # Coordinates for the main annotation
57
+ coord: Coordinate
58
+ # Coordinates for the text label
59
+ text_coord: Coordinate
60
+ # Label to display in the text label
61
+ label: str
62
+
63
+
64
+ class VisualIterativePrompter:
65
+ """Visual Iterative Prompting class."""
66
+
67
+ def __init__(self, style, action_spec, embodiment):
68
+ self.embodiment = embodiment
69
+ self.style = style
70
+ self.action_spec = action_spec
71
+ self.fig_scale_size = None
72
+ # image preparer
73
+ # robot_to_image_canonical_coords
74
+
75
+ def action_to_coord(self, action, image, arm_xy, do_project=False):
76
+ """Converts candidate action to image coordinate."""
77
+ if (self.embodiment == SupportedEmbodiments.META_MANIPULATION or
78
+ self.embodiment == SupportedEmbodiments.ALOHA_MANIPULATION):
79
+ return self.manipulation_action_to_coord(
80
+ action=action, image=image, arm_xy=arm_xy, do_project=do_project
81
+ )
82
+ elif self.embodiment == SupportedEmbodiments.META_NAVIGATION:
83
+ return self.navigation_action_to_coord(
84
+ action=action, image=image, center_xy=arm_xy, do_project=do_project
85
+ )
86
+ else:
87
+ raise NotImplementedError('Embodiment not supported.')
88
+
89
+ def manipulation_action_to_coord(
90
+ self, action, image, arm_xy, do_project=False
91
+ ):
92
+ """Converts a ZXY or XY action to an image coordinate.
93
+
94
+ Conversion is done based on style['focal_offset'] and action_spec['scale'].
95
+
96
+ Args:
97
+ action: z, y, x action in robot action space
98
+ image: image
99
+ arm_xy: x, y in image space
100
+ do_project: whether or not to project actions sampled outside the image to
101
+ the edge of the image
102
+
103
+ Returns:
104
+ Dict coordinate with image x, y, arrow color, and circle radius.
105
+ """
106
+ # TODO(tedxiao): Refactor into common utility fns, add embodiment specific
107
+ # logic.
108
+ if self.action_spec['scale'][0] == 0: # no z dimension
109
+ norm_action = [(action[d] - self.action_spec['loc'][d]) /
110
+ (2 * self.action_spec['scale'][d]) for d in range(1, 3)]
111
+ norm_action_y, norm_action_x = norm_action
112
+ norm_action_z = 0
113
+ else:
114
+ norm_action = [(action[d] - self.action_spec['loc'][d]) /
115
+ (2 * self.action_spec['scale'][d]) for d in range(3)]
116
+ norm_action_z, norm_action_y, norm_action_x = norm_action
117
+ focal_length = np.max(
118
+ [0.2, # positive focal lengths only
119
+ self.style['focal_offset'] / (self.style['focal_offset'] + norm_action_z)])
120
+ image_x = arm_xy[0] - (
121
+ self.action_spec['action_to_coord'] * norm_action_x * focal_length
122
+ )
123
+ image_y = arm_xy[1] - (
124
+ self.action_spec['action_to_coord'] * norm_action_y * focal_length
125
+ )
126
+ if vip_utils.coord_outside_image(
127
+ coord=Coordinate(xy=(int(image_x), int(image_y))),
128
+ image=image,
129
+ radius=self.style['radius']) and do_project:
130
+ # project the arrow to the edge of the image if too large
131
+ height, width, _ = image.shape
132
+ max_x = (
133
+ width - arm_xy[0] - 2 * self.style['radius']
134
+ if norm_action_x < 0
135
+ else arm_xy[0] - 2 * self.style['radius']
136
+ )
137
+ max_y = (
138
+ height - arm_xy[1] - 2 * self.style['radius']
139
+ if norm_action_y < 0
140
+ else arm_xy[1] - 2 * self.style['radius']
141
+ )
142
+ rescale_ratio = min(np.abs([
143
+ max_x / (self.action_spec['action_to_coord'] * norm_action_x),
144
+ max_y / (self.action_spec['action_to_coord'] * norm_action_y)]))
145
+ image_x = (
146
+ arm_xy[0]
147
+ - self.action_spec['action_to_coord'] * norm_action_x * rescale_ratio
148
+ )
149
+ image_y = (
150
+ arm_xy[1]
151
+ - self.action_spec['action_to_coord'] * norm_action_y * rescale_ratio
152
+ )
153
+
154
+ # blue is out of the page, red is into the page
155
+ red_z = self.style['rgb_scale'] * ((norm_action[0] + 1) / 2)
156
+ blue_z = self.style['rgb_scale'] * (1 - (norm_action[0] + 1) / 2)
157
+ color_z = np.clip(
158
+ (red_z, 0, blue_z),
159
+ 0, self.style['rgb_scale'])
160
+ radius_z = int(np.clip((0.75 - norm_action_z / 4) * self.style['radius'],
161
+ 0.5 * self.style['radius'], self.style['radius']))
162
+ return Coordinate(
163
+ xy=(int(image_x), int(image_y)),
164
+ color=color_z,
165
+ radius=radius_z,
166
+ )
167
+
168
+ def navigation_action_to_coord(
169
+ self, action, image, center_xy, do_project=False
170
+ ):
171
+ """Converts a ZXY or XY action to an image coordinate.
172
+
173
+ Conversion is done based on style['focal_offset'] and action_spec['scale'].
174
+
175
+ Args:
176
+ action: z, y, x action in robot action space
177
+ image: image
178
+ center_xy: x, y in image space
179
+ do_project: whether or not to project actions sampled outside the image to
180
+ the edge of the image
181
+
182
+ Returns:
183
+ Dict coordinate with image x, y, arrow color, and circle radius.
184
+ """
185
+ # TODO(tedxiao): Refactor into common utility fns, add embodiment specific
186
+ # logic.
187
+ if self.action_spec['scale'][0] == 0: # no z dimension
188
+ norm_action = [(action[d] - self.action_spec['loc'][d]) /
189
+ (2 * self.action_spec['scale'][d]) for d in range(1, 3)]
190
+ norm_action_y, norm_action_x = norm_action
191
+ norm_action_z = 0
192
+ else:
193
+ norm_action = [(action[d] - self.action_spec['loc'][d]) /
194
+ (2 * self.action_spec['scale'][d]) for d in range(3)]
195
+ norm_action_z, norm_action_y, norm_action_x = norm_action
196
+ focal_length = np.max(
197
+ [0.2, # positive focal lengths only
198
+ self.style['focal_offset'] / (self.style['focal_offset'] + norm_action_z)])
199
+ image_x = center_xy[0] - (
200
+ self.action_spec['action_to_coord'] * norm_action_x * focal_length
201
+ )
202
+ image_y = center_xy[1] - (
203
+ self.action_spec['action_to_coord'] * norm_action_y * focal_length
204
+ )
205
+ if (
206
+ vip_utils.coord_outside_image(
207
+ Coordinate(xy=(image_x, image_y)), image, self.style['radius']
208
+ )
209
+ and do_project
210
+ ):
211
+ # project the arrow to the edge of the image if too large
212
+ height, width, _ = image.shape
213
+ max_x = (
214
+ width - center_xy[0] - 2 * self.style['radius']
215
+ if norm_action_x < 0
216
+ else center_xy[0] - 2 * self.style['radius']
217
+ )
218
+ max_y = (
219
+ height - center_xy[1] - 2 * self.style['radius']
220
+ if norm_action_y < 0
221
+ else center_xy[1] - 2 * self.style['radius']
222
+ )
223
+ rescale_ratio = min(np.abs([
224
+ max_x / (self.action_spec['action_to_coord'] * norm_action_x),
225
+ max_y / (self.action_spec['action_to_coord'] * norm_action_y)]))
226
+ image_x = (
227
+ center_xy[0]
228
+ - self.action_spec['action_to_coord'] * norm_action_x * rescale_ratio
229
+ )
230
+ image_y = (
231
+ center_xy[1]
232
+ - self.action_spec['action_to_coord'] * norm_action_y * rescale_ratio
233
+ )
234
+
235
+ return Coordinate(
236
+ xy=(int(image_x), int(image_y)),
237
+ color=0.1 * self.style['rgb_scale'],
238
+ radius=int(self.style['radius']),
239
+ )
240
+
241
+ def sample_actions(
242
+ self, image, arm_xy, loc, scale, true_action=None, max_itrs=1000
243
+ ):
244
+ """Sample actions from distribution.
245
+
246
+ Args:
247
+ image: image
248
+ arm_xy: x, y in image space of arm
249
+ loc: action distribution mean to sample from
250
+ scale: action distribution variance to sample from
251
+ true_action: action taken in demonstration if available
252
+ max_itrs: number of tries to get a valid sample
253
+
254
+ Returns:
255
+ samples: Samples with associated actions, coords, text_coords, labels.
256
+ """
257
+ image = copy.deepcopy(image)
258
+
259
+ samples = []
260
+ actions = []
261
+ coords = []
262
+ text_coords = []
263
+ labels = []
264
+
265
+ # Keep track of oracle action if available.
266
+ true_label = None
267
+ if true_action is not None:
268
+ actions.append(true_action)
269
+ coord = self.action_to_coord(true_action, image, arm_xy)
270
+ coords.append(coord)
271
+ text_coords.append(
272
+ vip_utils.coord_to_text_coord(coords[-1], arm_xy, coord.radius)
273
+ )
274
+ true_label = np.random.randint(self.style['num_samples'])
275
+ # labels.append(str(true_label) + '*')
276
+ labels.append(str(true_label))
277
+
278
+ # Generate all action samples.
279
+ for i in range(self.style['num_samples']):
280
+ if i == true_label:
281
+ continue
282
+ itrs = 0
283
+
284
+ # Generate action scaled appropriately.
285
+ action = np.clip(np.random.normal(loc, scale),
286
+ self.action_spec['min'], self.action_spec['max'])
287
+
288
+ # Convert sampled action to image coordinates.
289
+ coord = self.action_to_coord(action, image, arm_xy)
290
+
291
+ # Resample action if it results in invalid image annotation.
292
+ adjusted_scale = np.array(scale)
293
+ while ((vip_utils.is_invalid_coord(coord, coords, self.style['radius']*1.5, image)
294
+ or vip_utils.coord_outside_image(coord, image, self.style['radius']))
295
+ and itrs < max_itrs):
296
+ action = np.clip(np.random.normal(loc, adjusted_scale),
297
+ self.action_spec['min'], self.action_spec['max'])
298
+ coord = self.action_to_coord(action, image, arm_xy)
299
+ itrs += 1
300
+ # increase sampling range slightly if not finding a good sample
301
+ adjusted_scale *= 1.1
302
+ if itrs == max_itrs:
303
+ # If the final iteration results in invalid annotation, just clip
304
+ # to edge of image.
305
+ coord = self.action_to_coord(action, image, arm_xy, do_project=True)
306
+
307
+ # Compute image coordinates of text labels.
308
+ radius = coord.radius
309
+ text_coord = Coordinate(
310
+ xy=vip_utils.coord_to_text_coord(coord, arm_xy, radius)
311
+ )
312
+
313
+ actions.append(action)
314
+ coords.append(coord)
315
+ text_coords.append(text_coord)
316
+ labels.append(str(i))
317
+
318
+ for i in range(len(actions)):
319
+ sample = Sample(
320
+ action=actions[i],
321
+ coord=coords[i],
322
+ text_coord=text_coords[i],
323
+ label=str(i),
324
+ )
325
+ samples.append(sample)
326
+ return samples
327
+
328
+ def add_arrow_overlay_plt(self, image, samples, arm_xy, log_image=False):
329
+ """Add arrows and circles to the image.
330
+
331
+ Args:
332
+ image: image
333
+ samples: Samples to visualize.
334
+ arm_xy: x, y image coordinates for EEF center.
335
+ log_image: Boolean for whether to save to CNS.
336
+
337
+ Returns:
338
+ image: image with visual prompts.
339
+ """
340
+ # Add transparent arrows and circles
341
+ overlay = image.copy()
342
+ (original_image_height, original_image_width, _) = image.shape
343
+
344
+ white = (
345
+ self.style['rgb_scale'],
346
+ self.style['rgb_scale'],
347
+ self.style['rgb_scale'],
348
+ )
349
+
350
+ # Add arrows.
351
+ for sample in samples:
352
+ color = sample.coord.color
353
+ cv2.arrowedLine(
354
+ overlay, arm_xy, sample.coord.xy, color, self.style['thickness']
355
+ )
356
+ image = cv2.addWeighted(overlay, self.style['arrow_alpha'],
357
+ image, 1 - self.style['arrow_alpha'], 0)
358
+
359
+ overlay = image.copy()
360
+ # Add circles.
361
+ for sample in samples:
362
+ color = sample.coord.color
363
+ radius = sample.coord.radius
364
+ cv2.circle(
365
+ overlay,
366
+ sample.text_coord.xy,
367
+ radius,
368
+ color,
369
+ self.style['thickness'] + 1,
370
+ )
371
+ cv2.circle(overlay, sample.text_coord.xy, radius, white, -1)
372
+ image = cv2.addWeighted(overlay, self.style['circle_alpha'],
373
+ image, 1 - self.style['circle_alpha'], 0)
374
+
375
+ dpi = plt.rcParams['figure.dpi']
376
+ if self.fig_scale_size is None:
377
+ # test saving a figure to decide size for text figure
378
+ fig_size = (original_image_width / dpi, original_image_height / dpi)
379
+ plt.subplots(1, figsize=fig_size)
380
+ plt.imshow(image, cmap='binary')
381
+ plt.axis('off')
382
+ fig = plt.gcf()
383
+ fig.tight_layout(pad=0)
384
+ buf = io.BytesIO()
385
+ plt.savefig(buf, format='png')
386
+ plt.close()
387
+ buf.seek(0)
388
+ test_image = cv2.imdecode(
389
+ np.frombuffer(buf.getvalue(), dtype=np.uint8), cv2.IMREAD_COLOR)
390
+ self.fig_scale_size = original_image_width / test_image.shape[1]
391
+
392
+ # Add text to figure.
393
+ fig_size = (self.fig_scale_size * original_image_width / dpi,
394
+ self.fig_scale_size * original_image_height / dpi)
395
+ plt.subplots(1, figsize=fig_size)
396
+ plt.imshow(image, cmap='binary')
397
+ for sample in samples:
398
+ plt.text(
399
+ sample.text_coord.xy[0],
400
+ sample.text_coord.xy[1],
401
+ sample.label,
402
+ ha='center',
403
+ va='center',
404
+ color='k',
405
+ fontsize=self.style['fontsize'],
406
+ )
407
+
408
+ # Compile image.
409
+ plt.axis('off')
410
+ fig = plt.gcf()
411
+ fig.tight_layout(pad=0)
412
+ buf = io.BytesIO()
413
+ plt.savefig(buf, format='png')
414
+ plt.close()
415
+ image = cv2.imdecode(np.frombuffer(buf.getvalue(), dtype=np.uint8),
416
+ cv2.IMREAD_COLOR)
417
+
418
+ image = cv2.resize(image, (original_image_width, original_image_height))
419
+ image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
420
+
421
+ # Optionally log images to CNS.
422
+ if log_image:
423
+ raise NotImplementedError('TODO: log image to CNS')
424
+ return image
425
+
426
+ def fit(self, values, samples):
427
+ """Fit a loc and scale to selected actions.
428
+
429
+ Args:
430
+ values: list of selected labels
431
+ samples: list of all Samples
432
+
433
+ Returns:
434
+ loc: mean of selected distribution
435
+ scale: variance of selected distribution
436
+ """
437
+ actions = [sample.action for sample in samples]
438
+ labels = [sample.label for sample in samples]
439
+
440
+ if not values: # revert to initial distribution
441
+ print('GPT failed to return integer arrows')
442
+ loc = self.action_spec['loc']
443
+ scale = self.action_spec['scale']
444
+ elif len(values) == 1: # single response, add a distribution over it
445
+ index = np.where([label == str(values[-1]) for label in labels])[0][0]
446
+ action = actions[index]
447
+ print('action', action)
448
+ loc = action
449
+ scale = self.action_spec['min_scale']
450
+ else: # fit distribution
451
+ selected_actions = []
452
+ for value in values:
453
+ idx = np.where([label == str(value) for label in labels])[0][0]
454
+ selected_actions.append(actions[idx])
455
+ print('selected_actions', selected_actions)
456
+
457
+ loc_scale = [scipy.stats.norm.fit([action[d] for action in selected_actions]) for d in range(3)]
458
+ loc = [loc_scale[d][0] for d in range(3)]
459
+ scale = np.clip([loc_scale[d][1] for d in range(3)], self.action_spec['min_scale'], None)
460
+ print('loc', loc, '\nscale', scale)
461
+
462
+ return loc, scale
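As a quick illustration of the fitting step at the end of `fit` above, this is the per-dimension Gaussian fit plus min-scale clipping on three made-up selected actions:

```python
import numpy as np
import scipy.stats

# three hypothetical selected (z, y, x) actions
selected_actions = [np.array([0.0, 50.0, -20.0]),
                    np.array([0.0, 70.0, -10.0]),
                    np.array([0.0, 60.0, 0.0])]
min_scale = [0.0, 30, 30]  # same role as action_spec['min_scale'] in the demo

loc_scale = [scipy.stats.norm.fit([a[d] for a in selected_actions]) for d in range(3)]
loc = [loc_scale[d][0] for d in range(3)]  # per-dimension mean of the selected actions
scale = np.clip([loc_scale[d][1] for d in range(3)], min_scale, None)  # keep a minimum spread
print(loc, scale)  # the next iteration samples around this mean / std
```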
vip_runner.py ADDED
@@ -0,0 +1,163 @@
1
+ """VIP."""
2
+
3
+ import json
4
+ import re
5
+
6
+ import cv2
7
+ from tqdm import trange
8
+ import vip
9
+
10
+
11
+ def make_prompt(description, top_n=3):
12
+ return f"""
13
+ INSTRUCTIONS:
14
+ You are tasked to locate an object, region, or point in space in the given annotated image according to a description.
15
+ The image is annotated with numbered circles.
16
+ Choose the top {top_n} circles that have the most overlap with and/or are closest to what the description refers to in the image.
17
+ You are a five-time world champion in this game.
18
+ Give a one sentence analysis of why you chose those points.
19
+ Provide your answer at the end in a valid JSON of this format:
20
+
21
+ {{"points": []}}
22
+
23
+ DESCRIPTION: {description}
24
+ IMAGE:
25
+ """.strip()
26
+
27
+
28
+ def extract_json(response, key):
29
+ json_part = re.search(r"\{.*\}", response, re.DOTALL)
30
+ parsed_json = {}
31
+ if json_part:
32
+ json_data = json_part.group()
33
+ # Parse the JSON data
34
+ parsed_json = json.loads(json_data)
35
+ else:
36
+ print("No JSON data found ******\n", response)
37
+ return parsed_json.get(key, [])  # return an empty selection instead of raising KeyError when no JSON is found
38
+
39
+
40
+ def vip_perform_selection(prompter, vlm, im, desc, arm_coord, samples, top_n):
41
+ """Perform one selection pass given samples."""
42
+ image_circles_np = prompter.add_arrow_overlay_plt(
43
+ image=im, samples=samples, arm_xy=arm_coord, log_image=False
44
+ )
45
+
46
+ _, encoded_image_circles = cv2.imencode(".png", image_circles_np)
47
+
48
+ prompt_seq = [make_prompt(desc, top_n=top_n), encoded_image_circles]
49
+ response = vlm.query(prompt_seq)
50
+
51
+ arrow_ids = extract_json(response, "points")
52
+ return arrow_ids, image_circles_np
53
+
54
+
55
+ def vip_runner(
56
+ vlm,
57
+ im,
58
+ desc,
59
+ style,
60
+ action_spec,
61
+ n_samples_init=25,
62
+ n_samples_opt=10,
63
+ n_iters=3,
64
+ recursion_level=0,
65
+ ):
66
+ """VIP."""
67
+
68
+ prompter = vip.VisualIterativePrompter(
69
+ style, action_spec, vip.SupportedEmbodiments.META_NAVIGATION
70
+ )
71
+
72
+ output_ims = []
73
+ arm_coord = (int(im.shape[1] / 2), int(im.shape[0] / 2))
74
+
75
+ if recursion_level == 0:
76
+ center_mean = action_spec["loc"]
77
+ center_std = action_spec["scale"]
78
+ selected_samples = []
79
+ for itr in trange(n_iters):
80
+ if itr == 0:
81
+ style["num_samples"] = n_samples_init
82
+ else:
83
+ style["num_samples"] = n_samples_opt
84
+ samples = prompter.sample_actions(im, arm_coord, center_mean, center_std)
85
+ arrow_ids, image_circles_np = vip_perform_selection(
86
+ prompter, vlm, im, desc, arm_coord, samples, top_n=3
87
+ )
88
+
89
+ # plot sampled circles as red
90
+ selected_samples = []
91
+ for selected_id in arrow_ids:
92
+ sample = samples[selected_id]
93
+ sample.coord.color = (255, 0, 0)
94
+ selected_samples.append(sample)
95
+ image_circles_marked_np = prompter.add_arrow_overlay_plt(
96
+ image_circles_np, selected_samples, arm_coord
97
+ )
98
+ output_ims.append(image_circles_marked_np)
99
+
100
+ # if at last iteration, pick one answer out of the selected ones
101
+ if itr == n_iters - 1:
102
+ arrow_ids, _ = vip_perform_selection(
103
+ prompter, vlm, im, desc, arm_coord, selected_samples, top_n=1
104
+ )
105
+
106
+ selected_samples = []
107
+ for selected_id in arrow_ids:
108
+ sample = samples[selected_id]
109
+ sample.coord.color = (255, 0, 0)
110
+ selected_samples.append(sample)
111
+ image_circles_marked_np = prompter.add_arrow_overlay_plt(
112
+ im, selected_samples, arm_coord
113
+ )
114
+ output_ims.append(image_circles_marked_np)
115
+ center_mean, center_std = prompter.fit(arrow_ids, samples)
116
+
117
+ if output_ims:
118
+ return (
119
+ output_ims,
120
+ prompter.action_to_coord(center_mean, im, arm_coord).xy,
121
+ selected_samples,
122
+ )
123
+ else:
124
+ new_samples = []
125
+ for i in range(3):
126
+ out_ims, _, cur_samples = vip_runner(
127
+ vlm=vlm,
128
+ im=im,
129
+ desc=desc,
130
+ style=style,
131
+ action_spec=action_spec,
132
+ n_samples_init=n_samples_init,
133
+ n_samples_opt=n_samples_opt,
134
+ n_iters=n_iters,
135
+ recursion_level=recursion_level - 1,
136
+ )
137
+ output_ims += out_ims
138
+ new_samples += cur_samples
139
+ # adjust sample label to avoid duplications
140
+ for sample_id in range(len(new_samples)):
141
+ new_samples[sample_id].label = str(sample_id)
142
+ arrow_ids, _ = vip_perform_selection(
143
+ prompter, vlm, im, desc, arm_coord, new_samples, top_n=1
144
+ )
145
+
146
+ selected_samples = []
147
+ for selected_id in arrow_ids:
148
+ sample = new_samples[selected_id]
149
+ sample.coord.color = (255, 0, 0)
150
+ selected_samples.append(sample)
151
+ image_circles_marked_np = prompter.add_arrow_overlay_plt(
152
+ im, selected_samples, arm_coord
153
+ )
154
+ output_ims.append(image_circles_marked_np)
155
+ center_mean, _ = prompter.fit(arrow_ids, new_samples)
156
+
157
+ if output_ims:
158
+ return (
159
+ output_ims,
160
+ prompter.action_to_coord(center_mean, im, arm_coord).xy,
161
+ selected_samples,
162
+ )
163
+ return [], "Unable to understand query"
vip_utils.py ADDED
@@ -0,0 +1,130 @@
1
+ # pylint: disable=line-too-long
2
+ """Utils for visual iterative prompting.
3
+
4
+ A number of utility functions for VIP.
5
+ """
6
+
7
+ import copy
8
+ import re
9
+
10
+ import numpy as np
11
+ import scipy.spatial.distance as distance
12
+ import matplotlib.pyplot as plt
13
+
14
+
15
+ def min_dist(coord, coords):
16
+ if not coords:
17
+ return np.inf
18
+ xys = np.asarray([c.xy for c in coords])
19
+ return np.linalg.norm(xys - np.asarray(coord.xy), axis=-1).min()
20
+
21
+
22
+ def coord_outside_image(coord, image, radius):
23
+ (height, image_width, _) = image.shape
24
+ x, y = coord.xy
25
+ x_outside = x > image_width - 2 * radius or x < 2 * radius
26
+ y_outside = y > height - 2 * radius or y < 2 * radius
27
+ return x_outside or y_outside
28
+
29
+
30
+ def is_invalid_coord(coord, coords, radius, image):
31
+ # invalid if too close to others or outside of the image
32
+ pos_overlaps = min_dist(coord, coords) < 1.5 * radius
33
+ return pos_overlaps or coord_outside_image(coord, image, radius)
34
+
35
+
36
+ def angle_mag_2_x_y(angle, mag, arm_coord, is_circle=False, radius=40):
37
+ x, y = arm_coord
38
+ x += int(np.cos(angle) * mag)
39
+ y += int(np.sin(angle) * mag)
40
+ if is_circle:
41
+ x += int(np.cos(angle) * radius * np.sign(mag))
42
+ y += int(np.sin(angle) * radius * np.sign(mag))
43
+ return x, y
44
+
45
+
46
+ def coord_to_text_coord(coord, arm_coord, radius):
47
+ delta_coord = np.asarray(coord.xy) - arm_coord
48
+ if np.linalg.norm(delta_coord) == 0:
49
+ return arm_coord
50
+ return (
51
+ int(coord.xy[0] + radius * delta_coord[0] / np.linalg.norm(delta_coord)),
52
+ int(coord.xy[1] + radius * delta_coord[1] / np.linalg.norm(delta_coord)))
53
+
54
+
55
+ def prep_aloha_frames(real_frame):
56
+ """Prepare collage of ALOHA view frames."""
57
+ markup_frame = copy.deepcopy(real_frame)
58
+ top_frame = copy.deepcopy(markup_frame[
59
+ :int(markup_frame.shape[0] / 2), :int(markup_frame.shape[1] / 2)])
60
+ side_frame = copy.deepcopy(markup_frame[
61
+ int(markup_frame.shape[0] / 2):, :int(markup_frame.shape[1] / 2)])
62
+ right_frame = copy.deepcopy(markup_frame[
63
+ int(markup_frame.shape[0] / 2):, int(markup_frame.shape[1] / 2):])
64
+ left_frame = copy.deepcopy(markup_frame[
65
+ :int(markup_frame.shape[0] / 2), int(markup_frame.shape[1] / 2):])
66
+ markup_frame[int(markup_frame.shape[0] / 2):, :int(markup_frame.shape[1] / 2)] = left_frame
67
+ markup_frame[:int(markup_frame.shape[0] / 2), int(markup_frame.shape[1] / 2):] = side_frame
68
+ return markup_frame, right_frame, left_frame
69
+
70
+
71
+ def parse_response(response, answer_key='Arrow: ['):
72
+ values = []
73
+ if answer_key in response:
74
+ print('parse_response from answer_key')
75
+ arrow_response = response.split(answer_key)[-1].split(']')[0]
76
+ for val in map(int, re.findall(r'\d+', arrow_response)):
77
+ values.append(val)
78
+ else:
79
+ print('parse_response for all ints')
80
+ for val in map(int, re.findall(r'\d+', response)):
81
+ values.append(val)
82
+ return values
83
+
84
+
85
+ # TODO(ichter): normalize values by std
86
+ def compute_errors(action, true_action, verbose=False):
87
+ """Compute errors between a predicted action and true action."""
88
+ l2_error = np.linalg.norm(action - true_action)
89
+ cos_sim = 1 - distance.cosine(action, true_action)
90
+ l2_xy_error = np.linalg.norm(action[-2:] - true_action[-2:])
91
+ cos_xy_sim = 1 - distance.cosine(action[-2:], true_action[-2:])
92
+ z_error = np.abs(action[0] - true_action[0])
93
+ errors = {'l2': l2_error,
94
+ 'cos_sim': cos_sim,
95
+ 'l2_xy_error': l2_xy_error,
96
+ 'cos_xy_sim': cos_xy_sim,
97
+ 'z_error': z_error}
98
+
99
+ if verbose:
100
+ print('action: \t', [f'{a:.3f}' for a in action])
101
+ print('true_action \t', [f'{a:.3f}' for a in true_action])
102
+ print(f'l2: \t\t{l2_error:.3f}')
103
+ print(f'l2_xy_error: \t{l2_xy_error:.3f}')
104
+ print(f'cos_sim: \t{cos_sim:.3f}')
105
+ print(f'cos_xy_sim: \t{cos_xy_sim:.3f}')
106
+ print(f'z_error: \t{z_error:.3f}')
107
+
108
+ return errors
109
+
110
+
111
+ def plot_errors(all_errors, error_types=None):
112
+ """Plot errors across iterations."""
113
+ if error_types is None:
114
+ error_types = ['l2', 'l2_xy_error', 'z_error', 'cos_sim', 'cos_xy_sim',]
115
+
116
+ _, axs = plt.subplots(2, 3, figsize=(15, 8))
117
+ for i, error_type in enumerate(error_types): # go through each error type
118
+ all_iter_errors = {}
119
+ for error_by_iter in all_errors: # go through each call
120
+ for itr in error_by_iter: # go through each iteration
121
+ if itr in all_iter_errors: # add error to the iteration it happened
122
+ all_iter_errors[itr].append(error_by_iter[itr][error_type])
123
+ else:
124
+ all_iter_errors[itr] = [error_by_iter[itr][error_type]]
125
+
126
+ mean_iter_errors = [np.mean(all_iter_errors[itr]) for itr in all_iter_errors]
127
+
128
+ axs[i // 3, i % 3].plot(all_iter_errors.keys(), mean_iter_errors)
129
+ axs[i // 3, i % 3].set_title(error_type)
130
+ plt.show()
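For reference, a minimal call of `compute_errors` above with made-up (z, y, x) actions:

```python
import numpy as np
from vip_utils import compute_errors

predicted = np.array([0.0, 10.0, 20.0])      # hypothetical predicted action
ground_truth = np.array([0.0, 12.0, 18.0])   # hypothetical true action
errors = compute_errors(predicted, ground_truth, verbose=True)
print(errors['l2'], errors['cos_xy_sim'])
```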
vlms.py ADDED
@@ -0,0 +1,33 @@
1
+ """VLM Helper Functions."""
2
+ import base64
3
+ import numpy as np
4
+ from openai import OpenAI
5
+
6
+
7
+ class GPT4V:
8
+ """GPT4V VLM."""
9
+
10
+ def __init__(self, openai_api_key):
11
+ self.client = OpenAI(api_key=openai_api_key)
12
+
13
+ def query(self, prompt_seq, temperature=0, max_tokens=512):
14
+ """Queries GPT-4V."""
15
+ content = []
16
+ for elem in prompt_seq:
17
+ if isinstance(elem, str):
18
+ content.append({'type': 'text', 'text': elem})
19
+ elif isinstance(elem, np.ndarray):
20
+ base64_image_str = base64.b64encode(elem).decode('utf-8')
21
+ image_url = f'data:image/jpeg;base64,{base64_image_str}'
22
+ content.append({'type': 'image_url', 'image_url': {'url': image_url}})
23
+
24
+ messages = [{'role': 'user', 'content': content}]
25
+
26
+ response = self.client.chat.completions.create(
27
+ model='gpt-4-vision-preview',
28
+ messages=messages,
29
+ temperature=temperature,
30
+ max_tokens=max_tokens
31
+ )
32
+
33
+ return response.choices[0].message.content
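A minimal usage sketch for the `GPT4V` wrapper above (the key and image path are placeholders; the array passed in the prompt sequence must already be an encoded image buffer, as produced by `cv2.imencode` in vip_runner.py):

```python
import cv2
from vlms import GPT4V

vlm = GPT4V(openai_api_key='sk-...')        # placeholder key
image = cv2.imread('ims/robot.png')         # placeholder image
_, encoded = cv2.imencode('.png', image)    # encoded image bytes as an np.ndarray
print(vlm.query(['Describe what is on the table.', encoded]))
```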