Upload 5 files
- app.py +1 -1
- vip.py +74 -139
- vip_runner.py +2 -2
- vip_utils.py +21 -29
app.py
CHANGED
@@ -49,7 +49,7 @@ def run_vip(
       'min': [0, -300.0, -300],
       'max': [0, 300, 300],
       'action_to_coord': 250,
-      'robot':
+      'robot': None,
   }
 
   vlm = GPT4V(openai_api_key=openai_api_key)
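For context, a minimal sketch of how app.py plausibly wires this spec into the prompter. The style keys are the ones vip.py reads (radius, thickness, arrow_alpha, circle_alpha, focal_offset, rgb_scale), but every concrete value below, and the min_scale entry, is an illustrative assumption rather than something this commit pins down:

# Hypothetical setup mirroring app.py; values are assumptions.
import vip

style = {
    'radius': 14,          # annotation circle radius, px (assumed)
    'thickness': 2,        # arrow/circle line thickness (assumed)
    'arrow_alpha': 0.6,    # arrow overlay opacity (assumed)
    'circle_alpha': 0.9,   # circle overlay opacity (assumed)
    'focal_offset': 1.0,   # depth foreshortening strength (assumed)
    'rgb_scale': 255,      # color channel max (assumed)
}
action_spec = {
    'loc': [0, 0, 0],
    'scale': [0, 100, 100],    # scale[0] == 0 means "no z dimension"
    'min_scale': [0, 10, 10],  # floor used by fit(); assumed values
    'min': [0, -300.0, -300],
    'max': [0, 300, 300],
    'action_to_coord': 250,    # px per unit of normalized action
    'robot': None,
}
prompter = vip.VisualIterativePrompter(
    style, action_spec, vip.SupportedEmbodiments.HF_DEMO
)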
vip.py
CHANGED
@@ -1,18 +1,6 @@
-# pylint: disable=line-too-long
 """Visual Iterative Prompting functions.
 
-Copied from experimental/users/ichter/vip/vip.py
-
 Code to implement visual iterative prompting, an approach for querying VLMs.
-See go/visual-iterative-prompting for more information.
-
-These are used within Colabs such as:
-*
-https://colab.corp.google.com/drive/1GnO-1urDCETWo3M3PpQKQ8TqT1Ql_jiS#scrollTo=5dUSoiz6Hplv
-*
-https://colab.corp.google.com/drive/14AYsa4W68NnsaREFTUX7lTkSxpD5eHCO#scrollTo=qA2A_oTcGTzN
-*
-https://colab.corp.google.com/drive/11H-WtHNYzBkr_lQpaa4ASeYy0HD29EXe#scrollTo=HapF0UIxdJM6
 """
 
 import copy
@@ -31,9 +19,7 @@ import vip_utils
 class SupportedEmbodiments(str, enum.Enum):
   """Embodiments supported by VIP."""
 
-
-  ALOHA_MANIPULATION = 'aloha_manipulation'
-  META_NAVIGATION = 'meta_navigation'
+  HF_DEMO = 'hf_demo'
 
 
 @dataclasses.dataclass()
@@ -74,95 +60,8 @@ class VisualIterativePrompter:
 
   def action_to_coord(self, action, image, arm_xy, do_project=False):
     """Converts candidate action to image coordinate."""
-
-    if self.embodiment == SupportedEmbodiments.ALOHA_MANIPULATION:
-      return self.manipulation_action_to_coord(
-          action=action, image=image, arm_xy=arm_xy, do_project=do_project
-      )
-    elif self.embodiment == SupportedEmbodiments.META_NAVIGATION:
-      return self.navigation_action_to_coord(
-          action=action, image=image, center_xy=arm_xy, do_project=do_project
-      )
-    else:
-      raise NotImplementedError('Embodiment not supported.')
-
-  def manipulation_action_to_coord(
-      self, action, image, arm_xy, do_project=False
-  ):
-    """Converts a ZXY or XY action to an image coordinate.
-
-    Conversion is done based on style['focal_offset'] and action_spec['scale'].
-
-    Args:
-      action: z, y, x action in robot action space
-      image: image
-      arm_xy: x, y in image space
-      do_project: whether or not to project actions sampled outside the image to
-        the edge of the image
-
-    Returns:
-      Dict coordinate with image x, y, arrow color, and circle radius.
-    """
-    # TODO(tedxiao): Refactor into common utiliy fns, add embodiment specific
-    # logic.
-    if self.action_spec['scale'][0] == 0:  # no z dimension
-      norm_action = [(action[d] - self.action_spec['loc'][d]) /
-                     (2 * self.action_spec['scale'][d]) for d in range(1, 3)]
-      norm_action_y, norm_action_x = norm_action
-      norm_action_z = 0
-    else:
-      norm_action = [(action[d] - self.action_spec['loc'][d]) /
-                     (2 * self.action_spec['scale'][d]) for d in range(3)]
-      norm_action_z, norm_action_y, norm_action_x = norm_action
-    focal_length = np.max(
-        [0.2,  # positive focal lengths only
-         self.style['focal_offset'] / (self.style['focal_offset'] + norm_action_z)])
-    image_x = arm_xy[0] - (
-        self.action_spec['action_to_coord'] * norm_action_x * focal_length
-    )
-    image_y = arm_xy[1] - (
-        self.action_spec['action_to_coord'] * norm_action_y * focal_length
-    )
-    if vip_utils.coord_outside_image(
-        coord=Coordinate(xy=(int(image_x), int(image_y))),
-        image=image,
-        radius=self.style['radius']) and do_project:
-      # project the arrow to the edge of the image if too large
-      height, width, _ = image.shape
-      max_x = (
-          width - arm_xy[0] - 2 * self.style['radius']
-          if norm_action_x < 0
-          else arm_xy[0] - 2 * self.style['radius']
-      )
-      max_y = (
-          height - arm_xy[1] - 2 * self.style['radius']
-          if norm_action_y < 0
-          else arm_xy[1] - 2 * self.style['radius']
-      )
-      rescale_ratio = min(np.abs([
-          max_x / (self.action_spec['action_to_coord'] * norm_action_x),
-          max_y / (self.action_spec['action_to_coord'] * norm_action_y)]))
-      image_x = (
-          arm_xy[0]
-          - self.action_spec['action_to_coord'] * norm_action_x * rescale_ratio
-      )
-      image_y = (
-          arm_xy[1]
-          - self.action_spec['action_to_coord'] * norm_action_y * rescale_ratio
-      )
-
-    # blue is out of the page, red is into the page
-    red_z = self.style['rgb_scale'] * ((norm_action[0] + 1) / 2)
-    blue_z = self.style['rgb_scale'] * (1 - (norm_action[0] + 1) / 2)
-    color_z = np.clip(
-        (red_z, 0, blue_z),
-        0, self.style['rgb_scale'])
-    radius_z = int(np.clip((0.75 - norm_action_z / 4) * self.style['radius'],
-                           0.5 * self.style['radius'], self.style['radius']))
-    return Coordinate(
-        xy=(int(image_x), int(image_y)),
-        color=color_z,
-        radius=radius_z,
+    return self.navigation_action_to_coord(
+        action=action, image=image, center_xy=arm_xy, do_project=do_project
     )
 
   def navigation_action_to_coord(
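The deleted manipulation branch and the surviving navigation path share one mapping: normalize each action dimension to roughly [-0.5, 0.5] via (a - loc) / (2 * scale), shrink by a z-dependent focal factor, then offset from the arm/center pixel. A self-contained restatement of that arithmetic; action_to_pixel is a hypothetical helper, and the constants are illustrative, not the repo's defaults:

import numpy as np

def action_to_pixel(action, center_xy, loc, scale, action_to_coord=250,
                    focal_offset=1.0):
    """Map a (z, y, x) action to an (x, y) pixel, as in vip.py."""
    # Normalize each dimension to roughly [-0.5, 0.5].
    norm = [(action[d] - loc[d]) / (2 * scale[d]) for d in range(3)]
    norm_z, norm_y, norm_x = norm
    # Far (positive-z) actions get a smaller focal factor, so their
    # arrows draw shorter; the 0.2 floor keeps the factor positive.
    focal = max(0.2, focal_offset / (focal_offset + norm_z))
    image_x = center_xy[0] - action_to_coord * norm_x * focal
    image_y = center_xy[1] - action_to_coord * norm_y * focal
    return int(image_x), int(image_y)

# E.g. an action 100 units along +x with scale 100 normalizes to 0.5 and,
# with focal factor 1, lands 125 px left of the center pixel: (195, 240).
# (scale[0] is 1 here, not 0, to keep the sketch free of a zero division.)
print(action_to_pixel([0, 0, 100], (320, 240), [0, 0, 0], [1, 100, 100]))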
@@ -182,20 +81,26 @@
     Returns:
       Dict coordinate with image x, y, arrow color, and circle radius.
     """
-    # TODO(tedxiao): Refactor into common utiliy fns, add embodiment specific
-    # logic.
     if self.action_spec['scale'][0] == 0:  # no z dimension
-      norm_action = [
-          (action[d] - self.action_spec['loc'][d]) / (2 * self.action_spec['scale'][d]) for d in range(1, 3)]
+      norm_action = [
+          (action[d] - self.action_spec['loc'][d])
+          / (2 * self.action_spec['scale'][d])
+          for d in range(1, 3)
+      ]
       norm_action_y, norm_action_x = norm_action
       norm_action_z = 0
     else:
-      norm_action = [
-          (action[d] - self.action_spec['loc'][d]) / (2 * self.action_spec['scale'][d]) for d in range(3)]
+      norm_action = [
+          (action[d] - self.action_spec['loc'][d])
+          / (2 * self.action_spec['scale'][d])
+          for d in range(3)
+      ]
       norm_action_z, norm_action_y, norm_action_x = norm_action
-    focal_length = np.max(
-        [0.2,  # positive focal lengths only
-         self.style['focal_offset'] / (self.style['focal_offset'] + norm_action_z)])
+    focal_length = np.max([
+        0.2,  # positive focal lengths only
+        self.style['focal_offset']
+        / (self.style['focal_offset'] + norm_action_z),
+    ])
     image_x = center_xy[0] - (
         self.action_spec['action_to_coord'] * norm_action_x * focal_length
     )
@@ -220,9 +125,12 @@
           if norm_action_y < 0
           else center_xy[1] - 2 * self.style['radius']
       )
-      rescale_ratio = min(
-          np.abs([max_x / (self.action_spec['action_to_coord'] * norm_action_x),
-                  max_y / (self.action_spec['action_to_coord'] * norm_action_y)]))
+      rescale_ratio = min(
+          np.abs([
+              max_x / (self.action_spec['action_to_coord'] * norm_action_x),
+              max_y / (self.action_spec['action_to_coord'] * norm_action_y),
+          ])
+      )
       image_x = (
           center_xy[0]
           - self.action_spec['action_to_coord'] * norm_action_x * rescale_ratio
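When do_project is set and the arrow would leave the frame, this block shrinks it so the tip stops about two radii inside the nearest edge; the ratio is simply the largest allowed pixel run over the requested one. In miniature (edge_rescale_ratio is a hypothetical helper; it assumes nonzero norm components, as the guarded call site does):

import numpy as np

def edge_rescale_ratio(center_xy, norm_x, norm_y, width, height,
                       action_to_coord=250, radius=14):
    """Shrink factor keeping an arrow tip ~2 radii inside the image."""
    # Positive norm moves left/up (pixel = center - action_to_coord * norm),
    # so the budget flips sign with the direction of travel.
    max_x = (width - center_xy[0] - 2 * radius if norm_x < 0
             else center_xy[0] - 2 * radius)
    max_y = (height - center_xy[1] - 2 * radius if norm_y < 0
             else center_xy[1] - 2 * radius)
    return min(np.abs([max_x / (action_to_coord * norm_x),
                       max_y / (action_to_coord * norm_y)]))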
@@ -282,19 +190,28 @@
       itrs = 0
 
       # Generate action scaled appropriately.
-      action = np.clip(
-          np.random.normal(loc, scale), self.action_spec['min'], self.action_spec['max'])
+      action = np.clip(
+          np.random.normal(loc, scale),
+          self.action_spec['min'],
+          self.action_spec['max'],
+      )
 
       # Convert sampled action to image coordinates.
       coord = self.action_to_coord(action, image, arm_xy)
 
       # Resample action if it results in invalid image annotation.
       adjusted_scale = np.array(scale)
-      while (
-          vip_utils.is_invalid_coord(coord, coords, self.style['radius'] * 1.5, image)
-          or vip_utils.coord_outside_image(coord, image, self.style['radius'])
-      ) and itrs < max_itrs:
-        action = np.clip(np.random.normal(loc, adjusted_scale), self.action_spec['min'], self.action_spec['max'])
+      while (
+          vip_utils.is_invalid_coord(
+              coord, coords, self.style['radius'] * 1.5, image
+          )
+          or vip_utils.coord_outside_image(coord, image, self.style['radius'])
+      ) and itrs < max_itrs:
+        action = np.clip(
+            np.random.normal(loc, adjusted_scale),
+            self.action_spec['min'],
+            self.action_spec['max'],
+        )
         coord = self.action_to_coord(action, image, arm_xy)
         itrs += 1
         # increase sampling range slightly if not finding a good sample
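This loop is rejection sampling: draw from a clipped Gaussian, then redraw while the candidate annotation collides with earlier ones or falls outside the frame. A standalone sketch of the pattern, with sample_valid and the toy is_valid predicate standing in for vip_utils.is_invalid_coord / coord_outside_image; the 1.1 growth factor is an assumption, since the exact factor sits outside this hunk:

import numpy as np

def sample_valid(loc, scale, lo, hi, is_valid, max_itrs=100):
    """Draw from a clipped Gaussian, redrawing until is_valid accepts."""
    action = np.clip(np.random.normal(loc, scale), lo, hi)
    adjusted_scale = np.array(scale, dtype=float)
    itrs = 0
    while not is_valid(action) and itrs < max_itrs:
        # Widen the proposal slightly so a crowded image still yields
        # usable samples; vip.py grows adjusted_scale for the same reason.
        adjusted_scale *= 1.1
        action = np.clip(np.random.normal(loc, adjusted_scale), lo, hi)
        itrs += 1
    return action  # fall back to the last draw after max_itrs

# Toy validity check: keep actions whose y component is well away from 0.
action = sample_valid(
    loc=[0, 0, 0], scale=[0, 100, 100], lo=[0, -300, -300],
    hi=[0, 300, 300], is_valid=lambda a: abs(a[1]) > 10,
)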
@@ -325,7 +242,7 @@
       samples.append(sample)
     return samples
 
-  def add_arrow_overlay_plt(self, image, samples, arm_xy
+  def add_arrow_overlay_plt(self, image, samples, arm_xy):
     """Add arrows and circles to the image.
 
     Args:
@@ -353,8 +270,13 @@
       cv2.arrowedLine(
           overlay, arm_xy, sample.coord.xy, color, self.style['thickness']
      )
-    image = cv2.addWeighted(
-        overlay, self.style['arrow_alpha'], image, 1 - self.style['arrow_alpha'], 0)
+    image = cv2.addWeighted(
+        overlay,
+        self.style['arrow_alpha'],
+        image,
+        1 - self.style['arrow_alpha'],
+        0,
+    )
 
     overlay = image.copy()
     # Add circles.
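cv2.addWeighted computes a per-pixel weighted sum, dst = src1 * alpha + src2 * beta + gamma, so this call blends the arrow layer onto the frame at arrow_alpha opacity (and the circle layer below at circle_alpha). A tiny sketch; alpha here is a stand-in for style['arrow_alpha']:

import cv2
import numpy as np

frame = np.zeros((100, 100, 3), dtype=np.uint8)
overlay = frame.copy()
cv2.arrowedLine(overlay, (10, 50), (90, 50), (0, 0, 255), 2)

alpha = 0.6  # stand-in for style['arrow_alpha']
frame = cv2.addWeighted(overlay, alpha, frame, 1 - alpha, 0)
# Arrow pixels are now 60% arrow color, 40% original frame.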
@@ -369,8 +291,13 @@
             self.style['thickness'] + 1,
         )
       cv2.circle(overlay, sample.text_coord.xy, radius, white, -1)
-    image = cv2.addWeighted(
-        overlay, self.style['circle_alpha'], image, 1 - self.style['circle_alpha'], 0)
+    image = cv2.addWeighted(
+        overlay,
+        self.style['circle_alpha'],
+        image,
+        1 - self.style['circle_alpha'],
+        0,
+    )
 
     dpi = plt.rcParams['figure.dpi']
     if self.fig_scale_size is None:
@@ -386,12 +313,15 @@
       plt.close()
       buf.seek(0)
       test_image = cv2.imdecode(
-          np.frombuffer(buf.getvalue(), dtype=np.uint8), cv2.IMREAD_COLOR)
+          np.frombuffer(buf.getvalue(), dtype=np.uint8), cv2.IMREAD_COLOR
+      )
       self.fig_scale_size = original_image_width / test_image.shape[1]
 
     # Add text to figure.
-    fig_size = (
-        self.fig_scale_size * original_image_width / dpi, self.fig_scale_size * original_image_height / dpi)
+    fig_size = (
+        self.fig_scale_size * original_image_width / dpi,
+        self.fig_scale_size * original_image_height / dpi,
+    )
     plt.subplots(1, figsize=fig_size)
     plt.imshow(image, cmap='binary')
     for sample in samples:
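The fig_scale_size logic calibrates matplotlib's output against the source image: render once, measure how much narrower the saved PNG came out than the original, then multiply figsize by that ratio so the real render returns at roughly native resolution. A sketch of the calibration step under the same buf/plt flow (calibrate_fig_scale is a hypothetical helper):

import io

import cv2
import matplotlib.pyplot as plt
import numpy as np

def calibrate_fig_scale(image):
    """Width ratio between the input image and a test matplotlib render."""
    dpi = plt.rcParams['figure.dpi']
    height, width = image.shape[:2]
    plt.subplots(1, figsize=(width / dpi, height / dpi))
    plt.imshow(image, cmap='binary')
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    plt.close()
    buf.seek(0)
    test = cv2.imdecode(np.frombuffer(buf.getvalue(), np.uint8),
                        cv2.IMREAD_COLOR)
    # A ratio above 1 means matplotlib shrank the image (margins, dpi
    # rounding), so the real render scales figsize up by this factor.
    return width / test.shape[1]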
@@ -412,15 +342,13 @@
     buf = io.BytesIO()
     plt.savefig(buf, format='png')
     plt.close()
-    image = cv2.imdecode(
-        np.frombuffer(buf.getvalue(), dtype=np.uint8), cv2.IMREAD_COLOR)
+    image = cv2.imdecode(
+        np.frombuffer(buf.getvalue(), dtype=np.uint8), cv2.IMREAD_COLOR
+    )
 
     image = cv2.resize(image, (original_image_width, original_image_height))
     image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
 
-    # Optionally log images to CNS.
-    if log_image:
-      raise NotImplementedError('TODO: log image too CNS')
     return image
 
   def fit(self, values, samples):
@@ -446,7 +374,7 @@
       action = actions[index]
       print('action', action)
       loc = action
-      scale = self.action_spec[
+      scale = self.action_spec['min_scale']
     else:  # fit distribution
       selected_actions = []
       for value in values:
@@ -454,9 +382,16 @@
         selected_actions.append(actions[idx])
       print('selected_actions', selected_actions)
 
-      loc_scale = [scipy.stats.norm.fit([action[d] for action in selected_actions]) for d in range(3)]
+      loc_scale = [
+          scipy.stats.norm.fit([action[d] for action in selected_actions])
+          for d in range(3)
+      ]
       loc = [loc_scale[d][0] for d in range(3)]
-      scale = np.clip([loc_scale[d][1] for d in range(3)], self.action_spec['min_scale'], None)
+      scale = np.clip(
+          [loc_scale[d][1] for d in range(3)],
+          self.action_spec['min_scale'],
+          None,
+      )
       print('loc', loc, '\nscale', scale)
 
     return loc, scale
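fit() turns the VLM-selected samples into the next proposal distribution: a per-dimension Gaussian MLE over the chosen actions, with the fitted spread floored at min_scale so iteration never collapses to a point (scipy.stats.norm.fit returns the (mean, std) MLE pair). The same update as a standalone sketch, with fit_loc_scale a hypothetical helper and the sample actions invented:

import numpy as np
import scipy.stats

def fit_loc_scale(selected_actions, min_scale):
    """Fit per-dimension Gaussians to the selected (z, y, x) actions."""
    loc_scale = [
        scipy.stats.norm.fit([action[d] for action in selected_actions])
        for d in range(3)
    ]
    loc = [loc_scale[d][0] for d in range(3)]
    # Floor the std so later sampling rounds keep exploring.
    scale = np.clip([loc_scale[d][1] for d in range(3)], min_scale, None)
    return loc, scale

loc, scale = fit_loc_scale(
    [[0, 10, 50], [0, 30, 70], [0, 20, 90]], min_scale=[0, 5, 5]
)
print(loc, scale)  # means ~[0, 20, 70]; stds floored at [0, 5, 5]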
vip_runner.py
CHANGED
@@ -41,7 +41,7 @@ def extract_json(response, key):
 def vip_perform_selection(prompter, vlm, im, desc, arm_coord, samples, top_n):
   """Perform one selection pass given samples."""
   image_circles_np = prompter.add_arrow_overlay_plt(
-      image=im, samples=samples, arm_xy=arm_coord
+      image=im, samples=samples, arm_xy=arm_coord
   )
 
   _, encoded_image_circles = cv2.imencode(".png", image_circles_np)
@@ -71,7 +71,7 @@ def vip_runner(
   """VIP."""
 
   prompter = vip.VisualIterativePrompter(
-      style, action_spec, vip.SupportedEmbodiments.
+      style, action_spec, vip.SupportedEmbodiments.HF_DEMO
   )
 
   output_ims = []
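For orientation, one selection pass around these calls plausibly looks like the sketch below. add_arrow_overlay_plt and the cv2.imencode step come from the diff; selection_pass, vlm.query, and the prompt string are placeholders for whatever vip_runner actually sends (desc and top_n are omitted here):

import cv2

def selection_pass(prompter, vlm, im, arm_coord, samples):
    # Render candidate arrows onto the frame (method from vip.py).
    image_circles_np = prompter.add_arrow_overlay_plt(
        image=im, samples=samples, arm_xy=arm_coord
    )
    # Encode for the VLM, as vip_perform_selection does.
    _, encoded = cv2.imencode('.png', image_circles_np)
    # Hypothetical call: the real prompt and parsing live in vip_runner.py.
    response = vlm.query(encoded.tobytes(), 'Which arrow best fits the task?')
    return response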
vip_utils.py
CHANGED
@@ -1,15 +1,13 @@
-# pylint: disable=line-too-long
 """Utils for visual iterative prompting.
 
 A number of utility functions for VIP.
 """
 
-import copy
 import re
 
+import matplotlib.pyplot as plt
 import numpy as np
 import scipy.spatial.distance as distance
-import matplotlib.pyplot as plt
 
 
 def min_dist(coord, coords):
@@ -49,23 +47,8 @@ def coord_to_text_coord(coord, arm_coord, radius):
     return arm_coord
   return (
       int(coord.xy[0] + radius * delta_coord[0] / np.linalg.norm(delta_coord)),
-      int(coord.xy[1] + radius * delta_coord[1] / np.linalg.norm(delta_coord))
-
-
-def prep_aloha_frames(real_frame):
-  """Prepare collage of ALOHA view frames."""
-  markup_frame = copy.deepcopy(real_frame)
-  top_frame = copy.deepcopy(markup_frame[
-      :int(markup_frame.shape[0] / 2), :int(markup_frame.shape[1] / 2)])
-  side_frame = copy.deepcopy(markup_frame[
-      int(markup_frame.shape[0] / 2):, :int(markup_frame.shape[1] / 2)])
-  right_frame = copy.deepcopy(markup_frame[
-      int(markup_frame.shape[0] / 2):, int(markup_frame.shape[1] / 2):])
-  left_frame = copy.deepcopy(markup_frame[
-      :int(markup_frame.shape[0] / 2), int(markup_frame.shape[1] / 2):])
-  markup_frame[int(markup_frame.shape[0] / 2):, :int(markup_frame.shape[1] / 2)] = left_frame
-  markup_frame[:int(markup_frame.shape[0] / 2), int(markup_frame.shape[1] / 2):] = side_frame
-  return markup_frame, right_frame, left_frame
+      int(coord.xy[1] + radius * delta_coord[1] / np.linalg.norm(delta_coord)),
+  )
 
 
 def parse_response(response, answer_key='Arrow: ['):
@@ -82,7 +65,6 @@ def parse_response(response, answer_key='Arrow: ['):
   return values
 
 
-# TODO(ichter): normalize values by std
 def compute_errors(action, true_action, verbose=False):
   """Compute errors between a predicted action and true action."""
   l2_error = np.linalg.norm(action - true_action)
@@ -90,11 +72,13 @@ def compute_errors(action, true_action, verbose=False):
   l2_xy_error = np.linalg.norm(action[-2:] - true_action[-2:])
   cos_xy_sim = 1 - distance.cosine(action[-2:], true_action[-2:])
   z_error = np.abs(action[0] - true_action[0])
-  errors = {
-      'l2': l2_error, 'cos_sim': cos_sim,
-      'l2_xy_error': l2_xy_error,
-      'cos_xy_sim': cos_xy_sim,
-      'z_error': z_error}
+  errors = {
+      'l2': l2_error,
+      'cos_sim': cos_sim,
+      'l2_xy_error': l2_xy_error,
+      'cos_xy_sim': cos_xy_sim,
+      'z_error': z_error,
+  }
 
   if verbose:
     print('action: \t', [f'{a:.3f}' for a in action])
@@ -111,19 +95,27 @@
 def plot_errors(all_errors, error_types=None):
   """Plot errors across iterations."""
   if error_types is None:
-    error_types = ['l2', 'l2_xy_error', 'z_error', 'cos_sim', 'cos_xy_sim']
+    error_types = [
+        'l2',
+        'l2_xy_error',
+        'z_error',
+        'cos_sim',
+        'cos_xy_sim',
+    ]
 
   _, axs = plt.subplots(2, 3, figsize=(15, 8))
   for i, error_type in enumerate(error_types):  # go through each error type
     all_iter_errors = {}
     for error_by_iter in all_errors:  # go through each call
      for itr in error_by_iter:  # go through each iteration
-        if itr in all_iter_errors:
+        if itr in all_iter_errors:  # add error to the iteration it happened
          all_iter_errors[itr].append(error_by_iter[itr][error_type])
        else:
          all_iter_errors[itr] = [error_by_iter[itr][error_type]]
 
-    mean_iter_errors = [np.mean(all_iter_errors[itr]) for itr in all_iter_errors]
+    mean_iter_errors = [
+        np.mean(all_iter_errors[itr]) for itr in all_iter_errors
+    ]
 
     axs[i // 3, i % 3].plot(all_iter_errors.keys(), mean_iter_errors)
     axs[i // 3, i % 3].set_title(error_type)
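To close the loop, compute_errors and plot_errors combine as below: each call to the pipeline produces a dict keyed by iteration, each value being the metrics dict assembled above, and plot_errors consumes a list of such dicts. The (z, y, x) action vectors here are invented purely to exercise the metrics:

import numpy as np
from vip_utils import compute_errors, plot_errors

true_action = np.array([0.0, 20.0, 70.0])
errors_by_iter = {}
for itr, pred in enumerate([np.array([0.0, 5.0, 40.0]),
                            np.array([0.0, 15.0, 60.0]),
                            np.array([0.0, 19.0, 68.0])]):
    # Each iteration's prediction should drift toward the true action,
    # so l2 falls and cos_sim rises across the plotted iterations.
    errors_by_iter[itr] = compute_errors(pred, true_action)

plot_errors([errors_by_iter])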