Use PyTorch to resize and clip images to reduce GPU memory usage.

Restore `resize_and_pad` to its original version from the Molmo code so that images are resized with PyTorch utilities. Resizing images with TensorFlow causes a large increase in GPU memory usage, which makes inference impossible on an RTX 4090 with 24 GB of memory and defeats the purpose of this project.

Files changed (1) hide show

image_preprocessing_molmo.py +20 -20

image_preprocessing_molmo.py CHANGED Viewed

@@ -85,26 +85,26 @@ def resize_and_pad(
     scaled_height = int(np.array(height, np.float32) * image_scale)
     scaled_width = int(np.array(width, np.float32) * image_scale)
-    # if resize_method == "tensorflow":
-    #     FIXME remove
-    import tensorflow as tf
-    image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
-    image = tf.image.resize(
-        image,
-        [scaled_height, scaled_width],
-        method=tf.image.ResizeMethod.BILINEAR,
-        antialias=True,
-    )
-    image = tf.clip_by_value(image, 0.0, 1.0)
-    image = image.numpy()
-    # else:
-    #     image = torch.permute(torch.from_numpy(image), [2, 0, 1])
-    #     image = convert_image_dtype(image)  # resize in flaot32
-    #     image = torchvision.transforms.Resize(
-    #         [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
-    #     )(image)
-    #     image = torch.clip(image, 0.0, 1.0)
-    #     image = torch.permute(image, [1, 2, 0]).numpy()
     top_pad = (desired_height - scaled_height) // 2
     left_pad = (desired_width - scaled_width) // 2

     scaled_height = int(np.array(height, np.float32) * image_scale)
     scaled_width = int(np.array(width, np.float32) * image_scale)
+    if resize_method == "tensorflow":
+        # this option leads to large gpu mem increase likely due to how tensorflow handle memory allocation
+        import tensorflow as tf
+        image = tf.image.convert_image_dtype(tf.constant(image), dtype=tf.float32)
+        image = tf.image.resize(
+            image,
+            [scaled_height, scaled_width],
+            method=tf.image.ResizeMethod.BILINEAR,
+            antialias=True,
+        )
+        image = tf.clip_by_value(image, 0.0, 1.0)
+        image = image.numpy()
+    else:
+        image = torch.permute(torch.from_numpy(image), [2, 0, 1])
+        image = convert_image_dtype(image)  # resize in flaot32
+        image = torchvision.transforms.Resize(
+            [scaled_height, scaled_width], InterpolationMode.BILINEAR, antialias=True
+        )(image)
+        image = torch.clip(image, 0.0, 1.0)
+        image = torch.permute(image, [1, 2, 0]).numpy()
     top_pad = (desired_height - scaled_height) // 2
     left_pad = (desired_width - scaled_width) // 2