FLUX.1-inpaint-dev

Running on Zero

App Files Files Community

SkalskiP committed on Aug 14, 2024

Commit

741875a

1 Parent(s): 4796c10

Revert "Florence-2 + SAM2 + FLUX.1"

Browse files

This reverts commit b38c358bbb73c6626d065b797723ecdb9954331a.

Files changed (10) hide show

.gitattributes +0 -1
app.py +87 -71
configs/__init__.py +0 -5
configs/sam2_hiera_b+.yaml +0 -113
configs/sam2_hiera_l.yaml +0 -117
configs/sam2_hiera_s.yaml +0 -116
configs/sam2_hiera_t.yaml +0 -118
requirements.txt +1 -8
utils/florence.py +0 -54
utils/sam.py +0 -45

.gitattributes CHANGED Viewed

@@ -33,4 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
-checkpoints/ filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,18 +1,14 @@
 from typing import Tuple
-import supervision as sv
 import random
 import numpy as np
 import gradio as gr
 import spaces
 import torch
-from PIL import Image, ImageFilter
 from diffusers import FluxInpaintPipeline
-from utils.florence import load_florence_model, run_florence_inference, \
-    FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
-from utils.sam import load_sam_image_model, run_sam_inference
 MARKDOWN = """
 # FLUX.1 Inpainting 🔥
@@ -23,16 +19,52 @@ for taking it to the next level by enabling inpainting with the FLUX.
 MAX_SEED = np.iinfo(np.int32).max
 IMAGE_SIZE = 1024
-DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-torch.autocast(device_type="cuda", dtype=torch.bfloat16).__enter__()
-if torch.cuda.get_device_properties(0).major >= 8:
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-FLORENCE_MODEL, FLORENCE_PROCESSOR = load_florence_model(device=DEVICE)
-SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
-FLUX_INPAINTING_PIPELINE = FluxInpaintPipeline.from_pretrained(
     "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16).to(DEVICE)
@@ -42,6 +74,11 @@ def resize_image_dimensions(
 ) -> Tuple[int, int]:
     width, height = original_resolution_wh
     if width > height:
         scaling_factor = maximum_dimension / width
     else:
@@ -56,20 +93,17 @@ def resize_image_dimensions(
     return new_width, new_height
-@spaces.GPU(duration=150)
-@torch.inference_mode()
-@torch.autocast(device_type="cuda", dtype=torch.bfloat16)
 def process(
     input_image_editor: dict,
-    inpainting_prompt_text: str,
-    segmentation_prompt_text: str,
     seed_slicer: int,
     randomize_seed_checkbox: bool,
     strength_slider: float,
     num_inference_steps_slider: int,
     progress=gr.Progress(track_tqdm=True)
 ):
-    if not inpainting_prompt_text:
         gr.Info("Please enter a text prompt.")
         return None, None
@@ -80,50 +114,21 @@ def process(
         gr.Info("Please upload an image.")
         return None, None
-    if not mask and not segmentation_prompt_text:
-        gr.Info("Please draw a mask or enter a segmentation prompt.")
-        return None, None
-    if mask and segmentation_prompt_text:
-        gr.Info("Both mask and segmentation prompt are provided. Please provide only "
-                "one.")
         return None, None
     width, height = resize_image_dimensions(original_resolution_wh=image.size)
-    image = image.resize((width, height), Image.LANCZOS)
-    if segmentation_prompt_text:
-        _, result = run_florence_inference(
-            model=FLORENCE_MODEL,
-            processor=FLORENCE_PROCESSOR,
-            device=DEVICE,
-            image=image,
-            task=FLORENCE_OPEN_VOCABULARY_DETECTION_TASK,
-            text=segmentation_prompt_text
-        )
-        detections = sv.Detections.from_lmm(
-            lmm=sv.LMM.FLORENCE_2,
-            result=result,
-            resolution_wh=image.size
-        )
-        detections = run_sam_inference(SAM_IMAGE_MODEL, image, detections)
-        if len(detections) == 0:
-            gr.Info(f"{segmentation_prompt_text} prompt did not return any detections.")
-            return None, None
-        mask = Image.fromarray((detections.mask[0].astype(np.uint8)) * 255)
-    mask = mask.resize((width, height), Image.LANCZOS)
-    mask = mask.filter(ImageFilter.GaussianBlur(radius=10))
     if randomize_seed_checkbox:
         seed_slicer = random.randint(0, MAX_SEED)
     generator = torch.Generator().manual_seed(seed_slicer)
-    result = FLUX_INPAINTING_PIPELINE(
-        prompt=inpainting_prompt_text,
-        image=image,
-        mask_image=mask,
         width=width,
         height=height,
         strength=strength_slider,
@@ -131,7 +136,7 @@ def process(
         num_inference_steps=num_inference_steps_slider
     ).images[0]
     print('INFERENCE DONE')
-    return result, mask
 with gr.Blocks() as demo:
@@ -147,24 +152,17 @@ with gr.Blocks() as demo:
                 brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed"))
             with gr.Row():
-                inpainting_prompt_text_component = gr.Text(
                     label="Prompt",
                     show_label=False,
                     max_lines=1,
-                    placeholder="Enter inpainting prompt",
                     container=False,
                 )
                 submit_button_component = gr.Button(
                     value='Submit', variant='primary', scale=0)
             with gr.Accordion("Advanced Settings", open=False):
-                segmentation_prompt_text_component = gr.Text(
-                    label="Prompt",
-                    show_label=False,
-                    max_lines=1,
-                    placeholder="Enter segmentation prompt",
-                    container=False,
-                )
                 seed_slicer_component = gr.Slider(
                     label="Seed",
                     minimum=0,
@@ -203,13 +201,31 @@ with gr.Blocks() as demo:
             with gr.Accordion("Debug", open=False):
                 output_mask_component = gr.Image(
                     type='pil', image_mode='RGB', label='Input mask', format="png")
     submit_button_component.click(
         fn=process,
         inputs=[
             input_image_editor_component,
-            inpainting_prompt_text_component,
-            segmentation_prompt_text_component,
             seed_slicer_component,
             randomize_seed_checkbox_component,
             strength_slider_component,

 from typing import Tuple
+import requests
 import random
 import numpy as np
 import gradio as gr
 import spaces
 import torch
+from PIL import Image
 from diffusers import FluxInpaintPipeline
 MARKDOWN = """
 # FLUX.1 Inpainting 🔥
 MAX_SEED = np.iinfo(np.int32).max
 IMAGE_SIZE = 1024
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+def remove_background(image: Image.Image, threshold: int = 50) -> Image.Image:
+    image = image.convert("RGBA")
+    data = image.getdata()
+    new_data = []
+    for item in data:
+        avg = sum(item[:3]) / 3
+        if avg < threshold:
+            new_data.append((0, 0, 0, 0))
+        else:
+            new_data.append(item)
+    image.putdata(new_data)
+    return image
+EXAMPLES = [
+    [
+        {
+            "background": Image.open(requests.get("https://media.roboflow.com/spaces/doge-2-image.png", stream=True).raw),
+            "layers": [remove_background(Image.open(requests.get("https://media.roboflow.com/spaces/doge-2-mask-2.png", stream=True).raw))],
+            "composite": Image.open(requests.get("https://media.roboflow.com/spaces/doge-2-composite-2.png", stream=True).raw),
+        },
+        "little lion",
+        42,
+        False,
+        0.85,
+        30
+    ],
+    [
+        {
+            "background": Image.open(requests.get("https://media.roboflow.com/spaces/doge-2-image.png", stream=True).raw),
+            "layers": [remove_background(Image.open(requests.get("https://media.roboflow.com/spaces/doge-2-mask-3.png", stream=True).raw))],
+            "composite": Image.open(requests.get("https://media.roboflow.com/spaces/doge-2-composite-3.png", stream=True).raw),
+        },
+        "tattoos",
+        42,
+        False,
+        0.85,
+        30
+    ]
+]
+pipe = FluxInpaintPipeline.from_pretrained(
     "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16).to(DEVICE)
 ) -> Tuple[int, int]:
     width, height = original_resolution_wh
+    # if width <= maximum_dimension and height <= maximum_dimension:
+    #     width = width - (width % 32)
+    #     height = height - (height % 32)
+    #     return width, height
     if width > height:
         scaling_factor = maximum_dimension / width
     else:
     return new_width, new_height
+@spaces.GPU(duration=100)
 def process(
     input_image_editor: dict,
+    input_text: str,
     seed_slicer: int,
     randomize_seed_checkbox: bool,
     strength_slider: float,
     num_inference_steps_slider: int,
     progress=gr.Progress(track_tqdm=True)
 ):
+    if not input_text:
         gr.Info("Please enter a text prompt.")
         return None, None
         gr.Info("Please upload an image.")
         return None, None
+    if not mask:
+        gr.Info("Please draw a mask on the image.")
         return None, None
     width, height = resize_image_dimensions(original_resolution_wh=image.size)
+    resized_image = image.resize((width, height), Image.LANCZOS)
+    resized_mask = mask.resize((width, height), Image.LANCZOS)
     if randomize_seed_checkbox:
         seed_slicer = random.randint(0, MAX_SEED)
     generator = torch.Generator().manual_seed(seed_slicer)
+    result = pipe(
+        prompt=input_text,
+        image=resized_image,
+        mask_image=resized_mask,
         width=width,
         height=height,
         strength=strength_slider,
         num_inference_steps=num_inference_steps_slider
     ).images[0]
     print('INFERENCE DONE')
+    return result, resized_mask
 with gr.Blocks() as demo:
                 brush=gr.Brush(colors=["#FFFFFF"], color_mode="fixed"))
             with gr.Row():
+                input_text_component = gr.Text(
                     label="Prompt",
                     show_label=False,
                     max_lines=1,
+                    placeholder="Enter your prompt",
                     container=False,
                 )
                 submit_button_component = gr.Button(
                     value='Submit', variant='primary', scale=0)
             with gr.Accordion("Advanced Settings", open=False):
                 seed_slicer_component = gr.Slider(
                     label="Seed",
                     minimum=0,
             with gr.Accordion("Debug", open=False):
                 output_mask_component = gr.Image(
                     type='pil', image_mode='RGB', label='Input mask', format="png")
+    with gr.Row():
+        gr.Examples(
+            fn=process,
+            examples=EXAMPLES,
+            inputs=[
+                input_image_editor_component,
+                input_text_component,
+                seed_slicer_component,
+                randomize_seed_checkbox_component,
+                strength_slider_component,
+                num_inference_steps_slider_component
+            ],
+            outputs=[
+                output_image_component,
+                output_mask_component
+            ],
+            run_on_click=True,
+            cache_examples=True
+        )
     submit_button_component.click(
         fn=process,
         inputs=[
             input_image_editor_component,
+            input_text_component,
             seed_slicer_component,
             randomize_seed_checkbox_component,
             strength_slider_component,

configs/__init__.py CHANGED Viewed

@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-# This source code is licensed under the license found in the
-# LICENSE file in the root directory of this source tree.

configs/sam2_hiera_b+.yaml DELETED Viewed

@@ -1,113 +0,0 @@
-# @package _global_
-# Model
-model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
-  image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
-    scalp: 1
-    trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
-      embed_dim: 112
-      num_heads: 2
-    neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 256
-        normalize: true
-        scale: null
-        temperature: 10000
-      d_model: 256
-      backbone_channel_list: [896, 448, 224, 112]
-      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
-      fpn_interp_model: nearest
-  memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
-    d_model: 256
-    pos_enc_at_input: true
-    layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
-      activation: relu
-      dim_feedforward: 2048
-      dropout: 0.1
-      pos_enc_at_attn: false
-      self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-      d_model: 256
-      pos_enc_at_cross_attn_keys: true
-      pos_enc_at_cross_attn_queries: false
-      cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        rope_k_repeat: True
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-        kv_in_dim: 64
-    num_layers: 4
-  memory_encoder:
-      _target_: sam2.modeling.memory_encoder.MemoryEncoder
-      out_dim: 64
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 64
-        normalize: true
-        scale: null
-        temperature: 10000
-      mask_downsampler:
-        _target_: sam2.modeling.memory_encoder.MaskDownSampler
-        kernel_size: 3
-        stride: 2
-        padding: 1
-      fuser:
-        _target_: sam2.modeling.memory_encoder.Fuser
-        layer:
-          _target_: sam2.modeling.memory_encoder.CXBlock
-          dim: 256
-          kernel_size: 7
-          padding: 3
-          layer_scale_init_value: 1e-6
-          use_dwconv: True  # depth-wise convs
-        num_layers: 2
-  num_maskmem: 7
-  image_size: 1024
-  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
-  sigmoid_scale_for_mem_enc: 20.0
-  sigmoid_bias_for_mem_enc: -10.0
-  use_mask_input_as_output_without_sam: true
-  # Memory
-  directly_add_no_mem_embed: true
-  # use high-resolution feature map in the SAM mask decoder
-  use_high_res_features_in_sam: true
-  # output 3 masks on the first click on initial conditioning frames
-  multimask_output_in_sam: true
-  # SAM heads
-  iou_prediction_use_sigmoid: True
-  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
-  use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
-  only_obj_ptrs_in_the_past_for_eval: true
-  # object occlusion prediction
-  pred_obj_scores: true
-  pred_obj_scores_mlp: true
-  fixed_no_obj_ptr: true
-  # multimask tracking settings
-  multimask_output_for_tracking: true
-  use_multimask_token_for_obj_ptr: true
-  multimask_min_pt_num: 0
-  multimask_max_pt_num: 1
-  use_mlp_for_obj_ptr_proj: true
-  # Compilation flag
-  compile_image_encoder: False

configs/sam2_hiera_l.yaml DELETED Viewed

@@ -1,117 +0,0 @@
-# @package _global_
-# Model
-model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
-  image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
-    scalp: 1
-    trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
-      embed_dim: 144
-      num_heads: 2
-      stages: [2, 6, 36, 4]
-      global_att_blocks: [23, 33, 43]
-      window_pos_embed_bkg_spatial_size: [7, 7]
-      window_spec: [8, 4, 16, 8]
-    neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 256
-        normalize: true
-        scale: null
-        temperature: 10000
-      d_model: 256
-      backbone_channel_list: [1152, 576, 288, 144]
-      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
-      fpn_interp_model: nearest
-  memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
-    d_model: 256
-    pos_enc_at_input: true
-    layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
-      activation: relu
-      dim_feedforward: 2048
-      dropout: 0.1
-      pos_enc_at_attn: false
-      self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-      d_model: 256
-      pos_enc_at_cross_attn_keys: true
-      pos_enc_at_cross_attn_queries: false
-      cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        rope_k_repeat: True
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-        kv_in_dim: 64
-    num_layers: 4
-  memory_encoder:
-      _target_: sam2.modeling.memory_encoder.MemoryEncoder
-      out_dim: 64
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 64
-        normalize: true
-        scale: null
-        temperature: 10000
-      mask_downsampler:
-        _target_: sam2.modeling.memory_encoder.MaskDownSampler
-        kernel_size: 3
-        stride: 2
-        padding: 1
-      fuser:
-        _target_: sam2.modeling.memory_encoder.Fuser
-        layer:
-          _target_: sam2.modeling.memory_encoder.CXBlock
-          dim: 256
-          kernel_size: 7
-          padding: 3
-          layer_scale_init_value: 1e-6
-          use_dwconv: True  # depth-wise convs
-        num_layers: 2
-  num_maskmem: 7
-  image_size: 1024
-  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
-  sigmoid_scale_for_mem_enc: 20.0
-  sigmoid_bias_for_mem_enc: -10.0
-  use_mask_input_as_output_without_sam: true
-  # Memory
-  directly_add_no_mem_embed: true
-  # use high-resolution feature map in the SAM mask decoder
-  use_high_res_features_in_sam: true
-  # output 3 masks on the first click on initial conditioning frames
-  multimask_output_in_sam: true
-  # SAM heads
-  iou_prediction_use_sigmoid: True
-  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
-  use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
-  only_obj_ptrs_in_the_past_for_eval: true
-  # object occlusion prediction
-  pred_obj_scores: true
-  pred_obj_scores_mlp: true
-  fixed_no_obj_ptr: true
-  # multimask tracking settings
-  multimask_output_for_tracking: true
-  use_multimask_token_for_obj_ptr: true
-  multimask_min_pt_num: 0
-  multimask_max_pt_num: 1
-  use_mlp_for_obj_ptr_proj: true
-  # Compilation flag
-  compile_image_encoder: False

configs/sam2_hiera_s.yaml DELETED Viewed

@@ -1,116 +0,0 @@
-# @package _global_
-# Model
-model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
-  image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
-    scalp: 1
-    trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
-      embed_dim: 96
-      num_heads: 1
-      stages: [1, 2, 11, 2]
-      global_att_blocks: [7, 10, 13]
-      window_pos_embed_bkg_spatial_size: [7, 7]
-    neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 256
-        normalize: true
-        scale: null
-        temperature: 10000
-      d_model: 256
-      backbone_channel_list: [768, 384, 192, 96]
-      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
-      fpn_interp_model: nearest
-  memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
-    d_model: 256
-    pos_enc_at_input: true
-    layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
-      activation: relu
-      dim_feedforward: 2048
-      dropout: 0.1
-      pos_enc_at_attn: false
-      self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-      d_model: 256
-      pos_enc_at_cross_attn_keys: true
-      pos_enc_at_cross_attn_queries: false
-      cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        rope_k_repeat: True
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-        kv_in_dim: 64
-    num_layers: 4
-  memory_encoder:
-      _target_: sam2.modeling.memory_encoder.MemoryEncoder
-      out_dim: 64
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 64
-        normalize: true
-        scale: null
-        temperature: 10000
-      mask_downsampler:
-        _target_: sam2.modeling.memory_encoder.MaskDownSampler
-        kernel_size: 3
-        stride: 2
-        padding: 1
-      fuser:
-        _target_: sam2.modeling.memory_encoder.Fuser
-        layer:
-          _target_: sam2.modeling.memory_encoder.CXBlock
-          dim: 256
-          kernel_size: 7
-          padding: 3
-          layer_scale_init_value: 1e-6
-          use_dwconv: True  # depth-wise convs
-        num_layers: 2
-  num_maskmem: 7
-  image_size: 1024
-  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
-  sigmoid_scale_for_mem_enc: 20.0
-  sigmoid_bias_for_mem_enc: -10.0
-  use_mask_input_as_output_without_sam: true
-  # Memory
-  directly_add_no_mem_embed: true
-  # use high-resolution feature map in the SAM mask decoder
-  use_high_res_features_in_sam: true
-  # output 3 masks on the first click on initial conditioning frames
-  multimask_output_in_sam: true
-  # SAM heads
-  iou_prediction_use_sigmoid: True
-  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
-  use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
-  only_obj_ptrs_in_the_past_for_eval: true
-  # object occlusion prediction
-  pred_obj_scores: true
-  pred_obj_scores_mlp: true
-  fixed_no_obj_ptr: true
-  # multimask tracking settings
-  multimask_output_for_tracking: true
-  use_multimask_token_for_obj_ptr: true
-  multimask_min_pt_num: 0
-  multimask_max_pt_num: 1
-  use_mlp_for_obj_ptr_proj: true
-  # Compilation flag
-  compile_image_encoder: False

configs/sam2_hiera_t.yaml DELETED Viewed

@@ -1,118 +0,0 @@
-# @package _global_
-# Model
-model:
-  _target_: sam2.modeling.sam2_base.SAM2Base
-  image_encoder:
-    _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
-    scalp: 1
-    trunk:
-      _target_: sam2.modeling.backbones.hieradet.Hiera
-      embed_dim: 96
-      num_heads: 1
-      stages: [1, 2, 7, 2]
-      global_att_blocks: [5, 7, 9]
-      window_pos_embed_bkg_spatial_size: [7, 7]
-    neck:
-      _target_: sam2.modeling.backbones.image_encoder.FpnNeck
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 256
-        normalize: true
-        scale: null
-        temperature: 10000
-      d_model: 256
-      backbone_channel_list: [768, 384, 192, 96]
-      fpn_top_down_levels: [2, 3]  # output level 0 and 1 directly use the backbone features
-      fpn_interp_model: nearest
-  memory_attention:
-    _target_: sam2.modeling.memory_attention.MemoryAttention
-    d_model: 256
-    pos_enc_at_input: true
-    layer:
-      _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
-      activation: relu
-      dim_feedforward: 2048
-      dropout: 0.1
-      pos_enc_at_attn: false
-      self_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-      d_model: 256
-      pos_enc_at_cross_attn_keys: true
-      pos_enc_at_cross_attn_queries: false
-      cross_attention:
-        _target_: sam2.modeling.sam.transformer.RoPEAttention
-        rope_theta: 10000.0
-        feat_sizes: [32, 32]
-        rope_k_repeat: True
-        embedding_dim: 256
-        num_heads: 1
-        downsample_rate: 1
-        dropout: 0.1
-        kv_in_dim: 64
-    num_layers: 4
-  memory_encoder:
-      _target_: sam2.modeling.memory_encoder.MemoryEncoder
-      out_dim: 64
-      position_encoding:
-        _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
-        num_pos_feats: 64
-        normalize: true
-        scale: null
-        temperature: 10000
-      mask_downsampler:
-        _target_: sam2.modeling.memory_encoder.MaskDownSampler
-        kernel_size: 3
-        stride: 2
-        padding: 1
-      fuser:
-        _target_: sam2.modeling.memory_encoder.Fuser
-        layer:
-          _target_: sam2.modeling.memory_encoder.CXBlock
-          dim: 256
-          kernel_size: 7
-          padding: 3
-          layer_scale_init_value: 1e-6
-          use_dwconv: True  # depth-wise convs
-        num_layers: 2
-  num_maskmem: 7
-  image_size: 1024
-  # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
-  # SAM decoder
-  sigmoid_scale_for_mem_enc: 20.0
-  sigmoid_bias_for_mem_enc: -10.0
-  use_mask_input_as_output_without_sam: true
-  # Memory
-  directly_add_no_mem_embed: true
-  # use high-resolution feature map in the SAM mask decoder
-  use_high_res_features_in_sam: true
-  # output 3 masks on the first click on initial conditioning frames
-  multimask_output_in_sam: true
-  # SAM heads
-  iou_prediction_use_sigmoid: True
-  # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
-  use_obj_ptrs_in_encoder: true
-  add_tpos_enc_to_obj_ptrs: false
-  only_obj_ptrs_in_the_past_for_eval: true
-  # object occlusion prediction
-  pred_obj_scores: true
-  pred_obj_scores_mlp: true
-  fixed_no_obj_ptr: true
-  # multimask tracking settings
-  multimask_output_for_tracking: true
-  use_multimask_token_for_obj_ptr: true
-  multimask_min_pt_num: 0
-  multimask_max_pt_num: 1
-  use_mlp_for_obj_ptr_proj: true
-  # Compilation flag
-  # HieraT does not currently support compilation, should always be set to False
-  compile_image_encoder: False

requirements.txt CHANGED Viewed

@@ -1,13 +1,6 @@
-tqdm
-einops
-timm
-samv2
-opencv-python
-pytest
 gradio
 spaces
 accelerate
 transformers==4.42.4
 sentencepiece
-supervision
-git+https://github.com/Gothos/diffusers.git@flux-inpaint

 gradio
 spaces
 accelerate
 transformers==4.42.4
 sentencepiece
+git+https://github.com/Gothos/diffusers.git@flux-inpaint

utils/florence.py CHANGED Viewed

@@ -1,54 +0,0 @@
-import os
-from typing import Union, Any, Tuple, Dict
-from unittest.mock import patch
-import torch
-from PIL import Image
-from transformers import AutoModelForCausalLM, AutoProcessor
-from transformers.dynamic_module_utils import get_imports
-FLORENCE_CHECKPOINT = "microsoft/Florence-2-base"
-FLORENCE_OPEN_VOCABULARY_DETECTION_TASK = '<OPEN_VOCABULARY_DETECTION>'
-def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
-    """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
-    if not str(filename).endswith("/modeling_florence2.py"):
-        return get_imports(filename)
-    imports = get_imports(filename)
-    imports.remove("flash_attn")
-    return imports
-def load_florence_model(
-    device: torch.device, checkpoint: str = FLORENCE_CHECKPOINT
-) -> Tuple[Any, Any]:
-    with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
-        model = AutoModelForCausalLM.from_pretrained(
-            checkpoint, trust_remote_code=True).to(device).eval()
-        processor = AutoProcessor.from_pretrained(
-            checkpoint, trust_remote_code=True)
-        return model, processor
-def run_florence_inference(
-    model: Any,
-    processor: Any,
-    device: torch.device,
-    image: Image,
-    task: str,
-    text: str = ""
-) -> Tuple[str, Dict]:
-    prompt = task + text
-    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
-    generated_ids = model.generate(
-        input_ids=inputs["input_ids"],
-        pixel_values=inputs["pixel_values"],
-        max_new_tokens=1024,
-        num_beams=3
-    )
-    generated_text = processor.batch_decode(
-        generated_ids, skip_special_tokens=False)[0]
-    response = processor.post_process_generation(
-        generated_text, task=task, image_size=image.size)
-    return generated_text, response

utils/sam.py CHANGED Viewed

@@ -1,45 +0,0 @@
-from typing import Any
-import numpy as np
-import supervision as sv
-import torch
-from PIL import Image
-from sam2.build_sam import build_sam2, build_sam2_video_predictor
-from sam2.sam2_image_predictor import SAM2ImagePredictor
-SAM_CHECKPOINT = "checkpoints/sam2_hiera_small.pt"
-SAM_CONFIG = "sam2_hiera_s.yaml"
-def load_sam_image_model(
-    device: torch.device,
-    config: str = SAM_CONFIG,
-    checkpoint: str = SAM_CHECKPOINT
-) -> SAM2ImagePredictor:
-    model = build_sam2(config, checkpoint, device=device)
-    return SAM2ImagePredictor(sam_model=model)
-def load_sam_video_model(
-    device: torch.device,
-    config: str = SAM_CONFIG,
-    checkpoint: str = SAM_CHECKPOINT
-) -> Any:
-    return build_sam2_video_predictor(config, checkpoint, device=device)
-def run_sam_inference(
-    model: Any,
-    image: Image,
-    detections: sv.Detections
-) -> sv.Detections:
-    image = np.array(image.convert("RGB"))
-    model.set_image(image)
-    mask, score, _ = model.predict(box=detections.xyxy, multimask_output=False)
-    # dirty fix; remove this later
-    if len(mask.shape) == 4:
-        mask = np.squeeze(mask)
-    detections.mask = mask.astype(bool)
-    return detections