bug fix
- app.py +15 -11
- requirements.txt +1 -1
- utils/__pycache__/florence.cpython-310.pyc +0 -0
- utils/florence.py +5 -2
app.py
CHANGED
@@ -1,5 +1,5 @@
 from typing import Optional
-
+import numpy as np
 import gradio as gr
 import spaces
 import supervision as sv
@@ -8,6 +8,7 @@ from PIL import Image
 from io import BytesIO
 import PIL.Image
 import requests
+import cv2
 
 from utils.florence import load_florence_model, run_florence_inference, \
     FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
@@ -29,17 +30,14 @@ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
 @spaces.GPU(duration=20)
 @torch.inference_mode()
 @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
-def process_image(image_input, image_url, task_prompt, text_input) -> Optional[Image.Image]:
+def process_image(image_input, image_url, task_prompt, text_prompt=None, dilate=0) -> Optional[Image.Image]:
     if not image_input:
         gr.Info("Please upload an image.")
         return None
     if not task_prompt:
         gr.Info("Please enter a task prompt.")
         return None
-    if not text_input:
-        gr.Info("Please enter a text prompt.")
-        return None
-
+
     if image_url:
         print("start to fetch image from url", image_url)
         response = requests.get(image_url)
@@ -52,8 +50,8 @@ def process_image(image_input, image_url, task_prompt, text_input) -> Optional[I
         processor=FLORENCE_PROCESSOR,
         device=DEVICE,
         image=image_input,
-        task=
-        text=
+        task=task_prompt,
+        text=text_prompt
     )
     detections = sv.Detections.from_lmm(
         lmm=sv.LMM.FLORENCE_2,
@@ -66,9 +64,14 @@ def process_image(image_input, image_url, task_prompt, text_input) -> Optional[I
         return None
     images = []
     print("mask generated:", len(detections.mask))
+    kernel_size = dilate
+    kernel = np.ones((kernel_size, kernel_size), np.uint8)
+
     for i in range(len(detections.mask)):
-
-
+        mask = detections.mask[i].astype(np.uint8) * 255
+        if dilate > 0:
+            mask = cv2.dilate(mask, kernel, iterations=1)
+        images.append(mask)
     return images
 
 
@@ -80,6 +83,7 @@ with gr.Blocks() as demo:
         task_prompt = gr.Dropdown(
             ["<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>", "<CAPTION_TO_PHRASE_GROUNDING>", "<OPEN_VOCABULARY_DETECTION>", "<DENSE_REGION_CAPTION>"], value="<CAPTION_TO_PHRASE_GROUNDING>", label="Task Prompt", info="task prompts"
         )
+        dilate = gr.Slider(label="dilate mask", minimum=0, maximum=50, value=10, step=1)
         text_prompt = gr.Textbox(label='Text prompt', placeholder='Enter text prompts')
         submit_button = gr.Button(value='Submit', variant='primary')
     with gr.Column():
@@ -87,7 +91,7 @@ with gr.Blocks() as demo:
     print(image, image_url, task_prompt, text_prompt, image_gallery)
     submit_button.click(
         fn = process_image,
-        inputs = [image, image_url, task_prompt, text_prompt],
+        inputs = [image, image_url, task_prompt, text_prompt, dilate],
         outputs = [image_gallery,],
         show_api=False
     )
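For reference, a minimal self-contained sketch of the mask post-processing this commit adds inside process_image: the boolean mask from supervision is turned into a 0/255 uint8 image and, when the new "dilate mask" slider is non-zero, grown with cv2.dilate using a square np.ones kernel (a k x k kernel grows the mask by roughly k // 2 pixels per side). The helper name dilate_mask and the demo array below are illustrative only and not part of the Space's code.

import numpy as np
import cv2

def dilate_mask(mask: np.ndarray, dilate: int = 10) -> np.ndarray:
    # OpenCV expects an 8-bit image, so convert the boolean mask to 0/255.
    out = mask.astype(np.uint8) * 255
    if dilate > 0:
        # Same kernel construction as the commit: np.ones((dilate, dilate), np.uint8).
        kernel = np.ones((dilate, dilate), np.uint8)
        out = cv2.dilate(out, kernel, iterations=1)
    return out

# Tiny demo: a 16x16 square grows once dilated.
demo = np.zeros((64, 64), dtype=bool)
demo[24:40, 24:40] = True
assert dilate_mask(demo, 10).sum() > dilate_mask(demo, 0).sum()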
requirements.txt
CHANGED
@@ -8,4 +8,4 @@ gradio
 supervision
 opencv-python
 pytest
-requests
+requests
utils/__pycache__/florence.cpython-310.pyc
CHANGED
Binary files a/utils/__pycache__/florence.cpython-310.pyc and b/utils/__pycache__/florence.cpython-310.pyc differ
utils/florence.py
CHANGED
@@ -42,9 +42,12 @@ def run_florence_inference(
     device: torch.device,
     image: Image,
     task: str,
-    text: str =
+    text: str = None
 ) -> Tuple[str, Dict]:
-    prompt = task + text
+    if text:
+        prompt = task + text
+    else:
+        prompt = task
     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
     generated_ids = model.generate(
         input_ids=inputs["input_ids"],
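The utils/florence.py change is what makes the text prompt optional: caption-style tasks such as <CAPTION> can now run with no text prompt, while grounding tasks still get the text appended to the task token. A minimal sketch of that prompt handling, using a hypothetical build_prompt helper purely for illustration:

from typing import Optional

def build_prompt(task: str, text: Optional[str] = None) -> str:
    # Mirrors the branch added to run_florence_inference: append the text
    # prompt only when one was provided.
    if text:
        return task + text
    return task

print(build_prompt("<CAPTION>"))                               # <CAPTION>
print(build_prompt("<CAPTION_TO_PHRASE_GROUNDING>", "a dog"))  # <CAPTION_TO_PHRASE_GROUNDING>a dog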