jiuface committed on
Commit
f0d9f07
1 Parent(s): caa3c61
app.py CHANGED
@@ -1,5 +1,5 @@
1
  from typing import Optional
2
-
3
  import gradio as gr
4
  import spaces
5
  import supervision as sv
@@ -8,6 +8,7 @@ from PIL import Image
8
  from io import BytesIO
9
  import PIL.Image
10
  import requests
 
11
 
12
  from utils.florence import load_florence_model, run_florence_inference, \
13
  FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
@@ -29,17 +30,14 @@ SAM_IMAGE_MODEL = load_sam_image_model(device=DEVICE)
29
  @spaces.GPU(duration=20)
30
  @torch.inference_mode()
31
  @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
32
- def process_image(image_input, image_url, task_prompt, text_input) -> Optional[Image.Image]:
33
  if not image_input:
34
  gr.Info("Please upload an image.")
35
  return None
36
  if not task_prompt:
37
  gr.Info("Please enter a task prompt.")
38
  return None
39
- if not text_input:
40
- gr.Info("Please enter a text prompt.")
41
- return None
42
-
43
  if image_url:
44
  print("start to fetch image from url", image_url)
45
  response = requests.get(image_url)
@@ -52,8 +50,8 @@ def process_image(image_input, image_url, task_prompt, text_input) -> Optional[I
52
  processor=FLORENCE_PROCESSOR,
53
  device=DEVICE,
54
  image=image_input,
55
- task=text_input,
56
- text=prompt
57
  )
58
  detections = sv.Detections.from_lmm(
59
  lmm=sv.LMM.FLORENCE_2,
@@ -66,9 +64,14 @@ def process_image(image_input, image_url, task_prompt, text_input) -> Optional[I
66
  return None
67
  images = []
68
  print("mask generated:", len(detections.mask))
 
 
 
69
  for i in range(len(detections.mask)):
70
- img = Image.fromarray(detections.mask[i].astype(np.uint8) * 255)
71
- images.append(img)
 
 
72
  return images
73
 
74
 
@@ -80,6 +83,7 @@ with gr.Blocks() as demo:
80
  task_prompt = gr.Dropdown(
81
  ["<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>", "<CAPTION_TO_PHRASE_GROUNDING>", "<OPEN_VOCABULARY_DETECTION>", "<DENSE_REGION_CAPTION>"], value="<CAPTION_TO_PHRASE_GROUNDING>", label="Task Prompt", info="task prompts"
82
  )
 
83
  text_prompt = gr.Textbox(label='Text prompt', placeholder='Enter text prompts')
84
  submit_button = gr.Button(value='Submit', variant='primary')
85
  with gr.Column():
@@ -87,7 +91,7 @@ with gr.Blocks() as demo:
87
  print(image, image_url, task_prompt, text_prompt, image_gallery)
88
  submit_button.click(
89
  fn = process_image,
90
- inputs = [image, image_url, task_prompt, text_prompt],
91
  outputs = [image_gallery,],
92
  show_api=False
93
  )
 
1
  from typing import Optional
2
+ import numpy as np
3
  import gradio as gr
4
  import spaces
5
  import supervision as sv
 
8
  from io import BytesIO
9
  import PIL.Image
10
  import requests
11
+ import cv2
12
 
13
  from utils.florence import load_florence_model, run_florence_inference, \
14
  FLORENCE_OPEN_VOCABULARY_DETECTION_TASK
 
30
  @spaces.GPU(duration=20)
31
  @torch.inference_mode()
32
  @torch.autocast(device_type="cuda", dtype=torch.bfloat16)
33
+ def process_image(image_input, image_url, task_prompt, text_prompt=None, dilate=0) -> Optional[Image.Image]:
34
  if not image_input:
35
  gr.Info("Please upload an image.")
36
  return None
37
  if not task_prompt:
38
  gr.Info("Please enter a task prompt.")
39
  return None
40
+
 
 
 
41
  if image_url:
42
  print("start to fetch image from url", image_url)
43
  response = requests.get(image_url)
 
50
  processor=FLORENCE_PROCESSOR,
51
  device=DEVICE,
52
  image=image_input,
53
+ task=task_prompt,
54
+ text=text_prompt
55
  )
56
  detections = sv.Detections.from_lmm(
57
  lmm=sv.LMM.FLORENCE_2,
 
64
  return None
65
  images = []
66
  print("mask generated:", len(detections.mask))
67
+ kernel_size = dilate
68
+ kernel = np.ones((kernel_size, kernel_size), np.uint8)
69
+
70
  for i in range(len(detections.mask)):
71
+ mask = detections.mask[i].astype(np.uint8) * 255
72
+ if dilate > 0:
73
+ mask = cv2.dilate(mask, kernel, iterations=1)
74
+ images.append(mask)
75
  return images
76
 
77
 
 
83
  task_prompt = gr.Dropdown(
84
  ["<CAPTION>", "<DETAILED_CAPTION>", "<MORE_DETAILED_CAPTION>", "<CAPTION_TO_PHRASE_GROUNDING>", "<OPEN_VOCABULARY_DETECTION>", "<DENSE_REGION_CAPTION>"], value="<CAPTION_TO_PHRASE_GROUNDING>", label="Task Prompt", info="task prompts"
85
  )
86
+ dilate = gr.Slider(label="dilate mask", minimum=0, maximum=50, value=10, step=1)
87
  text_prompt = gr.Textbox(label='Text prompt', placeholder='Enter text prompts')
88
  submit_button = gr.Button(value='Submit', variant='primary')
89
  with gr.Column():
 
91
  print(image, image_url, task_prompt, text_prompt, image_gallery)
92
  submit_button.click(
93
  fn = process_image,
94
+ inputs = [image, image_url, task_prompt, text_prompt, dilate],
95
  outputs = [image_gallery,],
96
  show_api=False
97
  )
requirements.txt CHANGED
@@ -8,4 +8,4 @@ gradio
8
  supervision
9
  opencv-python
10
  pytest
11
- requests
 
8
  supervision
9
  opencv-python
10
  pytest
11
+ requests
utils/__pycache__/florence.cpython-310.pyc CHANGED
Binary files a/utils/__pycache__/florence.cpython-310.pyc and b/utils/__pycache__/florence.cpython-310.pyc differ
 
utils/florence.py CHANGED
@@ -42,9 +42,12 @@ def run_florence_inference(
42
  device: torch.device,
43
  image: Image,
44
  task: str,
45
- text: str = ""
46
  ) -> Tuple[str, Dict]:
47
- prompt = task + text
 
 
 
48
  inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
49
  generated_ids = model.generate(
50
  input_ids=inputs["input_ids"],
 
42
  device: torch.device,
43
  image: Image,
44
  task: str,
45
+ text: str = None
46
  ) -> Tuple[str, Dict]:
47
+ if text:
48
+ prompt = task + text
49
+ else:
50
+ prompt = task
51
  inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
52
  generated_ids = model.generate(
53
  input_ids=inputs["input_ids"],