Spaces: Running on Zero
more captioning tasks
Browse files
- app.py +12 -11
- utils/tasks.py +18 -4
app.py
CHANGED
@@ -5,7 +5,8 @@ import spaces
|
|
5 |
|
6 |
from utils.annotate import annotate_with_boxes
|
7 |
from utils.models import load_models, run_inference, CHECKPOINTS
|
8 |
-
from utils.tasks import TASK_NAMES, TASKS
|
|
|
9 |
|
10 |
MARKDOWN = """
|
11 |
# Better Florence-2 Playground 🔥
|
@@ -25,12 +26,12 @@ MARKDOWN = """
|
|
25 |
</div>
|
26 |
"""
|
27 |
|
28 |
-
OBJECT_DETECTION_EXAMPLES = [
|
29 |
-
|
30 |
-
]
|
31 |
-
CAPTION_EXAMPLES = [
|
32 |
-
|
33 |
-
]
|
34 |
|
35 |
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
36 |
MODELS, PROCESSORS = load_models(DEVICE)
|
@@ -41,13 +42,13 @@ def process(checkpoint_dropdown, task_dropdown, image_input):
|
|
41 |
model = MODELS[checkpoint_dropdown]
|
42 |
processor = PROCESSORS[checkpoint_dropdown]
|
43 |
task = TASKS[task_dropdown]
|
44 |
-
if task_dropdown ==
|
45 |
_, response = run_inference(
|
46 |
model, processor, DEVICE, image_input, task)
|
47 |
detections = sv.Detections.from_lmm(
|
48 |
lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
|
49 |
return annotate_with_boxes(image_input, detections)
|
50 |
-
elif task_dropdown
|
51 |
_, response = run_inference(
|
52 |
model, processor, DEVICE, image_input, task)
|
53 |
return response[task]
|
@@ -73,7 +74,7 @@ with gr.Blocks() as demo:
|
|
73 |
with gr.Column():
|
74 |
@gr.render(inputs=task_dropdown_component)
|
75 |
def show_output(text):
|
76 |
-
if text ==
|
77 |
image_output_component = gr.Image(type='pil', label='Image Output')
|
78 |
submit_button_component.click(
|
79 |
fn=process,
|
@@ -84,7 +85,7 @@ with gr.Blocks() as demo:
|
|
84 |
],
|
85 |
outputs=image_output_component
|
86 |
)
|
87 |
-
elif text
|
88 |
text_output_component = gr.Textbox(label='Caption Output')
|
89 |
submit_button_component.click(
|
90 |
fn=process,
|
|
|
5 |
|
6 |
from utils.annotate import annotate_with_boxes
|
7 |
from utils.models import load_models, run_inference, CHECKPOINTS
|
8 |
+
from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
|
9 |
+
CAPTION_TASK_NAMES
|
10 |
|
11 |
MARKDOWN = """
|
12 |
# Better Florence-2 Playground 🔥
|
|
|
26 |
</div>
|
27 |
"""
|
28 |
|
29 |
+
# OBJECT_DETECTION_EXAMPLES = [
|
30 |
+
# ["microsoft/Florence-2-large-ft", "Object Detection", "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
|
31 |
+
# ]
|
32 |
+
# CAPTION_EXAMPLES = [
|
33 |
+
# ["microsoft/Florence-2-large-ft", "Caption", "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
|
34 |
+
# ]
|
35 |
|
36 |
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
37 |
MODELS, PROCESSORS = load_models(DEVICE)
|
|
|
42 |
model = MODELS[checkpoint_dropdown]
|
43 |
processor = PROCESSORS[checkpoint_dropdown]
|
44 |
task = TASKS[task_dropdown]
|
45 |
+
if task_dropdown == OBJECT_DETECTION_TASK_NAME:
|
46 |
_, response = run_inference(
|
47 |
model, processor, DEVICE, image_input, task)
|
48 |
detections = sv.Detections.from_lmm(
|
49 |
lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
|
50 |
return annotate_with_boxes(image_input, detections)
|
51 |
+
elif task_dropdown in CAPTION_TASK_NAMES:
|
52 |
_, response = run_inference(
|
53 |
model, processor, DEVICE, image_input, task)
|
54 |
return response[task]
|
|
|
74 |
with gr.Column():
|
75 |
@gr.render(inputs=task_dropdown_component)
|
76 |
def show_output(text):
|
77 |
+
if text == OBJECT_DETECTION_TASK_NAME:
|
78 |
image_output_component = gr.Image(type='pil', label='Image Output')
|
79 |
submit_button_component.click(
|
80 |
fn=process,
|
|
|
85 |
],
|
86 |
outputs=image_output_component
|
87 |
)
|
88 |
+
elif text in CAPTION_TASK_NAMES:
|
89 |
text_output_component = gr.Textbox(label='Caption Output')
|
90 |
submit_button_component.click(
|
91 |
fn=process,
|
utils/tasks.py
CHANGED
@@ -1,8 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
1 |
TASK_NAMES = [
|
2 |
-
|
3 |
-
|
|
|
|
|
4 |
]
|
5 |
TASKS = {
|
6 |
-
|
7 |
-
|
|
|
|
|
8 |
}
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
OBJECT_DETECTION_TASK_NAME = "Object Detection"
|
2 |
+
CAPTION_TASK_NAME = "Caption"
|
3 |
+
DETAILED_CAPTION_TASK_NAME = "Detailed Caption"
|
4 |
+
MORE_DETAILED_CAPTION_TASK_NAME = "More Detailed Caption"
|
5 |
+
|
6 |
TASK_NAMES = [
|
7 |
+
OBJECT_DETECTION_TASK_NAME,
|
8 |
+
CAPTION_TASK_NAME,
|
9 |
+
DETAILED_CAPTION_TASK_NAME,
|
10 |
+
MORE_DETAILED_CAPTION_TASK_NAME
|
11 |
]
|
12 |
TASKS = {
|
13 |
+
OBJECT_DETECTION_TASK_NAME: "<OD>",
|
14 |
+
CAPTION_TASK_NAME: "<CAPTION>",
|
15 |
+
DETAILED_CAPTION_TASK_NAME: "<DETAILED_CAPTION>",
|
16 |
+
MORE_DETAILED_CAPTION_TASK_NAME: "<MORE_DETAILED_CAPTION>"
|
17 |
}
|
18 |
+
CAPTION_TASK_NAMES = [
|
19 |
+
CAPTION_TASK_NAME,
|
20 |
+
DETAILED_CAPTION_TASK_NAME,
|
21 |
+
MORE_DETAILED_CAPTION_TASK_NAME
|
22 |
+
]
|