Spaces:

SkalskiP
/

better-florence-2

Running on Zero

App Files Community

SkalskiP committed on Jul 4, 2024

Commit

5d15f06

1 Parent(s): 9c79daa

more captioning tasks

Browse files

Files changed (2) hide show

app.py +12 -11
utils/tasks.py +18 -4

app.py CHANGED Viewed

@@ -5,7 +5,8 @@ import spaces
 from utils.annotate import annotate_with_boxes
 from utils.models import load_models, run_inference, CHECKPOINTS
-from utils.tasks import TASK_NAMES, TASKS
 MARKDOWN = """
 # Better Florence-2 Playground 🔥
@@ -25,12 +26,12 @@ MARKDOWN = """
 </div>
 """
-OBJECT_DETECTION_EXAMPLES = [
-    ["microsoft/Florence-2-large-ft", "Object Detection", "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
-]
-CAPTION_EXAMPLES = [
-    ["microsoft/Florence-2-large-ft", "Caption", "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
-]
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 MODELS, PROCESSORS = load_models(DEVICE)
@@ -41,13 +42,13 @@ def process(checkpoint_dropdown, task_dropdown, image_input):
     model = MODELS[checkpoint_dropdown]
     processor = PROCESSORS[checkpoint_dropdown]
     task = TASKS[task_dropdown]
-    if task_dropdown == "Object Detection":
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         detections = sv.Detections.from_lmm(
             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
         return annotate_with_boxes(image_input, detections)
-    elif task_dropdown == "Caption":
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         return response[task]
@@ -73,7 +74,7 @@ with gr.Blocks() as demo:
         with gr.Column():
             @gr.render(inputs=task_dropdown_component)
             def show_output(text):
-                if text == "Object Detection":
                     image_output_component = gr.Image(type='pil', label='Image Output')
                     submit_button_component.click(
                         fn=process,
@@ -84,7 +85,7 @@ with gr.Blocks() as demo:
                         ],
                         outputs=image_output_component
                     )
-                elif text == "Caption":
                     text_output_component = gr.Textbox(label='Caption Output')
                     submit_button_component.click(
                         fn=process,

 from utils.annotate import annotate_with_boxes
 from utils.models import load_models, run_inference, CHECKPOINTS
+from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
+    CAPTION_TASK_NAMES
 MARKDOWN = """
 # Better Florence-2 Playground 🔥
 </div>
 """
+# OBJECT_DETECTION_EXAMPLES = [
+#     ["microsoft/Florence-2-large-ft", "Object Detection", "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
+# ]
+# CAPTION_EXAMPLES = [
+#     ["microsoft/Florence-2-large-ft", "Caption", "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
+# ]
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 MODELS, PROCESSORS = load_models(DEVICE)
     model = MODELS[checkpoint_dropdown]
     processor = PROCESSORS[checkpoint_dropdown]
     task = TASKS[task_dropdown]
+    if task_dropdown == OBJECT_DETECTION_TASK_NAME:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         detections = sv.Detections.from_lmm(
             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
         return annotate_with_boxes(image_input, detections)
+    elif task_dropdown in CAPTION_TASK_NAMES:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         return response[task]
         with gr.Column():
             @gr.render(inputs=task_dropdown_component)
             def show_output(text):
+                if text == OBJECT_DETECTION_TASK_NAME:
                     image_output_component = gr.Image(type='pil', label='Image Output')
                     submit_button_component.click(
                         fn=process,
                         ],
                         outputs=image_output_component
                     )
+                elif text in CAPTION_TASK_NAMES:
                     text_output_component = gr.Textbox(label='Caption Output')
                     submit_button_component.click(
                         fn=process,

utils/tasks.py CHANGED Viewed

@@ -1,8 +1,22 @@
 TASK_NAMES = [
-    "Object Detection",
-    "Caption"
 ]
 TASKS = {
-    "Object Detection": "<OD>",
-    "Caption": "<CAPTION>"
 }

+OBJECT_DETECTION_TASK_NAME = "Object Detection"
+CAPTION_TASK_NAME = "Caption"
+DETAILED_CAPTION_TASK_NAME = "Detailed Caption"
+MORE_DETAILED_CAPTION_TASK_NAME = "More Detailed Caption"
 TASK_NAMES = [
+    OBJECT_DETECTION_TASK_NAME,
+    CAPTION_TASK_NAME,
+    DETAILED_CAPTION_TASK_NAME,
+    MORE_DETAILED_CAPTION_TASK_NAME
 ]
 TASKS = {
+    OBJECT_DETECTION_TASK_NAME: "<OD>",
+    CAPTION_TASK_NAME: "<CAPTION>",
+    DETAILED_CAPTION_TASK_NAME: "<DETAILED_CAPTION>",
+    MORE_DETAILED_CAPTION_TASK_NAME: "<MORE_DETAILED_CAPTION>"
 }
+CAPTION_TASK_NAMES = [
+    CAPTION_TASK_NAME,
+    DETAILED_CAPTION_TASK_NAME,
+    MORE_DETAILED_CAPTION_TASK_NAME
+]