SkalskiP committed
Commit 9c79daa
1 Parent(s): 039d19a

initial commit

.gitignore ADDED
@@ -0,0 +1,2 @@
+ venv
+ .idea
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: Better Florence 2
- emoji: 🐠
- colorFrom: indigo
- colorTo: red
+ emoji: 🔥
+ colorFrom: purple
+ colorTo: green
  sdk: gradio
  sdk_version: 4.37.2
  app_file: app.py
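
For reference, the Space configuration front matter after this change reads as below; the closing `---` is assumed, since the hunk only covers the first eight lines.

```yaml
---
title: Better Florence 2
emoji: 🔥
colorFrom: purple
colorTo: green
sdk: gradio
sdk_version: 4.37.2
app_file: app.py
---
```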
app.py ADDED
@@ -0,0 +1,124 @@
+ import gradio as gr
+ import supervision as sv
+ import torch
+ import spaces
+
+ from utils.annotate import annotate_with_boxes
+ from utils.models import load_models, run_inference, CHECKPOINTS
+ from utils.tasks import TASK_NAMES, TASKS
+
+ MARKDOWN = """
+ # Better Florence-2 Playground 🔥
+ <div>
+     <a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-florence-2-on-detection-dataset.ipynb">
+         <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
+     </a>
+     <a href="https://blog.roboflow.com/florence-2/">
+         <img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
+     </a>
+     <a href="https://arxiv.org/abs/2311.06242">
+         <img src="https://img.shields.io/badge/arXiv-2311.06242-b31b1b.svg" alt="arXiv" style="display:inline-block;">
+     </a>
+     <a href="https://www.youtube.com/watch?v=i3KjYgxNH6w">
+         <img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
+     </a>
+ </div>
+ """
+
+ OBJECT_DETECTION_EXAMPLES = [
+     ["microsoft/Florence-2-large-ft", "Object Detection", "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
+ ]
+ CAPTION_EXAMPLES = [
+     ["microsoft/Florence-2-large-ft", "Caption", "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
+ ]
+
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ MODELS, PROCESSORS = load_models(DEVICE)
+
+
+ @spaces.GPU
+ def process(checkpoint_dropdown, task_dropdown, image_input):
+     model = MODELS[checkpoint_dropdown]
+     processor = PROCESSORS[checkpoint_dropdown]
+     task = TASKS[task_dropdown]
+     if task_dropdown == "Object Detection":
+         _, response = run_inference(
+             model, processor, DEVICE, image_input, task)
+         detections = sv.Detections.from_lmm(
+             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
+         return annotate_with_boxes(image_input, detections)
+     elif task_dropdown == "Caption":
+         _, response = run_inference(
+             model, processor, DEVICE, image_input, task)
+         return response[task]
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(MARKDOWN)
+     with gr.Row():
+         checkpoint_dropdown_component = gr.Dropdown(
+             choices=CHECKPOINTS,
+             value=CHECKPOINTS[0],
+             label="Model", info="Select a Florence 2 model to use.")
+         task_dropdown_component = gr.Dropdown(
+             choices=TASK_NAMES,
+             value=TASK_NAMES[0],
+             label="Task", info="Select a task to perform with the model.")
+
+     with gr.Row():
+         with gr.Column():
+             image_input_component = gr.Image(type='pil', label='Image Input')
+             submit_button_component = gr.Button(value='Submit', variant='primary')
+
+         with gr.Column():
+             @gr.render(inputs=task_dropdown_component)
+             def show_output(text):
+                 if text == "Object Detection":
+                     image_output_component = gr.Image(type='pil', label='Image Output')
+                     submit_button_component.click(
+                         fn=process,
+                         inputs=[
+                             checkpoint_dropdown_component,
+                             task_dropdown_component,
+                             image_input_component
+                         ],
+                         outputs=image_output_component
+                     )
+                 elif text == "Caption":
+                     text_output_component = gr.Textbox(label='Caption Output')
+                     submit_button_component.click(
+                         fn=process,
+                         inputs=[
+                             checkpoint_dropdown_component,
+                             task_dropdown_component,
+                             image_input_component
+                         ],
+                         outputs=text_output_component
+                     )
+
+     # @gr.render(inputs=task_dropdown_component)
+     # def show_examples(text):
+     #     if text == "Object Detection":
+     #         gr.Examples(
+     #             fn=process,
+     #             examples=OBJECT_DETECTION_EXAMPLES,
+     #             inputs=[
+     #                 checkpoint_dropdown_component,
+     #                 task_dropdown_component,
+     #                 image_input_component
+     #             ],
+     #             outputs=image_output_component
+     #         )
+     #     elif text == "Caption":
+     #         gr.Examples(
+     #             fn=process,
+     #             examples=CAPTION_EXAMPLES,
+     #             inputs=[
+     #                 checkpoint_dropdown_component,
+     #                 task_dropdown_component,
+     #                 image_input_component
+     #             ],
+     #             outputs=text_output_component
+     #         )
+
+ demo.launch(debug=False, show_error=True, max_threads=1)
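
As a rough illustration, the "Object Detection" branch that `process` wires up could be replayed outside Gradio as sketched below. This is not part of the commit: the checkpoint and image URL are taken from OBJECT_DETECTION_EXAMPLES, and `requests` is assumed to be available alongside the listed dependencies.

```python
# Sketch only: replays the "Object Detection" branch of process() without the UI.
# Assumes the utils/ package from this commit is importable and `requests` is installed.
import requests
import supervision as sv
import torch
from PIL import Image

from utils.annotate import annotate_with_boxes
from utils.models import load_models, run_inference
from utils.tasks import TASKS

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
models, processors = load_models(device)
checkpoint = "microsoft/Florence-2-large-ft"

url = "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

# Run the <OD> task, parse the response into supervision Detections, and draw boxes.
_, response = run_inference(
    models[checkpoint], processors[checkpoint], device, image, TASKS["Object Detection"])
detections = sv.Detections.from_lmm(
    lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image.size)
annotate_with_boxes(image, detections).save("dog-2-annotated.jpeg")  # hypothetical output path
```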
requirements-local.txt ADDED
@@ -0,0 +1,7 @@
+ torch
+ einops
+ timm
+ gradio
+ transformers
+ gradio-image-prompter
+ supervision==0.22.0rc1
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ einops
+ spaces
+ timm
+ gradio
+ transformers
+ gradio-image-prompter
+ supervision==0.22.0rc1
utils/__init__.py ADDED
File without changes
utils/annotate.py ADDED
@@ -0,0 +1,17 @@
+ import supervision as sv
+ from PIL import Image
+
+
+ def annotate_with_boxes(image: Image, detections: sv.Detections) -> Image:
+     annotated_image = image.copy()
+     thickness = sv.calculate_optimal_line_thickness(resolution_wh=image.size)
+     text_scale = sv.calculate_optimal_text_scale(resolution_wh=image.size)
+     bounding_box_annotator = sv.BoundingBoxAnnotator(
+         color_lookup=sv.ColorLookup.INDEX, thickness=thickness)
+     label_annotator = sv.LabelAnnotator(
+         color_lookup=sv.ColorLookup.INDEX,
+         text_scale=text_scale,
+         text_thickness=thickness)
+     annotated_image = bounding_box_annotator.annotate(annotated_image, detections)
+     annotated_image = label_annotator.annotate(annotated_image, detections)
+     return annotated_image
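
A minimal usage sketch for `annotate_with_boxes`, with detections constructed by hand rather than parsed from a Florence-2 response (the boxes, class names, and output path below are purely illustrative):

```python
# Sketch only: annotate a blank image with two hand-made detections.
import numpy as np
import supervision as sv
from PIL import Image

from utils.annotate import annotate_with_boxes

image = Image.new("RGB", (640, 480), color="white")  # placeholder image
detections = sv.Detections(
    xyxy=np.array([[50.0, 60.0, 200.0, 220.0], [300.0, 100.0, 460.0, 300.0]]),
    class_id=np.array([0, 1]),
    data={"class_name": np.array(["dog", "cat"])},  # LabelAnnotator falls back to this field
)
annotate_with_boxes(image, detections).save("boxes.png")
```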
utils/imports.py ADDED
@@ -0,0 +1,13 @@
+ import os
+
+ from typing import Union
+ from transformers.dynamic_module_utils import get_imports
+
+
+ def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+     """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+     if not str(filename).endswith("/modeling_florence2.py"):
+         return get_imports(filename)
+     imports = get_imports(filename)
+     imports.remove("flash_attn")
+     return imports
utils/models.py ADDED
@@ -0,0 +1,49 @@
+ import torch
+ from typing import Tuple, Dict, Any
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ from unittest.mock import patch
+ from PIL import Image
+
+ from utils.imports import fixed_get_imports
+
+ CHECKPOINTS = [
+     "microsoft/Florence-2-large-ft",
+     "microsoft/Florence-2-large",
+     "microsoft/Florence-2-base-ft",
+     "microsoft/Florence-2-base",
+ ]
+
+
+ def load_models(device: torch.device) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+     with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+         models = {}
+         processors = {}
+         for checkpoint in CHECKPOINTS:
+             models[checkpoint] = AutoModelForCausalLM.from_pretrained(
+                 checkpoint, trust_remote_code=True).to(device)
+             processors[checkpoint] = AutoProcessor.from_pretrained(
+                 checkpoint, trust_remote_code=True)
+         return models, processors
+
+
+ def run_inference(
+     model: Any,
+     processor: Any,
+     device: torch.device,
+     image: Image,
+     task: str,
+     text: str = ""
+ ) -> Tuple[str, Dict]:
+     prompt = task + text
+     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3
+     )
+     generated_text = processor.batch_decode(
+         generated_ids, skip_special_tokens=False)[0]
+     response = processor.post_process_generation(
+         generated_text, task=task, image_size=image.size)
+     return generated_text, response
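
A corresponding caption sketch using `run_inference` directly: the first return value is the raw decoded text, and the parsed response is keyed by the task token, which is how app.py reads it back (the local image path below is a hypothetical stand-in):

```python
# Sketch only: caption a local image with one of the listed checkpoints.
import torch
from PIL import Image

from utils.models import load_models, run_inference

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
models, processors = load_models(device)
checkpoint = "microsoft/Florence-2-base-ft"

image = Image.open("example.jpeg")  # hypothetical local file
generated_text, response = run_inference(
    models[checkpoint], processors[checkpoint], device, image, "<CAPTION>")
print(generated_text)           # raw decoded output, special tokens included
print(response["<CAPTION>"])    # parsed caption, as returned to the Gradio textbox
```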
utils/tasks.py ADDED
@@ -0,0 +1,8 @@
+ TASK_NAMES = [
+     "Object Detection",
+     "Caption"
+ ]
+ TASKS = {
+     "Object Detection": "<OD>",
+     "Caption": "<CAPTION>"
+ }
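
If more of Florence-2's prompt-based tasks were exposed later, the registry could grow in the same shape; the extra entry below is illustrative and not part of this commit:

```python
# Hypothetical extension, not in this commit: expose detailed captioning as well.
TASK_NAMES = [
    "Object Detection",
    "Caption",
    "Detailed Caption",
]
TASKS = {
    "Object Detection": "<OD>",
    "Caption": "<CAPTION>",
    "Detailed Caption": "<DETAILED_CAPTION>",
}
```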