SkalskiP committed
Commit 9c79daa
1 Parent(s): 039d19a

initial commit

.gitignore ADDED
@@ -0,0 +1,2 @@
+ venv
+ .idea
README.md CHANGED
@@ -1,8 +1,8 @@
  ---
  title: Better Florence 2
- emoji: 🐠
- colorFrom: indigo
- colorTo: red
+ emoji: 🔥
+ colorFrom: purple
+ colorTo: green
  sdk: gradio
  sdk_version: 4.37.2
  app_file: app.py
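
For reference, the Space configuration front matter after this change reads as below; the closing `---` is assumed, since the hunk only covers the first eight lines.

```yaml
---
title: Better Florence 2
emoji: 🔥
colorFrom: purple
colorTo: green
sdk: gradio
sdk_version: 4.37.2
app_file: app.py
---
```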
app.py ADDED
@@ -0,0 +1,124 @@
+ import gradio as gr
+ import supervision as sv
+ import torch
+ import spaces
+
+ from utils.annotate import annotate_with_boxes
+ from utils.models import load_models, run_inference, CHECKPOINTS
+ from utils.tasks import TASK_NAMES, TASKS
+
+ MARKDOWN = """
+ # Better Florence-2 Playground 🔥
+ <div>
+     <a href="https://colab.research.google.com/github/roboflow-ai/notebooks/blob/main/notebooks/how-to-finetune-florence-2-on-detection-dataset.ipynb">
+         <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Colab" style="display:inline-block;">
+     </a>
+     <a href="https://blog.roboflow.com/florence-2/">
+         <img src="https://raw.githubusercontent.com/roboflow-ai/notebooks/main/assets/badges/roboflow-blogpost.svg" alt="Roboflow" style="display:inline-block;">
+     </a>
+     <a href="https://arxiv.org/abs/2311.06242">
+         <img src="https://img.shields.io/badge/arXiv-2311.06242-b31b1b.svg" alt="arXiv" style="display:inline-block;">
+     </a>
+     <a href="https://www.youtube.com/watch?v=i3KjYgxNH6w">
+         <img src="https://badges.aleen42.com/src/youtube.svg" alt="YouTube" style="display:inline-block;">
+     </a>
+ </div>
+ """
+
+ OBJECT_DETECTION_EXAMPLES = [
+     ["microsoft/Florence-2-large-ft", "Object Detection", "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
+ ]
+ CAPTION_EXAMPLES = [
+     ["microsoft/Florence-2-large-ft", "Caption", "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
+ ]
+
+ DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ MODELS, PROCESSORS = load_models(DEVICE)
+
+
+ @spaces.GPU
+ def process(checkpoint_dropdown, task_dropdown, image_input):
+     model = MODELS[checkpoint_dropdown]
+     processor = PROCESSORS[checkpoint_dropdown]
+     task = TASKS[task_dropdown]
+     if task_dropdown == "Object Detection":
+         _, response = run_inference(
+             model, processor, DEVICE, image_input, task)
+         detections = sv.Detections.from_lmm(
+             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
+         return annotate_with_boxes(image_input, detections)
+     elif task_dropdown == "Caption":
+         _, response = run_inference(
+             model, processor, DEVICE, image_input, task)
+         return response[task]
+
+
+ with gr.Blocks() as demo:
+     gr.Markdown(MARKDOWN)
+     with gr.Row():
+         checkpoint_dropdown_component = gr.Dropdown(
+             choices=CHECKPOINTS,
+             value=CHECKPOINTS[0],
+             label="Model", info="Select a Florence 2 model to use.")
+         task_dropdown_component = gr.Dropdown(
+             choices=TASK_NAMES,
+             value=TASK_NAMES[0],
+             label="Task", info="Select a task to perform with the model.")
+
+     with gr.Row():
+         with gr.Column():
+             image_input_component = gr.Image(type='pil', label='Image Input')
+             submit_button_component = gr.Button(value='Submit', variant='primary')
+
+         with gr.Column():
+             @gr.render(inputs=task_dropdown_component)
+             def show_output(text):
+                 if text == "Object Detection":
+                     image_output_component = gr.Image(type='pil', label='Image Output')
+                     submit_button_component.click(
+                         fn=process,
+                         inputs=[
+                             checkpoint_dropdown_component,
+                             task_dropdown_component,
+                             image_input_component
+                         ],
+                         outputs=image_output_component
+                     )
+                 elif text == "Caption":
+                     text_output_component = gr.Textbox(label='Caption Output')
+                     submit_button_component.click(
+                         fn=process,
+                         inputs=[
+                             checkpoint_dropdown_component,
+                             task_dropdown_component,
+                             image_input_component
+                         ],
+                         outputs=text_output_component
+                     )
+
+     # @gr.render(inputs=task_dropdown_component)
+     # def show_examples(text):
+     #     if text == "Object Detection":
+     #         gr.Examples(
+     #             fn=process,
+     #             examples=OBJECT_DETECTION_EXAMPLES,
+     #             inputs=[
+     #                 checkpoint_dropdown_component,
+     #                 task_dropdown_component,
+     #                 image_input_component
+     #             ],
+     #             outputs=image_output_component
+     #         )
+     #     elif text == "Caption":
+     #         gr.Examples(
+     #             fn=process,
+     #             examples=CAPTION_EXAMPLES,
+     #             inputs=[
+     #                 checkpoint_dropdown_component,
+     #                 task_dropdown_component,
+     #                 image_input_component
+     #             ],
+     #             outputs=text_output_component
+     #         )
+
+ demo.launch(debug=False, show_error=True, max_threads=1)
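
As a rough illustration, the "Object Detection" branch that `process` wires up could be replayed outside Gradio as sketched below. This is not part of the commit: the checkpoint and image URL are taken from OBJECT_DETECTION_EXAMPLES, and `requests` is assumed to be available alongside the listed dependencies.

```python
# Sketch only: replays the "Object Detection" branch of process() without the UI.
# Assumes the utils/ package from this commit is importable and `requests` is installed.
import requests
import supervision as sv
import torch
from PIL import Image

from utils.annotate import annotate_with_boxes
from utils.models import load_models, run_inference
from utils.tasks import TASKS

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
models, processors = load_models(device)
checkpoint = "microsoft/Florence-2-large-ft"

url = "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

# Run the <OD> task, parse the response into supervision Detections, and draw boxes.
_, response = run_inference(
    models[checkpoint], processors[checkpoint], device, image, TASKS["Object Detection"])
detections = sv.Detections.from_lmm(
    lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image.size)
annotate_with_boxes(image, detections).save("dog-2-annotated.jpeg")  # hypothetical output path
```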
requirements-local.txt ADDED
@@ -0,0 +1,7 @@
+ torch
+ einops
+ timm
+ gradio
+ transformers
+ gradio-image-prompter
+ supervision==0.22.0rc1
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ einops
+ spaces
+ timm
+ gradio
+ transformers
+ gradio-image-prompter
+ supervision==0.22.0rc1
utils/__init__.py ADDED
File without changes
utils/annotate.py ADDED
@@ -0,0 +1,17 @@
+ import supervision as sv
+ from PIL import Image
+
+
+ def annotate_with_boxes(image: Image, detections: sv.Detections) -> Image:
+     annotated_image = image.copy()
+     thickness = sv.calculate_optimal_line_thickness(resolution_wh=image.size)
+     text_scale = sv.calculate_optimal_text_scale(resolution_wh=image.size)
+     bounding_box_annotator = sv.BoundingBoxAnnotator(
+         color_lookup=sv.ColorLookup.INDEX, thickness=thickness)
+     label_annotator = sv.LabelAnnotator(
+         color_lookup=sv.ColorLookup.INDEX,
+         text_scale=text_scale,
+         text_thickness=thickness)
+     annotated_image = bounding_box_annotator.annotate(annotated_image, detections)
+     annotated_image = label_annotator.annotate(annotated_image, detections)
+     return annotated_image
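
A minimal usage sketch for `annotate_with_boxes`, with detections constructed by hand rather than parsed from a Florence-2 response (the boxes, class names, and output path below are purely illustrative):

```python
# Sketch only: annotate a blank image with two hand-made detections.
import numpy as np
import supervision as sv
from PIL import Image

from utils.annotate import annotate_with_boxes

image = Image.new("RGB", (640, 480), color="white")  # placeholder image
detections = sv.Detections(
    xyxy=np.array([[50.0, 60.0, 200.0, 220.0], [300.0, 100.0, 460.0, 300.0]]),
    class_id=np.array([0, 1]),
    data={"class_name": np.array(["dog", "cat"])},  # LabelAnnotator falls back to this field
)
annotate_with_boxes(image, detections).save("boxes.png")
```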
utils/imports.py ADDED
@@ -0,0 +1,13 @@
+ import os
+
+ from typing import Union
+ from transformers.dynamic_module_utils import get_imports
+
+
+ def fixed_get_imports(filename: Union[str, os.PathLike]) -> list[str]:
+     """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
+     if not str(filename).endswith("/modeling_florence2.py"):
+         return get_imports(filename)
+     imports = get_imports(filename)
+     imports.remove("flash_attn")
+     return imports
utils/models.py ADDED
@@ -0,0 +1,49 @@
+ import torch
+ from typing import Tuple, Dict, Any
+ from transformers import AutoModelForCausalLM, AutoProcessor
+ from unittest.mock import patch
+ from PIL import Image
+
+ from utils.imports import fixed_get_imports
+
+ CHECKPOINTS = [
+     "microsoft/Florence-2-large-ft",
+     "microsoft/Florence-2-large",
+     "microsoft/Florence-2-base-ft",
+     "microsoft/Florence-2-base",
+ ]
+
+
+ def load_models(device: torch.device) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+     with patch("transformers.dynamic_module_utils.get_imports", fixed_get_imports):
+         models = {}
+         processors = {}
+         for checkpoint in CHECKPOINTS:
+             models[checkpoint] = AutoModelForCausalLM.from_pretrained(
+                 checkpoint, trust_remote_code=True).to(device)
+             processors[checkpoint] = AutoProcessor.from_pretrained(
+                 checkpoint, trust_remote_code=True)
+         return models, processors
+
+
+ def run_inference(
+     model: Any,
+     processor: Any,
+     device: torch.device,
+     image: Image,
+     task: str,
+     text: str = ""
+ ) -> Tuple[str, Dict]:
+     prompt = task + text
+     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+     generated_ids = model.generate(
+         input_ids=inputs["input_ids"],
+         pixel_values=inputs["pixel_values"],
+         max_new_tokens=1024,
+         num_beams=3
+     )
+     generated_text = processor.batch_decode(
+         generated_ids, skip_special_tokens=False)[0]
+     response = processor.post_process_generation(
+         generated_text, task=task, image_size=image.size)
+     return generated_text, response
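
A corresponding caption sketch using `run_inference` directly: the first return value is the raw decoded text, and the parsed response is keyed by the task token, which is how app.py reads it back (the local image path below is a hypothetical stand-in):

```python
# Sketch only: caption a local image with one of the listed checkpoints.
import torch
from PIL import Image

from utils.models import load_models, run_inference

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
models, processors = load_models(device)
checkpoint = "microsoft/Florence-2-base-ft"

image = Image.open("example.jpeg")  # hypothetical local file
generated_text, response = run_inference(
    models[checkpoint], processors[checkpoint], device, image, "<CAPTION>")
print(generated_text)           # raw decoded output, special tokens included
print(response["<CAPTION>"])    # parsed caption, as returned to the Gradio textbox
```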
utils/tasks.py ADDED
@@ -0,0 +1,8 @@
+ TASK_NAMES = [
+     "Object Detection",
+     "Caption"
+ ]
+ TASKS = {
+     "Object Detection": "<OD>",
+     "Caption": "<CAPTION>"
+ }
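
If more of Florence-2's prompt-based tasks were exposed later, the registry could grow in the same shape; the extra entry below is illustrative and not part of this commit:

```python
# Hypothetical extension, not in this commit: expose detailed captioning as well.
TASK_NAMES = [
    "Object Detection",
    "Caption",
    "Detailed Caption",
]
TASKS = {
    "Object Detection": "<OD>",
    "Caption": "<CAPTION>",
    "Detailed Caption": "<DETAILED_CAPTION>",
}
```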