SkalskiP committed
Commit 1858b2a
1 Parent(s): eba8e42

full end-to-end app rewrite

Files changed (2):
  1. app.py +82 -34
  2. utils/tasks.py +33 -2
app.py CHANGED
@@ -1,13 +1,19 @@
+from typing import Tuple, Optional
+
 import gradio as gr
+import spaces
 import supervision as sv
 import torch
-import spaces
+from gradio_image_prompter import ImagePrompter
+from PIL import Image
 
 from utils.annotate import annotate_with_boxes
 from utils.models import load_models, run_inference, CHECKPOINTS
 from utils.tasks import TASK_NAMES, TASKS, OBJECT_DETECTION_TASK_NAME, \
     CAPTION_TASK_NAMES, CAPTION_TASK_NAME, DETAILED_CAPTION_TASK_NAME, \
-    MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME
+    MORE_DETAILED_CAPTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME, OCR_TASK_NAME, \
+    IMAGE_INPUT_TASK_NAMES, IMAGE_PROMPTER_INPUT_TASK_NAMES, IMAGE_OUTPUT_TASK_NAMES, \
+    TEXTBOX_OUTPUT_TASK_NAMES
 
 MARKDOWN = """
 # Better Florence-2 Playground 🔥
@@ -35,21 +41,14 @@ text format. It uses a DaViT vision encoder to convert images into visual token
 embeddings. These are then concatenated with BERT-generated text embeddings and
 processed by a transformer-based multi-modal encoder-decoder to generate the response.
 """
-
-OBJECT_DETECTION_EXAMPLES = [
-    ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
-]
-CAPTION_EXAMPLES = [
-    ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"],
-    ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"],
-    ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg"]
-]
-OCR_EXAMPLES = [
-    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg"],
-]
-OCR_WITH_REGION_EXAMPLES = [
-    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg"],
-    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg"]
+EXAMPLES = [
+    ["microsoft/Florence-2-large-ft", OBJECT_DETECTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
+    ["microsoft/Florence-2-large-ft", CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
+    ["microsoft/Florence-2-large-ft", DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
+    ["microsoft/Florence-2-large-ft", MORE_DETAILED_CAPTION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/dog-2.jpeg", None],
+    ["microsoft/Florence-2-large-ft", OCR_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
+    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/notebooks/examples/handwritten-text.jpg", None],
+    ["microsoft/Florence-2-large-ft", OCR_WITH_REGION_TASK_NAME, "https://media.roboflow.com/inference/license_plate_1.jpg", None]
 ]
 
 DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -58,20 +57,26 @@ MODELS, PROCESSORS = load_models(DEVICE)
 
 
 @spaces.GPU
-def process(checkpoint_dropdown, task_dropdown, image_input):
+def process(
+    checkpoint_dropdown,
+    task_dropdown,
+    image_input,
+    image_prompter_input
+) -> Tuple[Optional[Image.Image], Optional[str]]:
     model = MODELS[checkpoint_dropdown]
     processor = PROCESSORS[checkpoint_dropdown]
     task = TASKS[task_dropdown]
+
     if task_dropdown in [OBJECT_DETECTION_TASK_NAME, OCR_WITH_REGION_TASK_NAME]:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
         detections = sv.Detections.from_lmm(
             lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image_input.size)
-        return annotate_with_boxes(image_input, detections)
+        return annotate_with_boxes(image_input, detections), None
     elif task_dropdown in CAPTION_TASK_NAMES or task_dropdown == OCR_TASK_NAME:
         _, response = run_inference(
             model, processor, DEVICE, image_input, task)
-        return response[task]
+        return None, response[task]
 
 
 with gr.Blocks() as demo:
@@ -80,31 +85,74 @@ with gr.Blocks() as demo:
         checkpoint_dropdown_component = gr.Dropdown(
             choices=CHECKPOINTS,
             value=CHECKPOINTS[0],
-            label="Model", info="Select a Florence 2 model to use.")
+            label="Model", info="Select a Florence 2 model to use.",
+            interactive=True
+        )
         task_dropdown_component = gr.Dropdown(
             choices=TASK_NAMES,
             value=TASK_NAMES[0],
-            label="Task", info="Select a task to perform with the model.")
+            label="Task", info="Select a task to perform with the model.",
+            interactive=True
+        )
 
     with gr.Row():
         with gr.Column():
-            image_input_component = gr.Image(type='pil', label='Image Input')
+            image_input_component = gr.Image(
+                type='pil', label='Upload image')
+            image_prompter_input_component = ImagePrompter(
+                type='pil', label='Upload image and draw box prompt', visible=False)
            submit_button_component = gr.Button(value='Submit', variant='primary')
 
         with gr.Column():
             image_output_component = gr.Image(type='pil', label='Image Output')
             text_output_component = gr.Textbox(label='Caption Output', visible=False)
-
-    def on_dropdown_input(text):
-        if text in CAPTION_TASK_NAMES + [OCR_TASK_NAME]:
-            return [gr.Image(visible=False), gr.Textbox(visible=True)]
-        else:
-            return [gr.Image(visible=True), gr.Textbox(visible=False)]
-
-    task_dropdown_component.input(
-        on_dropdown_input,
+    with gr.Row():
+        gr.Examples(
+            fn=process,
+            examples=EXAMPLES,
+            inputs=[
+                checkpoint_dropdown_component,
+                task_dropdown_component,
+                image_input_component,
+                image_prompter_input_component
+            ],
+            outputs=[
+                image_output_component,
+                text_output_component
+            ],
+            run_on_click=True
+        )
+
+    def on_dropdown_change(text):
+        return [
+            gr.Image(visible=text in IMAGE_INPUT_TASK_NAMES),
+            ImagePrompter(visible=text in IMAGE_PROMPTER_INPUT_TASK_NAMES),
+            gr.Image(visible=text in IMAGE_OUTPUT_TASK_NAMES),
+            gr.Textbox(visible=text in TEXTBOX_OUTPUT_TASK_NAMES)
+        ]
+
+    task_dropdown_component.change(
+        on_dropdown_change,
         inputs=[task_dropdown_component],
-        outputs=[image_output_component, text_output_component])
-
+        outputs=[
+            image_input_component,
+            image_prompter_input_component,
+            image_output_component,
+            text_output_component
+        ]
+    )
+    submit_button_component.click(
+        fn=process,
+        inputs=[
+            checkpoint_dropdown_component,
+            task_dropdown_component,
+            image_input_component,
+            image_prompter_input_component
+        ],
+        outputs=[
+            image_output_component,
+            text_output_component
+        ]
+    )
 
     demo.launch(debug=False, show_error=True, max_threads=1)
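
For reference, the object-detection path of the rewritten `process()` can be exercised outside Gradio with the same helpers this diff relies on. The following is a minimal sketch rather than part of the commit: it assumes the repo's `utils` package is importable, that `annotate_with_boxes` returns a PIL image (implied by the app wiring but not shown in this diff), and uses a hypothetical local image path.

```python
import supervision as sv
import torch
from PIL import Image

from utils.annotate import annotate_with_boxes
from utils.models import load_models, run_inference
from utils.tasks import TASKS, OBJECT_DETECTION_TASK_NAME

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODELS, PROCESSORS = load_models(DEVICE)

checkpoint = "microsoft/Florence-2-large-ft"
image = Image.open("dog-2.jpeg")  # hypothetical local copy of the example image

# Mirrors the object-detection branch of process(): run the task, convert the
# raw Florence-2 response into sv.Detections, then draw boxes on the input image.
_, response = run_inference(
    MODELS[checkpoint], PROCESSORS[checkpoint], DEVICE, image,
    TASKS[OBJECT_DETECTION_TASK_NAME])
detections = sv.Detections.from_lmm(
    lmm=sv.LMM.FLORENCE_2, result=response, resolution_wh=image.size)
annotated = annotate_with_boxes(image, detections)  # assumed to return a PIL image
annotated.save("dog-2-annotated.jpeg")
```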
utils/tasks.py CHANGED
@@ -4,6 +4,8 @@ DETAILED_CAPTION_TASK_NAME = "Detailed Caption"
 MORE_DETAILED_CAPTION_TASK_NAME = "More Detailed Caption"
 OCR_TASK_NAME = "OCR"
 OCR_WITH_REGION_TASK_NAME = "OCR with Region"
+REGION_TO_CATEGORY_TASK_NAME = "Region to Category"
+REGION_TO_DESCRIPTION_TASK_NAME = "Region to Description"
 
 TASK_NAMES = [
     OBJECT_DETECTION_TASK_NAME,
@@ -11,7 +13,9 @@ TASK_NAMES = [
     DETAILED_CAPTION_TASK_NAME,
     MORE_DETAILED_CAPTION_TASK_NAME,
     OCR_TASK_NAME,
-    OCR_WITH_REGION_TASK_NAME
+    OCR_WITH_REGION_TASK_NAME,
+    REGION_TO_CATEGORY_TASK_NAME,
+    REGION_TO_DESCRIPTION_TASK_NAME
 ]
 TASKS = {
     OBJECT_DETECTION_TASK_NAME: "<OD>",
@@ -19,10 +23,37 @@ TASKS = {
     DETAILED_CAPTION_TASK_NAME: "<DETAILED_CAPTION>",
     MORE_DETAILED_CAPTION_TASK_NAME: "<MORE_DETAILED_CAPTION>",
     OCR_TASK_NAME: "<OCR>",
-    OCR_WITH_REGION_TASK_NAME: "<OCR_WITH_REGION>"
+    OCR_WITH_REGION_TASK_NAME: "<OCR_WITH_REGION>",
+    REGION_TO_CATEGORY_TASK_NAME: "<REGION_TO_CATEGORY>",
+    REGION_TO_DESCRIPTION_TASK_NAME: "<REGION_TO_DESCRIPTION>"
 }
 CAPTION_TASK_NAMES = [
     CAPTION_TASK_NAME,
     DETAILED_CAPTION_TASK_NAME,
     MORE_DETAILED_CAPTION_TASK_NAME
+]
+
+IMAGE_INPUT_TASK_NAMES = [
+    OBJECT_DETECTION_TASK_NAME,
+    CAPTION_TASK_NAME,
+    DETAILED_CAPTION_TASK_NAME,
+    MORE_DETAILED_CAPTION_TASK_NAME,
+    OCR_TASK_NAME,
+    OCR_WITH_REGION_TASK_NAME,
+]
+IMAGE_PROMPTER_INPUT_TASK_NAMES = [
+    REGION_TO_CATEGORY_TASK_NAME,
+    REGION_TO_DESCRIPTION_TASK_NAME
+]
+IMAGE_OUTPUT_TASK_NAMES = [
+    OBJECT_DETECTION_TASK_NAME,
+    OCR_WITH_REGION_TASK_NAME,
+    REGION_TO_CATEGORY_TASK_NAME,
+    REGION_TO_DESCRIPTION_TASK_NAME
+]
+TEXTBOX_OUTPUT_TASK_NAMES = [
+    CAPTION_TASK_NAME,
+    DETAILED_CAPTION_TASK_NAME,
+    MORE_DETAILED_CAPTION_TASK_NAME,
+    OCR_TASK_NAME
 ]
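
The four new lists are what `on_dropdown_change` in app.py reads to decide which input and output components to show for the selected task. A small sanity-check sketch (not part of the commit) that consumes the same lists and confirms every task maps to exactly one input widget and at least one output widget:

```python
from utils.tasks import (
    TASK_NAMES,
    IMAGE_INPUT_TASK_NAMES,
    IMAGE_PROMPTER_INPUT_TASK_NAMES,
    IMAGE_OUTPUT_TASK_NAMES,
    TEXTBOX_OUTPUT_TASK_NAMES,
)

for task_name in TASK_NAMES:
    # Exactly one input widget: either the plain image input or the box-prompt input.
    assert (task_name in IMAGE_INPUT_TASK_NAMES) != (task_name in IMAGE_PROMPTER_INPUT_TASK_NAMES)
    # At least one output widget: annotated image and/or caption textbox.
    assert (task_name in IMAGE_OUTPUT_TASK_NAMES) or (task_name in TEXTBOX_OUTPUT_TASK_NAMES)
    print(
        f"{task_name}: "
        f"image_in={task_name in IMAGE_INPUT_TASK_NAMES}, "
        f"prompter_in={task_name in IMAGE_PROMPTER_INPUT_TASK_NAMES}, "
        f"image_out={task_name in IMAGE_OUTPUT_TASK_NAMES}, "
        f"text_out={task_name in TEXTBOX_OUTPUT_TASK_NAMES}"
    )
```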