import io
import json
import random

import cv2
import gradio as gr
import matplotlib.patches as patches
import matplotlib.pyplot as plt
import numpy as np
import torch
from matplotlib.patches import Polygon
from PIL import Image, ImageDraw
from transformers import AutoProcessor

from modeling_florence2 import Florence2ForConditionalGeneration
# Read generation and vision hyperparameters from the model's bundled config.json.
with open("config.json", "r") as f:
    config = json.load(f)
d_model = config['text_config']['d_model']
num_layers = config['text_config']['encoder_layers']
attention_heads = config['text_config']['encoder_attention_heads']
vocab_size = config['text_config']['vocab_size']
max_length = config['text_config']['max_length']
beam_size = config['text_config']['num_beams']
dropout = config['text_config']['dropout']
activation_function = config['text_config']['activation_function']
no_repeat_ngram_size = config['text_config']['no_repeat_ngram_size']
patch_size = config['vision_config']['patch_size'][0]
temporal_embeddings = config['vision_config']['visual_temporal_embedding']['max_temporal_embeddings']
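# For reference, the lookups above assume the nested layout of a Florence-2-style
# config.json (keys only; the actual values ship with the model checkpoint):
#
#   {
#     "text_config":   { "d_model": ..., "encoder_layers": ..., "encoder_attention_heads": ...,
#                        "vocab_size": ..., "max_length": ..., "num_beams": ..., "dropout": ...,
#                        "activation_function": ..., "no_repeat_ngram_size": ... },
#     "vision_config": { "patch_size": [...],
#                        "visual_temporal_embedding": { "max_temporal_embeddings": ... } }
#   }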
title = """# 🙋🏻♂️Welcome to Tonic's PLeIAs/📸📈✍🏻Florence-PDF"""
description = """
This application showcases the **PLeIAs/📸📈✍🏻Florence-PDF** model, a powerful AI system designed for both **text and image generation tasks**. The model is capable of handling complex tasks such as object detection, image captioning, OCR (Optical Character Recognition), and detailed region-based image analysis.
### Model Usage and Flexibility
- **No Repeat N-Grams**: To reduce repetition in text generation, the model is configured with a **no_repeat_ngram_size** of **{no_repeat_ngram_size}**, ensuring more diverse and meaningful outputs.
- **Sampling Strategies**: 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF offers flexible sampling strategies, including **top-k** and **top-p (nucleus) sampling**, allowing for both creative and constrained generation based on user needs.
📸📈✍🏻Florence-PDF is a robust model capable of handling various **text and image** tasks with high precision and flexibility, making it a valuable tool for both academic research and practical applications.
### **How to Use**:
1. **Upload an Image**: Select an image for processing.
2. **Choose a Task**: Pick a task from the dropdown menu, such as "Caption", "Object Detection", "OCR", etc.
3. **Process**: Click the "Process" button to let PLeIAs/📸📈✍🏻Florence-PDF analyze the image and generate the output.
4. **View Results**: Depending on the task, you’ll either see a processed image (e.g., with bounding boxes or labels) or a text-based result (e.g., a generated caption or extracted text).
You can reset the interface anytime by clicking the **Reset** button.
### **Available Tasks**:
- **✍🏻Caption**: Generate a concise description of the image.
- **📸Object Detection**: Identify and label objects within the image.
- **📸✍🏻OCR**: Extract text from the image.
- **📸Region Proposal**: Detect key regions in the image for detailed captioning.
"""
model_presentation = f"""
The **🙏🏻PLeIAs/📸📈✍🏻Florence-PDF** model is a state-of-the-art model for conditional generation tasks, designed to be highly effective for both **text** and **vision** tasks. It is built as an **encoder-decoder** architecture, which allows for enhanced flexibility and performance in generating outputs based on diverse inputs.
### Key Features
- **Model Architecture**: 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF uses an encoder-decoder structure, which makes it effective in tasks like **text generation**, **summarization**, and **translation**. It has **{num_layers} layers** for both the encoder and decoder, with a model dimension (`d_model`) of **{d_model}**.
- **Conditional Generation**: The model can generate text conditionally, with a maximum length of **{max_length} tokens** for each generated sequence, making it ideal for tasks that require concise output.
- **Beam Search**: 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF supports **beam search** with up to **{beam_size} beams**, enabling more diverse and accurate text generation by exploring multiple potential outputs before selecting the best one.
- **Tokenization**: It includes a tokenizer with a vocabulary size of **{vocab_size} tokens**. Special tokens such as the **bos_token_id (0)** and **eos_token_id (2)** help control the generation process by marking the beginning and end of a sequence.
- **Attention Mechanism**: Both the encoder and decoder utilize **{attention_heads} attention heads** per layer, ensuring that the model can focus on relevant parts of the input when generating text.
- **Dropout and Activation**: 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF employs a **{activation_function} activation function** and a **dropout rate of {dropout}**, which enhances model performance by preventing overfitting and improving generalization.
- **Training Configuration**: The model uses **float32** precision for training, and it supports fine-tuning for specific tasks by setting `finetuning_task` appropriately.
### Vision Integration
In addition to text tasks, 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF also incorporates **vision capabilities**:
- **Patch-based Image Processing**: The vision component operates on image patches with a patch size of **{patch_size}x{patch_size}**.
- **Temporal Embedding**: Visual tasks benefit from temporal embeddings with up to **{temporal_embeddings} steps**, making the model well-suited for video analysis.
"""
joinus = """🌟TeamTonic🌟 is always making cool demos! Join our active builder's 🛠️community 👻 [](https://discord.gg/qdfnvSPcqP) On 🤗Huggingface:[MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Tonic-AI](https://github.com/tonic-ai) & contribute to🌟 [Build Tonic](https://git.tonic-ai.com/contribute)🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""
how_to_use = """The advanced settings allow you to fine-tune the text generation process. Here's what each setting does and how to use it:
### Top-k (Default: 50)
Top-k sampling limits the next token selection to the k most likely tokens.
- **Lower values** (e.g., 10) make the output more focused and deterministic.
- **Higher values** (e.g., 100) allow for more diverse outputs.
**Example:** For a creative writing task, try setting top-k to 80 for more varied language.
### Top-p (Default: 1.0)
Top-p (or nucleus) sampling selects from the smallest set of tokens whose cumulative probability exceeds p.
- **Lower values** (e.g., 0.5) make the output more focused and coherent.
- **Higher values** (e.g., 0.9) allow for more diverse and potentially creative outputs.
**Example:** For a factual caption, set top-p to 0.7 to balance accuracy and creativity.
### Repetition Penalty (Default: 1.0)
This penalizes repetition in the generated text.
- **Values closer to 1.0** have minimal effect on repetition.
- **Higher values** (e.g., 1.5) more strongly discourage repetition.
**Example:** If you notice repeated phrases, try increasing to 1.2 for more varied text.
### Number of Beams (Default: 3)
Beam search explores multiple possible sequences in parallel.
- **Higher values** (e.g., 5) can lead to better quality but slower generation.
- **Lower values** (e.g., 1) are faster but may produce lower quality results.
**Example:** For complex tasks like dense captioning, try increasing to 5 beams.
### Max Tokens (Default: 512)
This sets the maximum length of the generated text.
- **Lower values** (e.g., 100) for concise outputs.
- **Higher values** (e.g., 1000) for more detailed descriptions.
**Example:** For a detailed image description, set max tokens to 800 for a comprehensive output.
Remember, these settings interact with each other, so experimenting with different combinations can lead to interesting results!
"""
# Prefer GPU with half precision to save memory; fall back to CPU in float32.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = Florence2ForConditionalGeneration.from_pretrained("PleIAs/Florence-PDF", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
processor = AutoProcessor.from_pretrained("PleIAs/Florence-PDF", trust_remote_code=True)
TASK_PROMPTS = {
"✍🏻Caption": "<CAPTION>",
"✍🏻✍🏻Caption": "<DETAILED_CAPTION>",
"✍🏻✍🏻✍🏻Caption": "<MORE_DETAILED_CAPTION>",
"📸Object Detection": "<OD>",
"📸Dense Region Caption": "<DENSE_REGION_CAPTION>",
"📸✍🏻OCR": "<OCR>",
"📸✍🏻OCR with Region": "<OCR_WITH_REGION>",
"📸Region Proposal": "<REGION_PROPOSAL>"
}
IMAGE_TASKS = ["📸Object Detection", "📸Dense Region Caption", "📸Region Proposal", "📸✍🏻OCR with Region"]
TEXT_TASKS = ["✍🏻Caption", "✍🏻✍🏻Caption", "✍🏻✍🏻✍🏻Caption", "📸✍🏻OCR", "📸✍🏻OCR with Region"]
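# Note: "📸✍🏻OCR with Region" appears in both lists on purpose; it produces an
# annotated image *and* recognized text, so both output widgets are relevant.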
colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']
def fig_to_pil(fig):
    """Render a Matplotlib figure into a PIL image and release the figure."""
    buf = io.BytesIO()
    fig.savefig(buf, format='png')
    buf.seek(0)
    plt.close(fig)  # free the figure so repeated calls don't leak memory
    return Image.open(buf)
def plot_bbox(image, data, use_quad_boxes=False):
    """Overlay detection results (rectangles, or quadrilaterals for OCR) on the image."""
    fig, ax = plt.subplots()
    ax.imshow(image)
    if use_quad_boxes:
        for quad_box, label in zip(data.get('quad_boxes', []), data.get('labels', [])):
            quad_box = np.array(quad_box).reshape(-1, 2)
            poly = Polygon(quad_box, linewidth=1, edgecolor='r', facecolor='none')
            ax.add_patch(poly)
            ax.text(quad_box[0][0], quad_box[0][1], label, color='white', fontsize=8,
                    bbox=dict(facecolor='red', alpha=0.5))
    else:
        for bbox, label in zip(data.get('bboxes', []), data.get('labels', [])):
            x1, y1, x2, y2 = bbox
            rect = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=1, edgecolor='r', facecolor='none')
            ax.add_patch(rect)
            ax.text(x1, y1, label, color='white', fontsize=8, bbox=dict(facecolor='red', alpha=0.5))
    ax.axis('off')
    return fig
def draw_ocr_bboxes(image, prediction):
    """Draw OCR quadrilaterals and their recognized text directly onto a PIL image."""
    draw = ImageDraw.Draw(image)
    for box, label in zip(prediction['quad_boxes'], prediction['labels']):
        color = random.choice(colormap)
        box = np.array(box).tolist()  # flat polygon coordinates [x1, y1, ..., x4, y4]
        draw.polygon(box, width=3, outline=color)
        draw.text((box[0] + 8, box[1] + 2), label, align="right", fill=color)
    return image
def draw_bounding_boxes(image, quad_boxes, labels, color=(0, 255, 0), thickness=2):
    """
    Draws quadrilateral bounding boxes on the image.
    """
    for i, quad in enumerate(quad_boxes):
        points = np.array(quad, dtype=np.int32).reshape((-1, 1, 2))  # reshape the quad points for drawing
        image = cv2.polylines(image, [points], isClosed=True, color=color, thickness=thickness)
        label_pos = (int(quad[0]), int(quad[1]) - 10)
        cv2.putText(image, labels[i], label_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, thickness)
    return image
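# Note: draw_ocr_bboxes (PIL) and draw_bounding_boxes (OpenCV) are alternative
# renderers kept alongside plot_bbox; the Gradio UI below currently draws all
# annotated results through plot_bbox.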
def process_image(image, task):
    """Run a task with deterministic decoding (fixed beams, no sampling)."""
    prompt = TASK_PROMPTS[task]
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=1024,
        num_beams=3,
        do_sample=False
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(generated_text, task=prompt, image_size=(image.width, image.height))
    return parsed_answer
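# Example usage (illustrative; "page.png" is a placeholder path):
#   parsed = process_image(Image.open("page.png"), "✍🏻Caption")
#   print(parsed["<CAPTION>"])  # post-processing keys the result by task prompt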
def main_process(image, task, top_k, top_p, repetition_penalty, num_beams, max_tokens):
    """Run a task with the user-tunable sampling settings from the Advanced Settings panel."""
    prompt = TASK_PROMPTS[task]
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        num_beams=num_beams,
        do_sample=True,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty
    )
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
    parsed_answer = processor.post_process_generation(generated_text, task=prompt, image_size=(image.width, image.height))
    return parsed_answer
def process_and_update(image, task, top_k, top_p, repetition_penalty, num_beams, max_tokens):
    """Dispatch a task and return updates for the image and text output widgets."""
    if image is None:
        return gr.update(value=None, visible=False), gr.update(value="Please upload an image first.", visible=True)
    result = main_process(image, task, top_k, top_p, repetition_penalty, num_beams, max_tokens)
    if task in IMAGE_TASKS:
        if task == "📸✍🏻OCR with Region":
            ocr_data = result.get('<OCR_WITH_REGION>', {})
            fig = plot_bbox(image, ocr_data, use_quad_boxes=True)
            # The post-processed OCR output stores the recognized strings under 'labels'.
            text_output = "\n".join(ocr_data.get('labels', [])) or "No text found"
            return gr.update(value=fig_to_pil(fig), visible=True), gr.update(value=text_output, visible=True)
        fig = plot_bbox(image, result.get(TASK_PROMPTS[task], {}))
        return gr.update(value=fig_to_pil(fig), visible=True), gr.update(value=None, visible=False)
    return gr.update(value=None, visible=False), gr.update(value=str(result), visible=True)

def reset_outputs():
    """Hide the image output and clear the text output."""
    return gr.update(value=None, visible=False), gr.update(value=None, visible=True)
with gr.Blocks(title="Tonic's 🙏🏻PLeIAs/📸📈✍🏻Florence-PDF") as iface:
    with gr.Column():
        with gr.Row():
            gr.Markdown(title)
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown(model_presentation)
            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown(description)
        with gr.Row():
            with gr.Accordion("🫱🏻🫲🏻Join Us", open=True):
                gr.Markdown(joinus)
        with gr.Row():
            with gr.Column(scale=1):
                image_input = gr.Image(type="pil", label="Input Image")
                task_dropdown = gr.Dropdown(list(TASK_PROMPTS.keys()), label="Task", value="✍🏻Caption")
                with gr.Row():
                    submit_button = gr.Button("📸📈✍🏻Process")
                    reset_button = gr.Button("♻️Reset")
                with gr.Accordion("🧪Advanced Settings", open=False):
                    with gr.Accordion("🏗️How To Use", open=True):
                        gr.Markdown(how_to_use)
                    top_k = gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top-k")
                    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, step=0.01, label="Top-p")
                    repetition_penalty = gr.Slider(minimum=1.0, maximum=2.0, value=1.0, step=0.01, label="Repetition Penalty")
                    num_beams = gr.Slider(minimum=1, maximum=6, value=3, step=1, label="Number of Beams")
                    max_tokens = gr.Slider(minimum=1, maximum=1024, value=512, step=1, label="Max Tokens")  # matches the documented default
            with gr.Column(scale=1):
                output_image = gr.Image(label="🙏🏻PLeIAs/📸📈✍🏻Florence-PDF", visible=False)
                output_text = gr.Textbox(label="🙏🏻PLeIAs/📸📈✍🏻Florence-PDF", visible=False)

    submit_button.click(
        fn=process_and_update,
        inputs=[image_input, task_dropdown, top_k, top_p, repetition_penalty, num_beams, max_tokens],
        outputs=[output_image, output_text]
    )
    reset_button.click(
        fn=reset_outputs,
        inputs=[],
        outputs=[output_image, output_text]
    )
    task_dropdown.change(
        fn=lambda task: (gr.update(visible=task in IMAGE_TASKS), gr.update(visible=task in TEXT_TASKS)),
        inputs=[task_dropdown],
        outputs=[output_image, output_text]
    )

iface.launch()