|
|
import gradio as gr |
|
|
import base64 |
|
|
import json |
|
|
from PIL import ImageDraw |
|
|
from io import BytesIO |
|
|
import re |
|
|
import requests |
|
|
from transformers import Qwen2VLProcessor |
|
|
|
|
|
# Chat-template processor for Qwen2-VL; from_pretrained downloads/loads the
# tokenizer assets at import time, so the first start can be slow.
processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", use_fast=True)


# Triton Inference Server "generate" endpoint for the vLLM-backed model.
url = "http://localhost:8000/v2/models/vllm_model/generate"
|
|
|
|
|
|
|
|
def ask_triton(image):
    """Detect the nutrition table in *image* via the Triton-hosted Qwen2-VL model.

    Encodes the PIL image as base64 PNG, renders a chat prompt with the
    module-level ``processor``, POSTs it to the Triton generate endpoint
    (``url``), extracts a ``(x1,y1),(x2,y2)`` bounding box from the model
    output (coordinates normalized to 0-1000, per the regex below), and
    draws it on the image in place.

    Args:
        image: PIL image uploaded through the Gradio UI.

    Returns:
        tuple: ``(annotated image or None on error, model output text or an
        error message)`` — always a 2-tuple, matching the interface's two
        outputs (``"image"``, ``"text"``).
    """
    try:
        # Serialize the uploaded image to base64-encoded PNG for the payload.
        buf = BytesIO()
        image.save(buf, format="PNG")
        img_b64 = base64.b64encode(buf.getvalue()).decode("utf-8")

        messages = [
            {
                'role': 'system',
                'content': [{'type': 'text', 'text': "You are a Vision Language Model specialized in product images. Detect nutrition tables."}]
            },
            {
                'role': 'user',
                'content': [
                    {
                        'type': 'image',
                        'image': img_b64,
                    },
                    {
                        'type': 'text',
                        'text': "Detect the bounding box of the nutrition table."
                    }
                ]
            }
        ]

        # Render the messages into the raw prompt string the server expects.
        chat_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        payload = {
            "text_input": chat_text,
            "image": img_b64,
            "parameters": {
                "stream": False,
                "temperature": 0,
                "max_tokens": 2048
            }
        }

        # BUGFIX: added a timeout so a hung Triton server cannot block the UI
        # worker forever, and raise_for_status() so HTTP error responses
        # surface as exceptions instead of being parsed as (broken) JSON.
        response = requests.post(url, json=payload, timeout=120)
        response.raise_for_status()
        resp_json = response.json()
        output_text = resp_json.get("text_output", "")

        # Keep only the assistant turn if the server echoed the full prompt.
        if "<|im_start|>assistant\n" in output_text:
            output_text = output_text.rsplit("<|im_start|>assistant\n", 1)[-1]

        # The model emits the box as "(x1,y1),(x2,y2)"; coordinates are
        # treated as normalized to 0-1000 (see the scaling below).
        match = re.search(r"\((\d+),(\d+)\),\((\d+),(\d+)\)", output_text)
        if match:
            x1, y1, x2, y2 = map(int, match.groups())
            draw = ImageDraw.Draw(image)
            w, h = image.size
            # Scale normalized coordinates to pixels before drawing.
            draw.rectangle((x1 / 1000 * w, y1 / 1000 * h, x2 / 1000 * w, y2 / 1000 * h), outline="green", width=10)

        return image, output_text

    except Exception as e:
        # BUGFIX: the original returned a bare string here, but the Gradio
        # interface declares two outputs ("image", "text"); a single return
        # value would itself crash the callback. Return a 2-tuple so errors
        # are shown in the text output instead.
        return None, f"Error: {e}"
|
|
|
|
|
|
|
|
# Build and launch the Gradio UI: one PIL image input, two outputs (the
# annotated image and the raw model text). Binding to 0.0.0.0 exposes the
# app on all interfaces (e.g. for Docker port mapping).
gr.Interface(
    fn=ask_triton,
    inputs=[
        gr.Image(type="pil")
    ],
    outputs=["image", "text"],
    title="Nutrition Table Detection",
    # BUGFIX: corrected grammar in the user-facing description string
    # ("upload image ... to visualizes" -> "upload an image ... to visualize").
    description="Please upload an image containing a nutrition table to visualize the bounding box prediction."
).launch(server_name="0.0.0.0", server_port=7860)
|
|
|
|
|
|