Build / app.py
ManishThota's picture
Update app.py
a333293 verified
raw
history blame
5.49 kB
import gradio as gr
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import cv2
import numpy as np
import ast
# Pick the inference device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the SparrowVQE checkpoint and its tokenizer once, at import time.
# trust_remote_code=True lets transformers run the custom code shipped with
# the model repository.
# NOTE(review): weights are always requested in float16, including when
# `device` is "cpu" - confirm half precision is acceptable on CPU-only hosts.
model = AutoModelForCausalLM.from_pretrained(
    "ManishThota/SparrowVQE",
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(
    "ManishThota/SparrowVQE",
    trust_remote_code=True,
)
def video_to_frames(video, fps=1):
    """Sample frames from a video and return them as PNG-encoded bytes.

    Args:
        video: Source passed straight to ``cv2.VideoCapture`` (typically a
            file path).
        fps: Approximate number of frames to keep per second of video.
            Must be positive.

    Returns:
        A list of ``bytes`` objects, one PNG-encoded image per kept frame.
        The list is empty when the video cannot be opened.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError("fps must be positive")

    frames_png = []
    cap = cv2.VideoCapture(video)
    try:
        if not cap.isOpened():
            print("Error opening video file")
            return frames_png

        # Keep every `frame_interval`-th frame. The reported frame rate can be
        # 0 (unknown) or lower than `fps`; clamp to 1 so the modulo below can
        # never divide by zero - in that case every frame is kept.
        frame_interval = int(cap.get(cv2.CAP_PROP_FPS)) // fps
        if frame_interval < 1:
            frame_interval = 1

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break
            if frame_count % frame_interval == 0:
                is_success, buffer = cv2.imencode(".png", frame)
                if is_success:
                    frames_png.append(buffer.tobytes())
            frame_count += 1
    finally:
        # Always free the capture handle, even if reading/encoding raised.
        cap.release()
    return frames_png
def extract_frames(frame):
    """Decode one encoded frame into an RGB image array.

    Args:
        frame: Encoded image bytes for a single frame (``video_to_frames``
            produces PNG bytes).

    Returns:
        The decoded image as a NumPy array with channels in RGB order.

    Raises:
        ValueError: If the bytes cannot be decoded as an image.
    """
    encoded = np.frombuffer(frame, dtype=np.uint8)
    # OpenCV decodes colour images into BGR channel order, not RGB.
    image_bgr = cv2.imdecode(encoded, flags=cv2.IMREAD_COLOR)
    if image_bgr is None:
        # imdecode signals failure by returning None rather than raising.
        raise ValueError("Could not decode frame as an image")
    # Swap the channel order so the result is RGB. This is the same channel
    # swap the original COLOR_RGB2BGR call performed; only the name is fixed.
    return cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
def _generate_answer(input_ids, image_tensor, max_tokens):
    """Generate a reply for one preprocessed image and return it as text."""
    output_ids = model.generate(
        input_ids,
        max_new_tokens=max_tokens,
        images=image_tensor,
        use_cache=True)[0]
    # The output starts with the prompt tokens; decode only what follows.
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()


def predict_answer(image, video, question, max_tokens=100):
    """Answer a question about an uploaded image or video.

    The image takes precedence when both an image and a video are supplied.

    Args:
        image: PIL image to ask about, or ``None``.
        video: Video source handed to ``video_to_frames``, or ``None``/empty.
        question: The user's question; it is inserted into the chat prompt.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        For an image, the model's answer as a string. For a video, the answer
        for the first sampled frame: parsed with ``ast.literal_eval`` when it
        is a valid Python literal, otherwise the raw answer text. A plain
        message string is returned when neither input is usable or when no
        frame could be read from the video.
    """
    # UI sliders may deliver a float; generation expects an integer count.
    max_tokens = int(max_tokens)

    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)

    if image is not None:
        # Process as an image
        image = image.convert("RGB")
        image_tensor = model.image_preprocess(image)
        return _generate_answer(input_ids, image_tensor, max_tokens)

    if video:
        # Process as a video
        frames = video_to_frames(video)
        if not frames:
            # Previously this fell through to answers[0] and raised IndexError.
            return "Could not read any frames from the video."

        # NOTE(review): every sampled frame is answered but only the first
        # answer is returned, as before - confirm whether the per-frame
        # answers should be surfaced instead.
        answers = []
        for frame in frames:
            frame_image = extract_frames(frame)
            image_tensor = model.image_preprocess([frame_image])
            answers.append(_generate_answer(input_ids, image_tensor, max_tokens))

        first_answer = answers[0]
        try:
            return ast.literal_eval(first_answer)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Free-text answers are not Python literals; return them verbatim
            # instead of crashing the request.
            return first_answer

    return "Unsupported file type. Please upload an image or video."
def gradio_predict(image, video, question, max_tokens):
    """Gradio click handler: forward the UI inputs to ``predict_answer``.

    Returns whatever ``predict_answer`` returns, unchanged.
    """
    return predict_answer(image, video, question, max_tokens)
# iface = gr.Interface(
# fn=gradio_predict,
# inputs=[
# gr.Image(type="pil", label="Upload or Drag an Image"),
# gr.Video(label="Upload your video here"),
# gr.Textbox(label="Question", placeholder="e.g. Can you explain the slide?", scale=4),
# gr.Slider(2, 500, value=25, label="Token Count", info="Choose between 2 and 500")],
# outputs=gr.TextArea(label="Answer"),
# # outputs=gr.Image(label="Output"),
# title="Video/Image Viewer",
# description="Upload an image or video to view it or extract frames from the video.",
# )
# iface.launch(debug=True)
# Build the UI: media inputs on the first row, question/token controls next to
# the answer box on the second row, and a button that triggers prediction.
# NOTE(review): the nesting below is reconstructed; the button is assumed to
# sit directly under the Blocks root - confirm against the deployed layout.
with gr.Blocks() as app:
    gr.Markdown(value="### Upload an Image or Video")

    with gr.Row():
        image = gr.Image(label="Upload or Drag an Image", type="pil")
        video = gr.Video(label="Upload your video here")

    with gr.Row():
        with gr.Column():
            question = gr.Textbox(
                label="Question",
                placeholder="e.g. Can you explain the slide?",
                lines=4,
            )
            tokens = gr.Slider(
                minimum=2,
                maximum=500,
                value=25,
                label="Token Count",
                info="Choose between 2 and 500",
            )
        with gr.Column():
            answer = gr.TextArea(label="Answer")

    btn = gr.Button(value="Predict")
    # Wire the button to the handler; input order matches its parameters.
    btn.click(
        fn=gradio_predict,
        inputs=[image, video, question, tokens],
        outputs=answer,
    )

app.launch(debug=True)