# Build / app.py
# ManishThota's picture
# Update app.py
# e7d06c3 verified
import gradio as gr
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import cv2
import numpy as np
import ast
# Hub repository id used for both the model weights and the tokenizer.
MODEL_ID = "ManishThota/SparrowVQE"

# Target device for the prompt tensors: CUDA when torch reports it, else CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# Load the model in half precision with automatic device placement, then the
# matching tokenizer. Both loads enable trust_remote_code.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
def video_to_frames(video, fps=1):
    """Sample frames from a video and return them as PNG-encoded bytes.

    Args:
        video: Source handed to ``cv2.VideoCapture`` (here, the path of the
            uploaded video file).
        fps: Desired number of sampled frames per second of video. Must be
            positive.

    Returns:
        A list of ``bytes`` objects, one PNG-encoded image per sampled frame.
        The list is empty when the video cannot be opened.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError(f"fps must be positive, got {fps!r}")

    frames_png = []
    cap = cv2.VideoCapture(video)
    try:
        if not cap.isOpened():
            print("Error opening video file")
            return frames_png

        # Keep every ``frame_interval``-th frame. The interval is clamped to
        # at least 1: the integer division yields 0 when the reported frame
        # rate is 0 or lower than ``fps``, which would otherwise make the
        # modulo below raise ZeroDivisionError. A NaN/inf frame rate cannot
        # be converted by int(), so fall back to keeping every frame.
        try:
            frame_interval = max(1, int(cap.get(cv2.CAP_PROP_FPS)) // fps)
        except (ValueError, OverflowError):
            frame_interval = 1

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break
            if frame_count % frame_interval == 0:
                is_success, buffer = cv2.imencode(".png", frame)
                if is_success:
                    # asarray avoids copying the encoded buffer a second time.
                    frames_png.append(np.asarray(buffer).tobytes())
            frame_count += 1
    finally:
        # Release the capture even if reading or encoding raises.
        cap.release()
    return frames_png
def extract_frames(frame):
    """Decode one encoded frame into an RGB image array.

    Args:
        frame: Encoded image data as a bytes-like object, such as one entry
            of the list produced by ``video_to_frames``.

    Returns:
        The decoded colour image as a NumPy array with its channels in RGB
        order.

    Raises:
        ValueError: If the data cannot be decoded as an image.
    """
    # View the raw bytes as a flat uint8 array, the input imdecode expects.
    frame_np = np.frombuffer(frame, dtype=np.uint8)
    # OpenCV decodes colour images in BGR channel order.
    image_bgr = cv2.imdecode(frame_np, flags=cv2.IMREAD_COLOR)
    if image_bgr is None:
        # imdecode signals invalid data by returning None rather than raising.
        raise ValueError("Could not decode frame data as an image")
    # Swap the first and last channels so the result is RGB. This is the same
    # channel swap the previous COLOR_RGB2BGR call performed.
    return cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
def _generate_answer(input_ids, image_tensor):
    """Run one generation pass and return the decoded text after the prompt."""
    output_ids = model.generate(
        input_ids,
        max_new_tokens=25,
        images=image_tensor,
        use_cache=True)[0]
    # Drop the first ``input_ids.shape[1]`` tokens so that only the tokens
    # following the prompt are decoded.
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()


def predict_answer(image, video, question):
    """Answer ``question`` about an uploaded image or, failing that, a video.

    The image takes precedence when both inputs are supplied. For a video,
    the answer is produced from the first sampled frame.

    Args:
        image: PIL image (it must provide ``convert``), or ``None``.
        video: Video source passed to ``video_to_frames``, or ``None``.
        question: Question text inserted into the chat prompt.

    Returns:
        For an image, the generated answer string. For a video, the answer
        parsed with ``ast.literal_eval`` when it is a valid Python literal,
        otherwise the raw answer string; a message string when no frame
        could be read. When neither input is given, an "unsupported" message.
    """
    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)

    if image is not None:
        # Process as an image.
        image = image.convert("RGB")
        return _generate_answer(input_ids, model.image_preprocess(image))

    if video is not None:
        # Process as a video.
        frames = video_to_frames(video)
        if not frames:
            # Nothing was decoded (unreadable or empty video); indexing the
            # first answer would otherwise raise IndexError.
            return "Could not read any frames from the video."

        # Only the first frame's answer is ever returned, so generating for
        # the remaining frames would be discarded work.
        # NOTE(review): the frame is passed as a one-element list, unlike the
        # image branch — confirm this matches what image_preprocess expects.
        first_frame = extract_frames(frames[0])
        answer = _generate_answer(input_ids, model.image_preprocess([first_frame]))
        try:
            return ast.literal_eval(answer)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Free-text answers are not Python literals; return them as-is
            # instead of failing the request.
            return answer

    return "Unsupported file type. Please upload an image or video."
def gradio_predict(image, video, question):
    """Gradio click handler: pass the inputs straight to ``predict_answer``.

    Args:
        image: Value of the image input component.
        video: Value of the video input component.
        question: Text of the question box.

    Returns:
        Whatever ``predict_answer`` returns for these inputs.
    """
    return predict_answer(image, video, question)
# Custom stylesheet passed to gr.Blocks(css=...).
#   #container: block element at 60% width with automatic left/right margins;
#               matches the elem_id of the first row of the layout.
#   #intro:     full-width, centre-aligned text block (no element in this file
#               uses this id).
css = """
#container{
display: block;
margin-left: auto;
margin-right: auto;
width: 60%;
}
#intro{
max-width: 100%;
margin: 0 auto;
text-align: center;
}
"""
# Introductory Markdown rendered in the first row of the page.
INTRO_MARKDOWN = """
## This Gradio app serves as four folds:
### 1. My ability and experience to design a customizable Gradio application with Interface/Blocks structure.
### 2. One of my Multimodel Vision-Language model's capabilities with the LLaVA framework.
### 3. Demo for annotating random images and 4 second videos provided at Notion (https://shorturl.at/givyC)
### 4. Ability to integrate a Large Language Model and Vision Encoder
"""

# Page layout: a header row, a row with the video and image inputs, and a row
# holding the question box and button next to the answer area.
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation was lost — confirm which components belong to which
# row/column.
with gr.Blocks(css=css) as app:
    with gr.Row(elem_id="container"):
        gr.Image("gsoc_redhen.png", min_width=60, label="GSOC 2024")
        gr.Markdown(INTRO_MARKDOWN)
    with gr.Row():
        video = gr.Video(label="Upload your video here")
        image = gr.Image(type="pil", label="Upload or Drag an Image")
    with gr.Row():
        with gr.Column():
            # ``lines`` is a count of text rows, so it must be an integer
            # (it was previously the float 4.3).
            question = gr.Textbox(label="Question", placeholder="Annotate prompt", lines=4)
            btn = gr.Button("Annotate")
        with gr.Column():
            answer = gr.TextArea(label="Answer")
    # Input order matches the (image, video, question) signature of the handler.
    btn.click(gradio_predict, inputs=[image, video, question], outputs=answer)

app.launch(debug=True)