# Build / app.py
# ManishThota's picture
# Update app.py
# f303a7a verified
import ast
from pathlib import Path

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer
# Pick the device that prompt tensors are moved to: CUDA when torch reports
# it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the SparrowVQE checkpoint in half precision. `device_map="auto"` leaves
# weight placement to the loader, and `trust_remote_code=True` runs the
# modelling code shipped inside the checkpoint repository.
model = AutoModelForCausalLM.from_pretrained(
    "ManishThota/SparrowVQE",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(
    "ManishThota/SparrowVQE",
    trust_remote_code=True,
)
def video_to_frames(video, fps=1):
    """Sample frames from a video and return them as PNG-encoded bytes.

    Args:
        video: Video source handed to ``cv2.VideoCapture`` (e.g. a file path).
        fps: Desired number of sampled frames per second of video. Must be
            greater than zero.

    Returns:
        A list of ``bytes`` objects, one PNG-encoded image per sampled frame.
        The list is empty when the video cannot be opened.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError("fps must be a positive number")

    frames_png = []
    cap = cv2.VideoCapture(video)
    try:
        if not cap.isOpened():
            print("Error opening video file")
            return frames_png

        # Keep every `frame_interval`-th frame. The container may report an
        # FPS of 0 (or one lower than `fps`), which would make the interval 0
        # and crash the modulo below, so clamp it to at least 1.
        native_fps = int(cap.get(cv2.CAP_PROP_FPS))
        frame_interval = max(1, native_fps // fps)

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break
            if frame_count % frame_interval == 0:
                is_success, buffer = cv2.imencode(".png", frame)
                if is_success:
                    frames_png.append(buffer.tobytes())
            frame_count += 1
    finally:
        # Always free the capture handle, even if reading or encoding raised.
        cap.release()
    return frames_png
def extract_frames(frame):
    """Decode one encoded frame into an image array with RGB channel order.

    Args:
        frame: Encoded image data (e.g. PNG) as a bytes-like object.

    Returns:
        The decoded image as a NumPy array whose channels are in RGB order.

    Raises:
        ValueError: If ``frame`` cannot be decoded as an image.
    """
    # View the raw bytes as a flat uint8 array, which is what imdecode expects.
    frame_np = np.frombuffer(frame, dtype=np.uint8)
    # With IMREAD_COLOR, OpenCV decodes into BGR channel order.
    image_bgr = cv2.imdecode(frame_np, flags=cv2.IMREAD_COLOR)
    if image_bgr is None:
        raise ValueError("Could not decode frame as an image")
    # Swap the first and third channels so the result is RGB. This is the same
    # channel swap the previous COLOR_RGB2BGR call performed; only the name
    # now matches what actually happens.
    return cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
def _generate_answer(input_ids, image_tensor):
    """Run one generation pass and decode only the newly generated tokens."""
    output_ids = model.generate(
        input_ids,
        max_new_tokens=25,
        images=image_tensor,
        use_cache=True)[0]
    # Drop the prompt tokens so that only the model's reply is decoded.
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()


def _parse_answer(answer):
    """Return ``answer`` as a Python literal when it is one, else the raw text."""
    try:
        return ast.literal_eval(answer)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Free-form text is not a Python literal; hand it back unchanged
        # instead of crashing the request.
        return answer


def predict_answer(image, video, question):
    """Answer ``question`` about an uploaded image or video.

    An image takes precedence when both an image and a video are supplied.

    Args:
        image: A PIL image, or ``None`` when no image was uploaded.
        video: A video source accepted by ``video_to_frames``, or ``None``.
        question: The user's question; a "?" is appended to it in the prompt.

    Returns:
        For an image, the decoded answer string. For a video, the answer for
        the first sampled frame, parsed as a Python literal when possible and
        returned as plain text otherwise. A short message string is returned
        when neither input is given or no frame could be read from the video.
    """
    # Reject missing input before doing any tokenization or model work.
    if image is None and video is None:
        return "Unsupported file type. Please upload an image or video."

    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)

    if image is not None:
        # Process as an image
        image_tensor = model.image_preprocess(image.convert("RGB"))
        return _generate_answer(input_ids, image_tensor)

    # Process as a video
    frames = video_to_frames(video)
    if not frames:
        return "Could not read any frames from the video."

    # NOTE(review): every sampled frame is answered, but only the first
    # answer is returned - confirm whether the answers should be aggregated.
    answers = [
        _generate_answer(input_ids, model.image_preprocess([extract_frames(frame)]))
        for frame in frames
    ]
    return _parse_answer(answers[0])
def gradio_predict(image, video, question):
    """Click handler for the UI: forward the inputs to ``predict_answer``.

    Returns whatever ``predict_answer`` returns for the given image, video
    and question.
    """
    return predict_answer(image, video, question)
# Custom CSS handed to gr.Blocks: rules for the #container and #intro ids.
css = """
#container{
display: block;
margin-left: auto;
margin-right: auto;
width: 50%;
}
#intro{
max-width: 100%;
margin: 0 auto;
text-align: center;
}
"""

# Absolute path of the banner image, resolved against the working directory.
# It is the only local file the server is explicitly allowed to serve.
LOGO_PATH = Path("gsoc_redhen.png").resolve()

# Build the page: banner, intro text, the two upload widgets, then the
# question/answer row wired to the prediction handler.
with gr.Blocks(css=css) as app:
    gr.Image(str(LOGO_PATH))
    gr.Markdown("""
## This Gradio app serves as four folds:
### 1. My ability and experience to design a customizable Gradio application with Interface/Blocks structure.
### 2. One of my Multimodel Vision-Language model's capabilities with the LLaVA framework.
### 3. Demo for annotating random images and 4 second videos provided at Notion (https://shorturl.at/givyC)
### 4. Ability to integrate a Large Language Model and Vision Encoder
""")
    with gr.Row():
        video = gr.Video(label="Upload your video here")
        image = gr.Image(type="pil", label="Upload or Drag an Image")
    with gr.Row():
        with gr.Column():
            # `lines` is a row count, so it must be a whole number.
            question = gr.Textbox(label="Question", placeholder="Annotate prompt", lines=4)
            btn = gr.Button("Annotate")
        with gr.Column():
            answer = gr.TextArea(label="Answer")
    # Event listeners have to be registered inside the Blocks context.
    btn.click(gradio_predict, inputs=[image, video, question], outputs=answer)

# SECURITY: allowed_paths was ["/"], which lets the server hand out any file
# on the machine. Only the banner image is whitelisted now.
app.launch(debug=True, allowed_paths=[str(LOGO_PATH)])