Spaces:
Paused
Paused
File size: 8,231 Bytes
7ec133b 9db455c 85e7ead 69344b8 7ec133b c52e238 fb293c4 c52e238 7ec133b f8dcf83 a4115fd f8dcf83 7ec133b 9db455c 5c72980 b499d7f 7294f1e b499d7f 7294f1e b499d7f 7294f1e b499d7f 7294f1e 9db455c b499d7f 3f30162 feb8185 fa7747b 3f30162 0a6288f 9db455c 0a6288f 5c72980 0a6288f 9db455c 0a6288f fa7747b 3f30162 0a6288f bda5bd0 9309155 bda5bd0 d0246f4 0a6288f 5c72980 0a6288f 7bf6cc7 f6b64ee bda5bd0 69344b8 bda5bd0 69344b8 bda5bd0 69344b8 bda5bd0 69344b8 fa7747b 69344b8 fa7747b b1c675e aa10fdf b1c675e aa10fdf b1c675e aa10fdf b1c675e 82a5278 b1c675e aa10fdf b1c675e aa10fdf b1c675e aa10fdf c5ecbbd b1c675e 82a5278 b1c675e aa10fdf f6b64ee aa10fdf f6b64ee b1c675e aa10fdf 0e737a8 b1c675e c5ecbbd b1c675e f6b64ee b1c675e f6b64ee aa10fdf fa7747b b50f60b 7ec133b 5c72980 e7d06c3 5c72980 8975f30 8501e74 e7d06c3 5c72980 a333293 ee0b20a a333293 5c72980 a333293 5c72980 066538a a333293 aa10fdf 1f932c1 066538a aa10fdf 0327ec6 5c72980 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 |
import gradio as gr
from PIL import Image
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import cv2
import numpy as np
import ast
from collections import Counter
# Pick the GPU when torch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the "ManishThota/SparrowVQE" model in half precision and let
# device_map="auto" decide where its weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    "ManishThota/SparrowVQE",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "ManishThota/SparrowVQE",
    trust_remote_code=True,
)
def video_to_frames(video, fps=1):
    """Sample frames from a video and return them as PNG-encoded bytes.

    Args:
        video: Video source handed to ``cv2.VideoCapture`` (e.g. a file path).
        fps: Approximate number of frames to keep per second of video.
            Must be positive.

    Returns:
        A list of ``bytes`` objects, one PNG-encoded image per kept frame.
        The list is empty when the video cannot be opened.

    Raises:
        ValueError: If ``fps`` is zero or negative.
    """
    if fps <= 0:
        raise ValueError("fps must be a positive number")

    frames_png = []
    cap = cv2.VideoCapture(video)
    try:
        if not cap.isOpened():
            print("Error opening video file")
            return frames_png

        # Keep every ``frame_interval``-th frame. The interval is clamped to 1
        # because a source that reports 0 FPS (or fewer FPS than requested)
        # would otherwise make the modulo below divide by zero.
        source_fps = int(cap.get(cv2.CAP_PROP_FPS))
        frame_interval = max(1, int(source_fps // fps))

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break
            if frame_count % frame_interval == 0:
                is_success, buffer = cv2.imencode(".png", frame)
                if is_success:
                    frames_png.append(buffer.tobytes())
            frame_count += 1
    finally:
        # Release the capture even if reading or encoding raised.
        cap.release()
    return frames_png
def extract_frames(frame):
    """Decode one encoded frame (PNG bytes) into an RGB image array.

    Args:
        frame: Encoded image bytes, e.g. one element of the list returned by
            ``video_to_frames``.

    Returns:
        The decoded image as a NumPy array with its channels in RGB order.

    Raises:
        ValueError: If ``frame`` is empty or cannot be decoded as an image.
    """
    # View the raw bytes as a uint8 buffer for OpenCV.
    frame_np = np.frombuffer(frame, dtype=np.uint8)
    if frame_np.size == 0:
        raise ValueError("Cannot decode an empty frame")

    # cv2.imdecode produces BGR channel order and returns None on failure.
    image_bgr = cv2.imdecode(frame_np, flags=cv2.IMREAD_COLOR)
    if image_bgr is None:
        raise ValueError("Could not decode frame as an image")

    # Swap the first and third channels so the result is RGB.
    image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
    return image_rgb
# Index of the sampled video frame that is shown to the model.
_VIDEO_FRAME_INDEX = 2


def _generate_answer(input_ids, image_tensor):
    """Run generation for one prompt/image pair and return the decoded answer text."""
    output_ids = model.generate(
        input_ids,
        max_new_tokens=25,
        images=image_tensor,
        use_cache=True)[0]
    # Drop the prompt tokens so only the newly generated tokens are decoded.
    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()


def predict_answer(video, image, question):
    """Answer ``question`` about an uploaded image or, failing that, a video.

    The image takes precedence when both inputs are supplied. For a video, a
    single sampled frame is used: the frame at index ``_VIDEO_FRAME_INDEX``,
    or the last sampled frame when the clip is too short to have one.

    Args:
        video: Video source passed to ``video_to_frames``, or a falsy value
            when no video was supplied.
        image: Image object exposing ``convert("RGB")``, or ``None``.
        question: Prompt text; a "?" is appended to it in the chat template.

    Returns:
        The model's decoded answer, or an explanatory message when no input
        was supplied or no frame could be read from the video.
    """
    # Validate the inputs before doing any tokenizer or model work.
    if image is None and not video:
        return "Unsupported file type. Please upload an image or video."

    if image is not None:
        image_tensor = model.image_preprocess(image.convert("RGB"))
    else:
        frames = video_to_frames(video)
        if not frames:
            return "Could not read any frames from the video."
        # Short clips may yield fewer sampled frames than the preferred index,
        # so clamp instead of raising IndexError.
        chosen = frames[min(_VIDEO_FRAME_INDEX, len(frames) - 1)]
        image_tensor = model.image_preprocess([extract_frames(chosen)])

    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)
    return _generate_answer(input_ids, image_tensor)
# # Process as a video
# frames = video_to_frames(video)
# answers = []
# for frame in frames:
# image = extract_frames(frame)
# image_tensor = model.image_preprocess([image])
# # Generate the answer
# output_ids = model.generate(
# input_ids,
# max_new_tokens=25,
# images=image_tensor,
# use_cache=True)[0]
# answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
# answers.append(answer)
# # Modify this logic based on your specific needs
# most_common_answer = Counter(answers).most_common(1)[0][0]
# # Safely evaluate the most common answer assuming it's a string representation of a Python literal
# try:
# evaluated_answer = ast.literal_eval(most_common_answer)
# except (ValueError, SyntaxError):
# # Handle malformed answer string
# evaluated_answer = f"Error evaluating answer: {most_common_answer}"
# return evaluated_answer
# return ast.literal_eval(answers[0])
# else:
# return "Unsupported file type. Please upload an image or video."
# Example prompt paired with Images/cat_dog.jpeg in test_examples: asks for a
# dictionary keyed by "Cat", "Dog" and "Horse" with boolean values.
# NOTE: the "promt" spelling is kept because test_examples refers to these names.
promt_cat_dog = """
Annotate this image with this schema:
{
“description”: “Is there a cat in the image?”,
“value”: “Cat”
},
{
“description”: “Is there a dog in the image?”,
“value”: “Dog”,
},
{
“description”: “Is there a horse in the image?”,
“value”: “Horse”,
},
provide me the answers as a dictionary with key as the string value of the variable value on top and its value should be boolean value
"""
# Example prompt paired with Images/bus_people.jpeg in test_examples: asks for
# a dictionary keyed by "Bus" and "Bike" with boolean values.
promt_bus_people = """
Annotate this image with this schema:
{
“description”: “Is there a bus in the image?”,
“value”: “Bus”,
},
{
“description”: “Is there a bike in the image?”,
“value”: “Bike”,
},
provide me the answers as a dictionary with key as the string value of the variable value on top and its value should be boolean value
"""
# promt_video = """
# Annotate this image with this schema:
# {
# “description”: “Is the person standing?”,
# “value”: “standing”,
# },
# {
# “description”: “Is the person's hands free?”,
# “value”: “Hands-Free”,
# },
# provide me the answers as a dictionary with key as the string value of the variable value on top and its value should be boolean value
# """
# Example prompt used for the video rows in test_examples: asks whether the
# person is standing and whether their hands are free, answered as a
# dictionary of 'true'/'false' values.
promt_video = """
Annotate this image by indicating the presence or absence of specific objects.
{
“description”: “Is the person standing?”,
“value”: “standing”,
},
{
“description”: “Is the person's hands free?”,
“value”: “Hands-Free”,
},
Provide your answers as a dictionary with the object type as the key and a boolean value indicating its presence in the image, Use 'true' for objects present in the image and 'false' for objects not present.
"""
# Example rows offered in the UI. Each row is [video, image, question], and
# exactly one of the two media slots is filled.
test_examples = [
    [None, "Images/cat_dog.jpeg", promt_cat_dog],
    [None, "Images/bus_people.jpeg", promt_bus_people],
    ["videos/v2.mp4", None, promt_video],
    ["videos/v3.mp4", None, promt_video],
]
def gradio_predict(video, image, question):
    """Gradio callback: pass the three UI inputs to predict_answer and return its result."""
    return predict_answer(video, image, question)
# Custom stylesheet handed to gr.Blocks: "#container" is a centered block at
# 60% width, "#intro" centers its text.
css = """
#container{
display: block;
margin-left: auto;
margin-right: auto;
width: 60%;
}
#intro{
max-width: 100%;
margin: 0 auto;
text-align: center;
}
"""
# NOTE(review): the nesting below is reconstructed from the component order;
# confirm the placement of the Markdown block inside the header row.
with gr.Blocks(css=css) as app:
    # Header row: logo image followed by the introduction text.
    with gr.Row(elem_id="container"):
        gr.Image("gsoc_redhen.png", min_width=60, label="GSOC 2024")
        gr.Markdown("""
## This Gradio app serves as four folds:
### 1. My ability and experience to design a customizable Gradio application with Interface/Blocks structure.
### 2. One of my Multimodel Vision-Language model's capabilities with the LLaVA framework.
### 3. Demo for annotating random images and 4 second videos provided at Notion (https://shorturl.at/givyC)
### 4. Ability to integrate a Large Language Model and Vision Encoder
""")

    # Media inputs: a video and an image side by side.
    with gr.Row():
        video = gr.Video(label="Video")
        image = gr.Image(type="pil", label="Image")

    # Prompt and trigger button on the left, model answer on the right.
    with gr.Row():
        with gr.Column():
            question = gr.Textbox(
                label="Question",
                placeholder="Annotate prompt",
                lines=4.3,
            )
            btn = gr.Button("Annotate")
        with gr.Column():
            answer = gr.TextArea(label="Answer")

    # Clicking the button runs the prediction on the current inputs.
    btn.click(
        gradio_predict,
        inputs=[video, image, question],
        outputs=answer,
    )

    # Clickable examples; their outputs are cached.
    gr.Examples(
        examples=test_examples,
        inputs=[video, image, question],
        outputs=answer,
        fn=gradio_predict,
        cache_examples=True,
    )

app.launch(debug=True)
|