"""Gradio demo: annotate an image or a video frame with the SparrowVQE model."""

import ast
from collections import Counter
from io import BytesIO, StringIO

import cv2
import gradio as gr
import numpy as np
import pandas as pd
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Ensure GPU usage if available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "ManishThota/SparrowVQE",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "ManishThota/SparrowVQE", trust_remote_code=True
)

# Index of the sampled frame used to answer questions about a video.
_VIDEO_FRAME_INDEX = 2


def video_to_frames(video, fps=1):
    """Sample frames from a video file and return them as PNG-encoded bytes.

    Args:
        video: Path of the video file, handed to ``cv2.VideoCapture``.
        fps: How many frames to keep per second of video. Must be positive.

    Returns:
        A list of ``bytes`` objects, one PNG image per sampled frame. The
        list is empty when the video cannot be opened.

    Raises:
        ValueError: If ``fps`` is not positive.
    """
    if fps <= 0:
        raise ValueError("fps must be a positive number")

    frames_png = []
    cap = cv2.VideoCapture(video)
    try:
        if not cap.isOpened():
            print("Error opening video file")
            return frames_png

        # Keep every Nth frame. Clamp to 1: a reported FPS of 0 or a requested
        # rate above the native one would otherwise give an interval of 0 and
        # make the modulo below divide by zero.
        native_fps = int(cap.get(cv2.CAP_PROP_FPS))
        frame_interval = max(1, int(native_fps // fps))

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break
            if frame_count % frame_interval == 0:
                is_success, buffer = cv2.imencode(".png", frame)
                if is_success:
                    frames_png.append(np.asarray(buffer).tobytes())
            frame_count += 1
    finally:
        # Release the capture handle even if reading or encoding fails.
        cap.release()
    return frames_png


def extract_frames(frame):
    """Decode one PNG-encoded frame into an RGB pixel array.

    Args:
        frame: PNG image data as ``bytes`` (the element type produced by
            ``video_to_frames``).

    Returns:
        The decoded image as a NumPy array with channels in RGB order.

    Raises:
        ValueError: If the data cannot be decoded as an image.
    """
    # Convert binary data to a numpy array
    frame_np = np.frombuffer(frame, dtype=np.uint8)
    # cv2.imdecode returns colour images in BGR channel order.
    image_bgr = cv2.imdecode(frame_np, flags=cv2.IMREAD_COLOR)
    if image_bgr is None:
        raise ValueError("Could not decode frame data as an image")
    # Swap the channel order so the caller receives RGB.
    return cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)


def _generate_answer(input_ids, image_tensor):
    """Generate up to 25 new tokens for the prompt and image; return the text."""
    output_ids = model.generate(
        input_ids,
        max_new_tokens=25,
        images=image_tensor,
        use_cache=True,
    )[0]
    # Skip the first ``input_ids.shape[1]`` positions so that only the tokens
    # after the prompt are decoded.
    new_tokens = output_ids[input_ids.shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


def predict_answer(video, image, question):
    """Answer ``question`` about an image or, failing that, a video frame.

    The image takes precedence when both inputs are supplied. For a video,
    frames are sampled with ``video_to_frames`` and a single frame is used:
    the third sample, or the last one when the clip yields fewer.

    Args:
        video: Path of a video file, or a falsy value when no video is given.
        image: An image object providing ``convert("RGB")``, or ``None``.
        question: The prompt text inserted into the chat template.

    Returns:
        The model's answer as a string, or an explanatory message when no
        usable input was provided.
    """
    # NOTE(review): LLaVA-style prompts usually carry an image placeholder
    # token after "USER:"; confirm that none is missing here.
    text = (
        "A chat between a curious user and an artificial intelligence assistant. "
        "The assistant gives helpful, detailed, and polite answers to the user's "
        f"questions. USER: \n{question}? ASSISTANT:"
    )
    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)

    if image is not None:
        # Process as an image
        image_tensor = model.image_preprocess(image.convert("RGB"))
        return _generate_answer(input_ids, image_tensor)

    if video:
        frames = video_to_frames(video)
        if not frames:
            return "Could not read any frames from the video."
        # Short clips may yield fewer samples than the preferred index.
        chosen = frames[min(_VIDEO_FRAME_INDEX, len(frames) - 1)]
        image_tensor = model.image_preprocess([extract_frames(chosen)])
        # TODO: only one frame is inspected; aggregating the answers of all
        # sampled frames (e.g. a majority vote) is not implemented.
        return _generate_answer(input_ids, image_tensor)

    return "Unsupported file type. Please upload an image or video."
# Prompt texts are sent to the model verbatim (including the typographic
# quotes and trailing commas), so only their line wrapping is cosmetic.
_PROMPT_HEADER = " Annotate this image with this schema: "
_PROMPT_FOOTER = (
    "provide me the answers as a dictionary with key as the string value "
    "of the variable value on top and its value should be boolean value"
)

promt_cat_dog = (
    _PROMPT_HEADER
    + "{ “description”: “Is there a cat in the image?”, “value”: “Cat” }, "
    + "{ “description”: “Is there a dog in the image?”, “value”: “Dog”, }, "
    + "{ “description”: “Is there a horse in the image?”, “value”: “Horse”, }, "
    + _PROMPT_FOOTER
    + " "
)

promt_bus_people = (
    _PROMPT_HEADER
    + "{ “description”: “Is there a bus in the image?”, “value”: “Bus”, }, "
    + "{ “description”: “Is there a bike in the image?”, “value”: “Bike”, }, "
    + _PROMPT_FOOTER
    + " "
)

promt_video = (
    _PROMPT_HEADER
    + "{ “description”: “Is there a person standing in the image?”, "
    + "“value”: “standing”, }, "
    + "{ “description”: “Is the person's hands free in the image?”, "
    + "“value”: “hands-free”, }, "
    + "{ “description”: “Is it indoors?”, “value”: “Indoors” }, "
    + _PROMPT_FOOTER
    + ". "
)

# Each example row is [video, image, question], matching the Gradio inputs.
test_examples = [
    [None, "Images/cat_dog.jpeg", promt_cat_dog],
    [None, "Images/bus_people.jpeg", promt_bus_people],
    ["videos/v1_new.mp4", None, promt_video],
    ["videos/v3.mp4", None, promt_video],
]


def gradio_predict(video, image, question):
    """Gradio callback: forward the three inputs to ``predict_answer``."""
    answer = predict_answer(video, image, question)
    return answer


def _parse_annotation_text(text):
    """Return the dict encoded in ``text``, or ``None`` if it is not a dict."""
    import ast
    import json

    candidate = text.strip()
    try:
        parsed = json.loads(candidate)
    except (ValueError, RecursionError):
        # Not JSON; the answer may still be a Python dict literal
        # (single quotes, True/False).
        try:
            parsed = ast.literal_eval(candidate)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None
    return parsed if isinstance(parsed, dict) else None


def convert_and_save(data):
    """Write an annotation answer to a CSV file and return the file's path.

    Args:
        data: A dict of annotations, or the answer text. Text that is a JSON
            object or a Python dict literal is parsed so that each key
            becomes a CSV column; any other text is stored in a single
            ``answer`` column.

    Returns:
        Path (``str``) of the written ``annotations.csv``. ``gr.File``
        outputs take a file path, not in-memory CSV text.
    """
    import tempfile
    from pathlib import Path

    record = _parse_annotation_text(data) if isinstance(data, str) else data
    if not isinstance(record, dict):
        record = {"answer": "" if data is None else data}

    df = pd.DataFrame([record])
    # A fresh directory keeps the download name stable without clobbering
    # files written by earlier clicks.
    csv_path = Path(tempfile.mkdtemp()) / "annotations.csv"
    df.to_csv(csv_path, index=False)
    return str(csv_path)


css = """
#container{
    display: block;
    margin-left: auto;
    margin-right: auto;
    width: 60%;
}
#intro{
    max-width: 100%;
    margin: 0 auto;
    text-align: center;
}
"""

# One heading per line: Markdown only recognises "#" markers at line start.
_INTRO_MARKDOWN = """
## This Gradio app serves as four folds:
### 1. My ability and experience to design a customizable Gradio application with Interface/Blocks structure.
### 2. One of my Multimodel Vision-Language model's capabilities with the LLaVA framework.
### 3. Demo for annotating random images and 4 second videos provided at Notion (https://shorturl.at/givyC)
### 4. Ability to integrate a Large Language Model and Vision Encoder
"""

with gr.Blocks(css=css) as app:
    with gr.Row(elem_id="container"):
        gr.Image("gsoc_redhen.png", min_width=60, label="GSOC 2024")
        gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        video = gr.Video(label="Video")
        image = gr.Image(type="pil", label="Image")

    with gr.Row():
        with gr.Column():
            # ``lines`` is an integer row count.
            question = gr.Textbox(
                label="Annotate", placeholder="Annotate prompt", lines=4
            )
            btn = gr.Button("Annotate")
        with gr.Column():
            answer = gr.TextArea(label="Answer")
            save_btn = gr.Button("Save as CSV")
            download_link = gr.File(label="Download CSV")

    # Make sure the inputs and outputs match in your click function
    btn.click(gradio_predict, inputs=[video, image, question], outputs=answer)

    # Button to save the answer as CSV
    save_btn.click(fn=convert_and_save, inputs=answer, outputs=download_link)

    gr.Examples(
        examples=test_examples,
        inputs=[video, image, question],
        outputs=answer,
        fn=gradio_predict,
        cache_examples=True,
    )

app.launch(debug=True)