import io
import math
import os
import tempfile

import cv2
import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize the model and tokenizer once at import time.
# NOTE(review): trust_remote_code=True executes code shipped with the model
# repository; keep the model id pinned to a source you trust.
MODEL_ID = "ManishThota/SparrowVQE"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)


def _sample_frames(video_path, max_frames):
    """Read up to ``max_frames`` frames from ``video_path``, about one per second."""
    capture = cv2.VideoCapture(video_path)
    frames = []
    try:
        fps = capture.get(cv2.CAP_PROP_FPS)
        # Some containers report 0 (or NaN) FPS. Always advance by at least
        # one frame, otherwise the sampling loop would never make progress.
        step = int(fps) if math.isfinite(fps) and fps >= 1 else 1

        success, frame = capture.read()
        while success:
            frames.append(frame)
            if len(frames) >= max_frames:
                # Enough frames collected; do not decode the rest of the video.
                break
            # Skip ahead ``step`` frames; the last one read is the next sample.
            for _ in range(step):
                success, frame = capture.read()
                if not success:
                    break
    finally:
        # Release the decoder even if reading raised.
        capture.release()
    return frames


def process_video(video_bytes, max_frames=4):
    """Extract frames from the video, 1 per second.

    Args:
        video_bytes: Either a filesystem path to the video (``str`` or
            ``os.PathLike``) or the raw encoded video as a bytes-like object.
            The parameter name is kept for backward compatibility.
        max_frames: Maximum number of frames to return (default 4).

    Returns:
        A list of at most ``max_frames`` frames as returned by
        ``cv2.VideoCapture.read`` (OpenCV delivers these in BGR channel
        order). The list is empty when the video cannot be read.
    """
    if isinstance(video_bytes, (bytes, bytearray, memoryview)):
        # OpenCV opens files or devices, not in-memory buffers, so spill the
        # data to a temporary file first.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
            tmp.write(video_bytes)
        try:
            return _sample_frames(tmp.name, max_frames)
        finally:
            os.remove(tmp.name)
    return _sample_frames(os.fspath(video_bytes), max_frames)


def _answer_for_image(image, question, max_tokens):
    """Run the model on one RGB PIL image and return the decoded answer."""
    input_ids = tokenizer(question, return_tensors="pt").input_ids.to(device)
    # image_preprocess comes from the model's remote code; its output is
    # passed straight through to generate() via the ``images`` argument.
    image_tensor = model.image_preprocess(image)

    output_ids = model.generate(
        input_ids,
        # UI sliders may deliver a float; generate() needs an integer count.
        max_new_tokens=int(max_tokens),
        images=image_tensor,
        use_cache=True,
    )[0]
    # Drop the prompt tokens so only the newly generated text is decoded.
    generated = output_ids[input_ids.shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()


def predict_answer(image, video, question, max_tokens=100):
    """Answer ``question`` about an image or about sampled frames of a video.

    Args:
        image: A PIL image, or ``None`` when no image was supplied.
        video: A video path (or raw video bytes), or ``None``/empty when no
            video was supplied. Ignored when ``image`` is given.
        question: The text prompt passed to the tokenizer.
        max_tokens: Upper bound on newly generated tokens per answer.

    Returns:
        The model's answer for an image; for a video, one answer per sampled
        frame joined by newlines; otherwise a message asking for an upload.
    """
    if image is not None:
        return _answer_for_image(image.convert("RGB"), question, max_tokens)

    if video:
        frames = process_video(video)
        if not frames:
            return "Could not read any frames from the video."
        answers = []
        for frame in frames:
            # OpenCV frames are BGR NumPy arrays; convert to an RGB PIL image.
            rgb_frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            answers.append(_answer_for_image(rgb_frame, question, max_tokens))
        return "\n".join(answers)

    return "Unsupported file type. Please upload an image or video."
def gradio_predict(image, video, question, max_tokens):
    """Gradio callback: forward the UI inputs to ``predict_answer``."""
    return predict_answer(image, video, question, max_tokens)


# Input widgets, in the positional order ``gradio_predict`` receives them.
_input_components = [
    gr.Image(type="pil", label="Upload or Drag an Image"),
    gr.Video(label="upload your video here"),
    gr.Textbox(
        label="Question",
        placeholder="e.g. Can you explain the slide?",
        scale=4,
    ),
    gr.Slider(
        2,
        500,
        value=25,
        label="Token Count",
        info="Choose between 2 and 500",
    ),
]

# Single text area that displays the model's answer.
_answer_box = gr.TextArea(label="Answer")

# Define the Gradio interface.
iface = gr.Interface(
    fn=gradio_predict,
    inputs=_input_components,
    outputs=_answer_box,
    # examples=examples,
    title="Super Rapid Annotator - Multimodal vision tool to annotate videos with LLaVA framework",
    # description="An interactive chat model that can answer questions about images in an Academic context. \n We can input images, and the system will analyze them to provide information about their contents. I've utilized this capability by feeding slides from PowerPoint presentations used in classes and the lecture content passed as text. Consequently, the model now mimics the behavior and responses of my professors. So, if I present any PowerPoint slide, it explains it just like my professor would, further it can be personalized.",
)

# Launch the app with request queueing enabled.
iface.queue().launch(debug=True)