Build

Paused

App Files Files Community

Build / app.py

ManishThota

Update app.py

e7d06c3 verified 12 months ago

raw

history blame

4.86 kB

	import gradio as gr
	from PIL import Image
	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import cv2
	import numpy as np
	import ast


	# Run on the GPU when torch can see one; otherwise fall back to the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"

	# Load the SparrowVQE checkpoint and its matching tokenizer.
	# trust_remote_code=True lets transformers run the modelling code that is
	# shipped inside the model repository.
	# NOTE(review): the weights are requested in float16 regardless of `device`;
	# confirm half precision is acceptable when this runs on CPU.
	model = AutoModelForCausalLM.from_pretrained(
	    "ManishThota/SparrowVQE",
	    trust_remote_code=True,
	    device_map="auto",
	    torch_dtype=torch.float16,
	)
	tokenizer = AutoTokenizer.from_pretrained(
	    "ManishThota/SparrowVQE",
	    trust_remote_code=True,
	)




	def video_to_frames(video, fps=1):
	    """Sample frames from a video and return them as PNG-encoded bytes.

	    Args:
	        video: Path (or any other source accepted by ``cv2.VideoCapture``)
	            of the video to read.
	        fps: Approximate number of frames to keep per second of video.
	            Must be positive.

	    Returns:
	        A list of ``bytes`` objects, one PNG-encoded image per sampled
	        frame. The list is empty when the video cannot be opened.

	    Raises:
	        ValueError: If ``fps`` is not positive.
	    """
	    if fps <= 0:
	        raise ValueError(f"fps must be positive, got {fps!r}")

	    frames_png = []
	    cap = cv2.VideoCapture(video)
	    try:
	        if not cap.isOpened():
	            print("Error opening video file")
	            return frames_png

	        # Keep every ``frame_interval``-th frame. The interval is clamped to
	        # 1 so that a reported frame rate of 0, or a requested ``fps`` above
	        # the video's own rate, cannot produce a zero interval and a
	        # ZeroDivisionError in the modulo below.
	        frame_interval = max(1, int(cap.get(cv2.CAP_PROP_FPS)) // fps)

	        frame_count = 0
	        while cap.isOpened():
	            ret, frame = cap.read()
	            if not ret:
	                print("Can't receive frame (stream end?). Exiting ...")
	                break

	            if frame_count % frame_interval == 0:
	                is_success, buffer = cv2.imencode(".png", frame)
	                if is_success:
	                    frames_png.append(buffer.tobytes())

	            frame_count += 1
	    finally:
	        # Always hand the capture back, even if reading or encoding raised.
	        cap.release()

	    return frames_png

	def extract_frames(frame):
	    """Decode one encoded frame into an image array in RGB channel order.

	    Args:
	        frame: Encoded image bytes (for example the PNG bytes produced by
	            ``video_to_frames``).

	    Returns:
	        The decoded colour image as a NumPy array with its channels
	        reordered from OpenCV's BGR to RGB.

	    Raises:
	        ValueError: If ``frame`` is empty or cannot be decoded as an image.
	    """
	    encoded = np.frombuffer(frame, dtype=np.uint8)
	    if encoded.size == 0:
	        raise ValueError("Cannot decode an empty frame")

	    # cv2.imdecode produces channels in BGR order and signals undecodable
	    # data by returning None instead of raising.
	    decoded_bgr = cv2.imdecode(encoded, flags=cv2.IMREAD_COLOR)
	    if decoded_bgr is None:
	        raise ValueError("Could not decode frame data as an image")

	    # Swap the first and last channels (BGR -> RGB). This is the same
	    # channel swap the code performed before; only the naming is corrected.
	    return cv2.cvtColor(decoded_bgr, cv2.COLOR_BGR2RGB)

	def predict_answer(image, video, question):
	    """Answer ``question`` about an uploaded image or, failing that, a video.

	    The image takes precedence when both inputs are supplied. For a video,
	    the answer is generated from the first sampled frame.

	    Args:
	        image: A PIL image, or ``None`` when no image was uploaded.
	        video: A video source accepted by ``video_to_frames``, or ``None``.
	        question: The user's question; it is inserted into the chat prompt.

	    Returns:
	        For an image, the decoded answer text. For a video, the answer
	        parsed as a Python literal when it is one, otherwise the raw answer
	        text; a short message when no frame could be read. When neither
	        input is given, a message asking for an image or video.
	    """
	    text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{question}? ASSISTANT:"
	    input_ids = tokenizer(text, return_tensors='pt').input_ids.to(device)

	    if image is not None:
	        # Process as an image.
	        image_tensor = model.image_preprocess(image.convert("RGB"))
	        return _generate_answer(input_ids, image_tensor)

	    if video is not None:
	        # Process as a video.
	        frames = video_to_frames(video)
	        if not frames:
	            # Previously this fell through to answers[0] and raised IndexError.
	            return "Could not read any frames from the uploaded video."

	        # Only the first frame's answer was ever returned, so generating
	        # answers for the remaining frames was wasted work.
	        frame_image = extract_frames(frames[0])
	        image_tensor = model.image_preprocess([frame_image])
	        answer = _generate_answer(input_ids, image_tensor)

	        # The model normally replies with free-form text, which is not a
	        # Python literal; fall back to the raw text instead of crashing.
	        try:
	            return ast.literal_eval(answer)
	        except (ValueError, SyntaxError):
	            return answer

	    return "Unsupported file type. Please upload an image or video."


	def _generate_answer(input_ids, image_tensor):
	    """Generate up to 25 new tokens for the prompt and return them as text."""
	    output_ids = model.generate(
	        input_ids,
	        max_new_tokens=25,
	        images=image_tensor,
	        use_cache=True)[0]
	    # Drop the prompt tokens so that only the newly generated part is decoded.
	    return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()




	def gradio_predict(image, video, question):
	    """Forward the three UI inputs to ``predict_answer`` and return its result."""
	    return predict_answer(image, video, question)

	# Custom CSS: centres the banner container at 60% width and defines a
	# centred "intro" style.
	css = """
	#container{
	    display: block;
	    margin-left: auto;
	    margin-right: auto;
	    width: 60%;
	}
	#intro{
	    max-width: 100%;
	    margin: 0 auto;
	    text-align: center;
	}

	"""

	# Page layout: banner image and introduction, the video/image upload
	# widgets, then the question box and button beside the answer area.
	# NOTE(review): the nesting was reconstructed from a copy of the file that
	# had lost its indentation; in particular, confirm whether the Markdown
	# introduction belongs inside the "container" row or directly under Blocks.
	with gr.Blocks(css=css) as app:
	    with gr.Row(elem_id="container"):
	        gr.Image("gsoc_redhen.png", min_width=60, label="GSOC 2024")
	    gr.Markdown("""
	    ## This Gradio app serves as four folds:
	    ### 1. My ability and experience to design a customizable Gradio application with Interface/Blocks structure.
	    ### 2. One of my Multimodel Vision-Language model's capabilities with the LLaVA framework.
	    ### 3. Demo for annotating random images and 4 second videos provided at Notion (https://shorturl.at/givyC)
	    ### 4. Ability to integrate a Large Language Model and Vision Encoder
	    """)
	    with gr.Row():
	        video = gr.Video(label="Upload your video here")
	        image = gr.Image(type="pil", label="Upload or Drag an Image")
	    with gr.Row():
	        with gr.Column():
	            # `lines` is an integer row count; the value used to be 4.3.
	            question = gr.Textbox(label="Question", placeholder="Annotate prompt", lines=4)
	            btn = gr.Button("Annotate")
	        with gr.Column():
	            answer = gr.TextArea(label="Answer")

	    # Event wiring has to happen while the Blocks context is still open.
	    btn.click(gradio_predict, inputs=[image, video, question], outputs=answer)

	app.launch(debug=True)