# app.py — Gradio demo: capture a webcam frame, caption it with Kosmos-2
# (via Replicate), then chat about the image using a PaLM-backed llama_index
# chat engine.
import gradio as gr
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from io import BytesIO
import replicate
from llama_index.llms.palm import PaLM
from llama_index import ServiceContext, VectorStoreIndex, Document
from llama_index.memory import ChatMemoryBuffer
import os
import base64
import tempfile
# Function to get image caption via Kosmos2 (as in your original code)
import numpy as np
from PIL import Image
# Function to get image caption via Kosmos2
def get_image_caption(image_array):
    """Generate a brief text caption for a webcam frame using Kosmos-2.

    Args:
        image_array: RGB image as a numpy array (H x W x 3, values 0-255),
            as delivered by the Gradio webcam component.

    Returns:
        str: the first paragraph of the model's output (the description;
        Kosmos-2 appends grounding data after a blank line).

    Raises:
        Whatever ``replicate.run`` raises on network/API failure.
    """
    # Convert the numpy array to a PIL Image.
    image = Image.fromarray(image_array.astype('uint8'), 'RGB')

    # Persist the frame to a temporary JPEG because the Replicate client
    # expects a file-like object. delete=False so the file can be reopened
    # by name (required on Windows); we remove it ourselves below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpeg") as tmp_file:
        image.save(tmp_file, format="JPEG")
        tmp_file_path = tmp_file.name

    try:
        # Open the image for upload and ensure the handle is closed
        # (the original leaked both the file handle and the temp file).
        with open(tmp_file_path, "rb") as image_file:
            output = replicate.run(
                "lucataco/kosmos-2:3e7b211c29c092f4bcc8853922cc986baa52efe255876b80cac2c2fbb4aff805",
                input={
                    "image": image_file,
                    "description_type": "Brief",
                },
            )
    finally:
        # Clean up the temp file even if the API call fails.
        os.remove(tmp_file_path)

    # Keep only the description, dropping the grounding section.
    text_description = output.split('\n\n')[0]
    return text_description
# Function to create the chat engine (as in your original code)
def create_chat_engine(img_desc, api_key):
    """Build a context-mode chat engine grounded in an image description.

    Args:
        img_desc: Text caption of the uploaded image; indexed as a single
            document and also injected into the system prompt.
        api_key: Google PaLM API key.

    Returns:
        A llama_index chat engine configured with a 1500-token memory buffer.
    """
    llm = PaLM(api_key=api_key)
    service_context = ServiceContext.from_defaults(llm=llm, embed_model="local")
    doc = Document(text=img_desc)
    index = VectorStoreIndex.from_documents([doc], service_context=service_context)
    chatmemory = ChatMemoryBuffer.from_defaults(token_limit=1500)
    # BUG FIX: the f-prefix was on the first string segment only, so the
    # "{img_desc}" placeholder in the last (plain) segment was never
    # interpolated and the model saw the literal text "{img_desc}".
    chat_engine = index.as_chat_engine(
        chat_mode="context",
        system_prompt=(
            "You are a chatbot, able to have normal interactions, as well as talk. "
            "You always answer in great detail and are polite. Your responses always descriptive. "
            f"Your job is to talk about an image the user has uploaded. Image description: {img_desc}."
        ),
        verbose=True,
        memory=chatmemory,
    )
    return chat_engine
# Function to handle a single Gradio interaction: caption then chat
def process_image_and_chat(image_array, user_input):
    """Caption the captured frame and answer the user's question about it.

    Args:
        image_array: numpy RGB image from the webcam, or None if nothing
            was captured.
        user_input: the user's question (may be empty).

    Returns:
        str: the model's answer, or a human-readable status/error message.
    """
    if image_array is None:
        return "Please capture an image."
    # Guard on the question first: the original code ran the expensive
    # captioning API call and built a chat engine even when there was no
    # question to answer.
    if not user_input:
        return "Ask me anything about the uploaded image."
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        # Readable message instead of an unhandled KeyError crashing the UI.
        return "Server misconfiguration: GOOGLE_API_KEY is not set."
    img_desc = get_image_caption(image_array)
    chat_engine = create_chat_engine(img_desc, api_key)
    try:
        response = chat_engine.chat(user_input)
        # chat() returns a response object; coerce to str for the Textbox.
        return str(response)
    except Exception as e:
        return f'An error occurred: {str(e)}'
# Define Gradio interface
# --- Gradio UI wiring ----------------------------------------------------
# Webcam capture in, free-text question in, model answer out.
webcam_image = gr.Image(sources=["webcam"], type="numpy")
question_box = gr.Textbox(label="Ask me about the image:")
answer_box = gr.Textbox(label="Response")

iface = gr.Interface(
    fn=process_image_and_chat,
    inputs=[webcam_image, question_box],
    outputs=answer_box,
    title="My version of ChatGPT vision",
    description="You can capture an image using your webcam and start chatting with the LLM about the image",
    allow_flagging="never",
)

# Launch the app (blocks until the server is stopped).
iface.launch()