# app.py — Gradio demo: capture a webcam frame, caption it with Kosmos-2
# (via Replicate), then chat about the image using a PaLM-backed llama_index
# chat engine.
import gradio as gr
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from io import BytesIO
import replicate
from llama_index.llms.palm import PaLM
from llama_index import ServiceContext, VectorStoreIndex, Document
from llama_index.memory import ChatMemoryBuffer
import os
import base64
import tempfile
# Function to get image caption via Kosmos2 (as in your original code)
import numpy as np
from PIL import Image
# Function to get image caption via Kosmos2
def get_image_caption(image_array):
    """Generate a brief text caption for a webcam frame using Kosmos-2.

    Args:
        image_array: RGB image as a numpy array (H x W x 3, values 0-255),
            as delivered by the Gradio webcam component.

    Returns:
        str: the first paragraph of the model's output (the description;
        Kosmos-2 appends grounding data after a blank line).

    Raises:
        Whatever ``replicate.run`` raises on network/API failure.
    """
    # Convert the numpy array to a PIL Image.
    image = Image.fromarray(image_array.astype('uint8'), 'RGB')

    # Persist the frame to a temporary JPEG because the Replicate client
    # expects a file-like object. delete=False so the file can be reopened
    # by name (required on Windows); we remove it ourselves below.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".jpeg") as tmp_file:
        image.save(tmp_file, format="JPEG")
        tmp_file_path = tmp_file.name

    try:
        # Open the image for upload and ensure the handle is closed
        # (the original leaked both the file handle and the temp file).
        with open(tmp_file_path, "rb") as image_file:
            output = replicate.run(
                "lucataco/kosmos-2:3e7b211c29c092f4bcc8853922cc986baa52efe255876b80cac2c2fbb4aff805",
                input={
                    "image": image_file,
                    "description_type": "Brief",
                },
            )
    finally:
        # Clean up the temp file even if the API call fails.
        os.remove(tmp_file_path)

    # Keep only the description, dropping the grounding section.
    text_description = output.split('\n\n')[0]
    return text_description
# Function to create the chat engine (as in your original code)
def create_chat_engine(img_desc, api_key):
    """Build a context-mode chat engine grounded in an image description.

    Args:
        img_desc: Text caption of the uploaded image; indexed as a single
            document and also injected into the system prompt.
        api_key: Google PaLM API key.

    Returns:
        A llama_index chat engine configured with a 1500-token memory buffer.
    """
    llm = PaLM(api_key=api_key)
    service_context = ServiceContext.from_defaults(llm=llm, embed_model="local")
    doc = Document(text=img_desc)
    index = VectorStoreIndex.from_documents([doc], service_context=service_context)
    chatmemory = ChatMemoryBuffer.from_defaults(token_limit=1500)
    # BUG FIX: the f-prefix was on the first string segment only, so the
    # "{img_desc}" placeholder in the last (plain) segment was never
    # interpolated and the model saw the literal text "{img_desc}".
    chat_engine = index.as_chat_engine(
        chat_mode="context",
        system_prompt=(
            "You are a chatbot, able to have normal interactions, as well as talk. "
            "You always answer in great detail and are polite. Your responses always descriptive. "
            f"Your job is to talk about an image the user has uploaded. Image description: {img_desc}."
        ),
        verbose=True,
        memory=chatmemory,
    )
    return chat_engine
# Function to handle a single Gradio interaction: caption then chat
def process_image_and_chat(image_array, user_input):
    """Caption the captured frame and answer the user's question about it.

    Args:
        image_array: numpy RGB image from the webcam, or None if nothing
            was captured.
        user_input: the user's question (may be empty).

    Returns:
        str: the model's answer, or a human-readable status/error message.
    """
    if image_array is None:
        return "Please capture an image."
    # Guard on the question first: the original code ran the expensive
    # captioning API call and built a chat engine even when there was no
    # question to answer.
    if not user_input:
        return "Ask me anything about the uploaded image."
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        # Readable message instead of an unhandled KeyError crashing the UI.
        return "Server misconfiguration: GOOGLE_API_KEY is not set."
    img_desc = get_image_caption(image_array)
    chat_engine = create_chat_engine(img_desc, api_key)
    try:
        response = chat_engine.chat(user_input)
        # chat() returns a response object; coerce to str for the Textbox.
        return str(response)
    except Exception as e:
        return f'An error occurred: {str(e)}'
# Define Gradio interface
# --- Gradio UI wiring ----------------------------------------------------
# Webcam capture in, free-text question in, model answer out.
webcam_image = gr.Image(sources=["webcam"], type="numpy")
question_box = gr.Textbox(label="Ask me about the image:")
answer_box = gr.Textbox(label="Response")

iface = gr.Interface(
    fn=process_image_and_chat,
    inputs=[webcam_image, question_box],
    outputs=answer_box,
    title="My version of ChatGPT vision",
    description="You can capture an image using your webcam and start chatting with the LLM about the image",
    allow_flagging="never",
)

# Launch the app (blocks until the server is stopped).
iface.launch()