from langchain.agents import initialize_agent
from langchain.agents import AgentType
from langchain.chat_models import AzureChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
import os

# Azure OpenAI credentials are read from environment variables
# (OPENAI_API_KEY, OPENAI_API_BASE, deployment_name).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
DEP_NAME = os.getenv("deployment_name")

# Chat model that will drive the conversational agent.
llm = AzureChatOpenAI(
    deployment_name=DEP_NAME,
    openai_api_base=OPENAI_API_BASE,
    openai_api_key=OPENAI_API_KEY,
    openai_api_version="2023-03-15-preview",
    model_name="gpt-3.5-turbo",
)

import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# BLIP image-captioning model; run on GPU if one is available.
image_to_text_model = "Salesforce/blip-image-captioning-large"
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained(image_to_text_model)
model = BlipForConditionalGeneration.from_pretrained(image_to_text_model).to(device)

from PIL import Image


def describeImage(image_url):
    # image_url is a local file path (the Gradio image component uses type="filepath").
    image_object = Image.open(image_url).convert("RGB")
    inputs = processor(image_object, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)


from langchain.tools import BaseTool


class DescribeImageTool(BaseTool):
    name: str = "Describe Image Tool"
    description: str = "Use this tool to describe an image."

    def _run(self, url: str):
        return describeImage(url)

    def _arun(self, url: str):
        raise NotImplementedError("Async operation not supported yet")


tools = [DescribeImageTool()]

# Conversational ReAct agent with a sliding-window memory of the last 5 turns.
agent = initialize_agent(
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method="generate",
    memory=ConversationBufferWindowMemory(
        memory_key="chat_history",
        k=5,
        return_messages=True,
    ),
)

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate


def enToChinese(english):
    # Translate an English sentence to Chinese with a simple LLM chain.
    pp = "Please translate the following sentence from English to Chinese: {english}"
    prompt = PromptTemplate(input_variables=["english"], template=pp)
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(english)


def chToEnglish(chinese):
    # Translate a Chinese sentence to English with a simple LLM chain.
    pp = "Please translate the following sentence from Chinese to English: {chinese}"
    prompt = PromptTemplate(input_variables=["chinese"], template=pp)
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(chinese)


import gradio as gr


def image_to_txt(image_url, user_input):
    # Translate the user's Chinese question to English, ask the agent about
    # the image, then translate the agent's answer back to Chinese.
    user_input = chToEnglish(user_input)
    return enToChinese(agent(f"{user_input}:\n{image_url}")["output"])


with gr.Blocks() as demo:
    image_url = gr.Image(type="filepath", label="请选择一张图片")              # "Please select an image"
    user_input = gr.Textbox(placeholder="请输入问题..", lines=1, label="问题")  # "Please enter a question" / "Question"
    submit_btn = gr.Button("确认", variant="primary")                          # "Confirm"
    output = gr.Textbox(placeholder="", lines=1, label="回答")                  # "Answer"
    submit_btn.click(image_to_txt, inputs=[image_url, user_input], outputs=output)

demo.launch()
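# A rough usage sketch (assumption: the three environment variables below are
# set to your own Azure OpenAI resource before running the script; the values
# shown are placeholders, and "sample.jpg" is a hypothetical local image path):
#
#   export OPENAI_API_KEY="<azure-openai-key>"
#   export OPENAI_API_BASE="https://<your-resource>.openai.azure.com/"
#   export deployment_name="<your-gpt-35-turbo-deployment>"
#
# The BLIP captioning step can also be sanity-checked on its own, without the
# agent or the Gradio UI:
#
#   print(describeImage("sample.jpg"))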