# from langchain.agents import load_tools
import requests
from PIL import Image
from langchain.agents import initialize_agent
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
# from langchain.agents import AgentType
# from langchain.llms import OpenAI
# from langchain.chat_models import ChatOpenAI
from langchain.chat_models import AzureChatOpenAI
from langchain.tools import BaseTool
# from transformers.models.oneformer.modeling_oneformer import OneFormerModelOutput
import os
from transformers import BlipProcessor, BlipForConditionalGeneration
from langchain import PromptTemplate, FewShotPromptTemplate
from langchain.chains import LLMChain

# Azure OpenAI credentials are read from the environment.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']
OPENAI_API_BASE = os.environ['OPENAI_API_BASE']
DEPLOYMENT_NAME = os.environ['DEPLOYMENT_NAME']

llm = AzureChatOpenAI(
    deployment_name=DEPLOYMENT_NAME,
    openai_api_base=OPENAI_API_BASE,
    openai_api_key=OPENAI_API_KEY,
    openai_api_version="2023-03-15-preview",
    model_name="gpt-3.5-turbo",
)
# OPENAI_API_KEY = os.environ['OPENAI_API_KEY'] or 'Your OPENAI API Key'
# OPENAI_API_KEY = "123"
# llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY, temperature=0, model_name='gpt-3.5-turbo')

# BLIP image-captioning model used for image-to-text.
image_to_text_model = "Salesforce/blip-image-captioning-large"
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

processor = BlipProcessor.from_pretrained(image_to_text_model)
model = BlipForConditionalGeneration.from_pretrained(image_to_text_model).to(device)


def describeImage3(url):
    """Caption an image fetched from a remote URL."""
    image_object = Image.open(requests.get(url, stream=True).raw).convert('RGB')
    inputs = processor(image_object, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)


def describeImage(image_url):
    """Caption an image opened from a local file path."""
    image_obj = Image.open(image_url).convert('RGB')
    inputs = processor(image_obj, return_tensors='pt').to(device)
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)


def describeImage2(image_object):
    """Caption an already-loaded PIL image object."""
    # image_object = Image.open(requests.get(url, stream=True).raw).convert('RGB')
    inputs = processor(image_object, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)


def toChinese(en: str):
    """Translate an English sentence into Chinese with the LLM."""
    pp = "将下面的语句翻译成中文\n{en}"  # prompt: "Translate the following sentence into Chinese"
    prompt = PromptTemplate(
        input_variables=["en"],
        template=pp,
    )
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(en)


# description = describeImage('https://images.unsplash.com/photo-1673207520321-c27d09eb0955?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=1035&q=80')
# description = describeImage('https://alifei03.cfp.cn/creative/vcg/800/new/VCG21gic13601846.jpg')
# description


class DescribeImageTool(BaseTool):
    name: str = "Describe Image Tool"
    description: str = 'use this tool to describe an image.'
    def _run(self, url: str):
        # The agent may pass either a remote URL or a local file path;
        # remote images must be fetched over HTTP before captioning.
        if url.startswith('http://') or url.startswith('https://'):
            return describeImage3(url)
        return describeImage(url)

    def _arun(self, query: str):
        raise NotImplementedError("Async operation not supported yet")


tools = [DescribeImageTool()]

agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=ConversationBufferWindowMemory(
        memory_key='chat_history',
        k=5,
        return_messages=True,
    ),
)

# image_url = 'https://images.unsplash.com/photo-1673207520321-c27d09eb0955?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=1035&q=80'
# image_url = 'https://alifei03.cfp.cn/creative/vcg/800/new/VCG21gic13601846.jpg'
# agent(f"Describe the following image:\n{image_url}")
# agent(f"What is the brand of car in the following image:\n{image_url}")

# image_url = 'https://alifei03.cfp.cn/creative/vcg/800/new/VCG21gic13601846.jpg'
# agent(f"Please describe the following image:\n{image_url}")
# agent.memory.buffer
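# A minimal end-to-end sketch of how the pieces above can be combined:
# caption an image with BLIP, translate the caption to Chinese via toChinese,
# then ask the agent about the same image so DescribeImageTool is invoked.
# The path './example.jpg' is a hypothetical placeholder, not a file that ships
# with this code; kept commented out like the examples above.
# if __name__ == '__main__':
#     caption = describeImage('./example.jpg')
#     print(caption)
#     print(toChinese(caption))
#     result = agent("Describe the following image:\n./example.jpg")
#     print(result['output'])
#     print(agent.memory.buffer)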