from langchain.agents import initialize_agent, AgentType
from langchain.chat_models import AzureChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
import os

# Read Azure OpenAI credentials and the deployment name from environment variables.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_BASE = os.getenv("OPENAI_API_BASE")
DEP_NAME = os.getenv("deployment_name")

llm = AzureChatOpenAI(
    deployment_name=DEP_NAME,
    openai_api_base=OPENAI_API_BASE,
    openai_api_key=OPENAI_API_KEY,
    openai_api_version="2023-03-15-preview",
    model_name="gpt-3.5-turbo",
)

import torch
from transformers import BlipProcessor, BlipForConditionalGeneration

# BLIP image-captioning model from Hugging Face; run on GPU when available.
image_to_text_model = "Salesforce/blip-image-captioning-large"
device = 'cuda' if torch.cuda.is_available() else 'cpu'

processor = BlipProcessor.from_pretrained(image_to_text_model)
model = BlipForConditionalGeneration.from_pretrained(image_to_text_model).to(device)
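# Note: the first from_pretrained() call downloads the BLIP weights and caches them locally.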

from PIL import Image


def describeImage(image_url):
    # image_url is a local file path here (Gradio passes the uploaded file's path).
    image_object = Image.open(image_url).convert('RGB')
    inputs = processor(image_object, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    return processor.decode(outputs[0], skip_special_tokens=True)
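
# Quick sanity check (hypothetical local file, not part of the original script):
# print(describeImage("./example.jpg"))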


from langchain.tools import BaseTool


class DescribeImageTool(BaseTool):
    name: str = "Describe Image Tool"
    description: str = "use this tool to describe an image."

    def _run(self, url: str) -> str:
        # Delegate to the BLIP captioning helper defined above.
        return describeImage(url)

    def _arun(self, query: str):
        raise NotImplementedError("Async operation not supported yet")


tools = [DescribeImageTool()]

# Conversational agent with a 5-turn sliding-window memory; it decides when to call the tool.
agent = initialize_agent(
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=ConversationBufferWindowMemory(
        memory_key='chat_history',
        k=5,
        return_messages=True
    )
)
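
# The agent can be called directly with a prompt that includes the image path, e.g.
# (hypothetical path, not part of the original script):
# result = agent("What is in this picture?:\n./example.jpg")
# print(result['output'])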

from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate


def enToChinese(english):
    # Translate an English sentence to Chinese with a simple LLM chain.
    prompt = PromptTemplate(
        input_variables=["english"],
        template="Please translate the following sentence from English to Chinese: {english}"
    )
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(english)


def chToEnglish(chinese):
    # Translate a Chinese sentence to English with a simple LLM chain.
    prompt = PromptTemplate(
        input_variables=["chinese"],
        template="Please translate the following sentence from Chinese to English: {chinese}"
    )
    llchain = LLMChain(llm=llm, prompt=prompt)
    return llchain.run(chinese)
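
# Examples (hypothetical):
# enToChinese("A dog is running on the beach.")
# chToEnglish("今天天气很好。")  # "The weather is nice today."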



import gradio as gr


def image_to_txt(image_url, user_input):
    # Translate the Chinese question to English, ask the agent about the image,
    # then translate the agent's answer back to Chinese.
    user_input = chToEnglish(user_input)
    return enToChinese(agent(f"{user_input}:\n{image_url}")['output'])
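
# Example (hypothetical inputs): image_to_txt("./example.jpg", "图片里有什么？")  # "What is in the picture?"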


with gr.Blocks() as demo:
    image_url = gr.Image(type="filepath", label="请选择一张图片")  # "Please select an image"
    user_input = gr.Textbox(placeholder="请输入问题..", lines=1, label="问题")  # "Enter a question..." / "Question"
    submit_btn = gr.Button('确认', variant="primary")  # "Confirm"
    output = gr.Textbox(placeholder="", lines=1, label="回答")  # "Answer"
    submit_btn.click(image_to_txt, inputs=[image_url, user_input], outputs=output)

demo.launch()