File size: 2,143 Bytes
0ff5891
 
 
 
 
 
 
b5b47eb
0ff5891
 
 
3b84856
 
 
0ff5891
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3b84856
0ff5891
 
 
3b84856
0ff5891
 
 
3b84856
 
 
0ff5891
 
b5b47eb
 
 
0ff5891
 
 
 
b5b47eb
0ff5891
 
 
 
3b84856
b5b47eb
0ff5891
 
 
 
 
241adb3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gradio as gr
import base64
import json
import os
from PIL import Image
import io
from langchain_openai import ChatOpenAI
from langchain_community.callbacks import get_openai_callback
import replicate

# --- Module-level configuration ---------------------------------------------

# Fail fast with a clear message when a required API key is missing.
# (Re-assigning os.environ["X"] = os.environ.get("X") was a no-op when the
# key existed and raised a confusing TypeError when it did not, because
# os.environ values must be strings, never None.)
for _key in ("OPENAI_API_KEY", "REPLICATE_API_TOKEN"):
    if not os.environ.get(_key):
        raise EnvironmentError(f"Required environment variable {_key} is not set.")

# Initialize the LLM model (low temperature for more deterministic output).
llm = ChatOpenAI(model='gpt-3.5-turbo-0125', temperature=0.2)

# Prompt template; expected to contain an {image_description} placeholder
# that generate_prompt_gpt_3_turbo fills in later.
with open("Resource/instructions copy.txt", "r", encoding="utf-8") as f:
    instructions = f.read()

def image_to_base64(image):
    """Encode a PIL image as a base64 JPEG string (no data-URI prefix).

    Args:
        image: PIL.Image.Image to encode.

    Returns:
        str: base64-encoded JPEG bytes, ASCII-decoded.
    """
    # JPEG has no alpha channel: Gradio uploads of PNGs commonly arrive in
    # RGBA or palette ("P") mode, and image.save(..., format="JPEG") raises
    # for those — normalize to RGB first. "L" (grayscale) saves fine as-is.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode('utf-8')

def captions_image(image):
    """Caption *image* using the Replicate-hosted BLIP model.

    Args:
        image: PIL image to caption.

    Returns:
        The caption produced by the model, as returned by ``replicate.run``.
    """
    # BLIP expects a data-URI, so prefix the base64 JPEG payload accordingly.
    data_uri = "data:image/jpeg;base64," + image_to_base64(image)

    model_ref = "salesforce/blip:2e1dddc8621f72155f24cf2e0adbde548458d3cab9f00c0139eea840d0ac4746"
    return replicate.run(
        model_ref,
        input={"task": "image_captioning", "image": data_uri},
    )

def generate_prompt_gpt_3_turbo(image, instructions):
    """Caption *image*, feed the caption to the LLM via *instructions*, and
    return the model's JSON reply plus usage statistics.

    Args:
        image: PIL image to describe.
        instructions: prompt template containing an ``{image_description}``
            placeholder.

    Returns:
        tuple: ``(response_dict, image_description, cb)`` where ``cb`` is the
        OpenAI callback holding token-usage/cost information.

    Raises:
        ValueError: if the model reply is not valid JSON.
    """
    # Get image captions from the BLIP model.
    image_description = captions_image(image)

    # Format the prompt with the image description.
    prompt = instructions.format(image_description=image_description)
    with get_openai_callback() as cb:
        # Invoke LLM model and capture the text content of its reply.
        response = llm.invoke(prompt).content

    # Chat models frequently wrap JSON in ```json ... ``` fences; strip them
    # so a fenced-but-valid reply does not crash the app with a parse error.
    cleaned = response.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned.strip("`").strip()
        if cleaned.lower().startswith("json"):
            cleaned = cleaned[4:]
    try:
        response_dict = json.loads(cleaned)
    except json.JSONDecodeError as err:
        # Surface the raw reply so the failure is diagnosable in the UI/logs.
        raise ValueError(f"Model did not return valid JSON: {response!r}") from err

    return response_dict, image_description, cb

# Define the Gradio interface
# Inputs: an uploaded image (delivered to the handler as a PIL object) and an
# editable prompt template pre-filled with the instructions file contents.
# Outputs map 1:1 to generate_prompt_gpt_3_turbo's return tuple: the parsed
# JSON response, the BLIP image caption, and the OpenAI usage callback.
iface = gr.Interface(
    fn=generate_prompt_gpt_3_turbo,
    inputs=[gr.Image(type="pil"),gr.Textbox(value = instructions)],
    outputs=[gr.JSON(),gr.Textbox(),gr.Textbox()],
    title="Image to Text Generator",
    description="Upload an image to generate descriptive text based on the image."
)

# Run the interface
# share=True also publishes a temporary public Gradio link besides localhost.
iface.launch(share=True)