import os
import base64

import markdown
import gradio as gr
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

XAI_API_KEY = os.getenv("XAI_API_KEY")

client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
)

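# build_messages_from_history converts the stored ((user_text, image_url), assistant_text)
# pairs into the message list format expected by the chat completions API.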
def build_messages_from_history(history):
    messages = [
        {
            "role": "system",
            "content": "You are Grok Vision, an assistant designed to understand and describe images and also answer text-based queries. "
                       "You should use all previous messages in the conversation as context. Provide clear, positive, and useful responses."
        }
    ]

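    # Replay each past turn: the image part (if any) first, then the text part.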
    for ((user_text, user_image_url), assistant_text) in history:
        user_content = []
        if user_image_url:
            image_content = {
                "type": "image_url",
                "image_url": {
                    "url": user_image_url,
                    "detail": "high",
                },
            }
            user_content.append(image_content)

        if user_text.strip():
            user_content.append({
                "type": "text",
                "text": user_text.strip(),
            })

        messages.append({
            "role": "user",
            "content": user_content
        })

        messages.append({
            "role": "assistant",
            "content": assistant_text
        })

    return messages


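# create_response builds the full message list (history plus the new turn),
# calls the model, and returns the updated history along with the reply
# converted to HTML via the markdown library.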
def create_response(history, user_text, user_image_path):
    user_text = user_text.strip()
    user_image_url = ""

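    # If the message starts with a URL, treat the first token as the image URL
    # and the remainder (if any) as the question text.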
    if user_text.startswith("http"):
        parts = user_text.split(" ", 1)
        user_image_url = parts[0]
        if len(parts) > 1:
            user_text = parts[1]
        else:
            user_text = ""

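    # A locally uploaded file is sent inline as a base64 data URL. Note that the
    # MIME type is hard-coded to image/jpeg, so other formats are labelled as
    # JPEG even though the raw bytes are sent unchanged.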
    if user_image_path is not None:
        with open(user_image_path, "rb") as f:
            image_bytes = f.read()
        base64_image = base64.b64encode(image_bytes).decode("utf-8")
        user_image_url = f"data:image/jpeg;base64,{base64_image}"

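    # Rebuild the API message list from the prior turns, then append the new one.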
    messages = build_messages_from_history(history)

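    # Assemble the content parts for the new user turn.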
    new_user_content = []
    if user_image_url:
        new_user_content.append({
            "type": "image_url",
            "image_url": {
                "url": user_image_url,
                "detail": "high",
            },
        })
    if user_text.strip():
        new_user_content.append({
            "type": "text",
            "text": user_text.strip(),
        })

    if not new_user_content:
        return history, "Please provide text or an image."

    messages.append({"role": "user", "content": new_user_content})

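    # Non-streaming call to the grok-2-vision-1212 model; the low temperature
    # keeps the responses close to deterministic.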
    completion = client.chat.completions.create(
        model="grok-2-vision-1212",
        messages=messages,
        stream=False,
        temperature=0.01,
    )
    assistant_response = completion.choices[0].message.content

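    # Convert the Markdown reply to HTML. The chat() caller below displays the
    # raw text from history and does not use this converted value directly.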
    md = markdown.Markdown(extensions=["fenced_code"])
    converted = md.convert(assistant_response)

    history.append(((user_text, user_image_url), assistant_response))

    return history, converted


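# chat is the Gradio callback: it updates the history and rebuilds the list of
# (user, assistant) pairs shown in the Chatbot component, replacing raw image
# data with a short placeholder label.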
def chat(user_message, image, history):
    history, assistant_output = create_response(history, user_message, image)

    display_chat = []
    for ((u_txt, u_img_url), a_txt) in history:
        user_display = u_txt
        if u_img_url and u_img_url.startswith("data:image"):
            user_display += "\n\n[User uploaded an image]"
        elif u_img_url and u_img_url.startswith("http"):
            user_display += f"\n\n[User provided image URL: {u_img_url}]"

        display_chat.append((user_display.strip(), a_txt.strip()))

    return display_chat, history


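# Gradio UI: a Chatbot for the conversation, an optional image upload, a text
# box for the message, and a State holding the full history between turns.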
with gr.Blocks() as demo:
    gr.Markdown(
        "# Grok 2 Vision Chatbot\n"
        "Welcome! You can do the following with Grok:\n"
        "- Upload an image and ask a question about it.\n"
        "- Provide an image URL in your message (e.g. `http://example.com/image.jpg What is in this image?`).\n"
        "- Or just ask a text question without any image.\n\n"
        "It also remembers previous messages."
    )

    chatbot = gr.Chatbot(label="Conversation")
    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload an image (optional)", interactive=True)
        user_message_input = gr.Textbox(
            label="Your message:",
            placeholder="Type your text or paste an image URL (e.g. http://... ). You can also combine them."
        )
    submit_button = gr.Button("Send")

    state = gr.State([])

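    # Wire the Send button: chat() consumes the message, optional image, and
    # history state, and returns the refreshed chat display plus updated state.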
    submit_button.click(
        chat,
        inputs=[user_message_input, image_input, state],
        outputs=[chatbot, state]
    )

if __name__ == "__main__":
    demo.launch()