"""Gradio chat UI for the Grok 2 Vision model (xAI's OpenAI-compatible API).

The conversation is kept as a list of ``((user_text, user_image_url),
assistant_text)`` tuples. ``user_image_url`` is either an ``http(s)`` URL, a
``data:`` URL holding a base64-encoded upload, or ``""`` when the turn had no
image.
"""

import base64
import mimetypes
import os

import gradio as gr
import markdown
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()

XAI_API_KEY = os.getenv("XAI_API_KEY")

client = OpenAI(
    api_key=XAI_API_KEY,
    base_url="https://api.x.ai/v1",
)

# Single source of truth for the system prompt sent with every request.
SYSTEM_PROMPT = (
    "You are Grok Vision, an assistant designed to understand and describe images and also answer text-based queries. "
    "You should use all previous messages in the conversation as context. Provide clear, positive, and useful responses."
)

# Returned by create_response() when the user submitted neither text nor image.
EMPTY_INPUT_NOTICE = "Please provide text or an image."


def _build_user_content(text, image_url):
    """Return the content parts (image first, then text) for one user turn."""
    content = []
    if image_url:
        content.append(
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url,
                    "detail": "high",
                },
            }
        )
    stripped = (text or "").strip()
    if stripped:
        content.append(
            {
                "type": "text",
                "text": stripped,
            }
        )
    return content


def _split_leading_url(text):
    """Split ``text`` into ``(url, remainder)`` when it starts with an http(s) URL.

    Returns ``("", text)`` when the text does not begin with ``http://`` or
    ``https://``, so ordinary sentences that merely start with "http" are not
    mistaken for an image URL.
    """
    if not text.startswith(("http://", "https://")):
        return "", text
    # Split on any whitespace so a URL followed by a newline or tab still works.
    parts = text.split(None, 1)
    remainder = parts[1] if len(parts) > 1 else ""
    return parts[0], remainder


def _encode_image_as_data_url(path):
    """Read the image at ``path`` and return it as a base64 ``data:`` URL.

    The MIME type is guessed from the file name; JPEG is used as a fallback
    when the type is unknown or is not an image type.
    """
    mime_type, _ = mimetypes.guess_type(path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"
    with open(path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"


def build_messages_from_history(history):
    """Convert the stored conversation into a chat-completions message list.

    Args:
        history: List of ``((user_text, user_image_url), assistant_text)``
            tuples.

    Returns:
        A list of message dicts: the system prompt followed by alternating
        ``user`` and ``assistant`` messages, one pair per history entry.
    """
    messages = [
        {
            "role": "system",
            "content": SYSTEM_PROMPT,
        }
    ]
    for (user_text, user_image_url), assistant_text in history:
        messages.append(
            {
                "role": "user",
                "content": _build_user_content(user_text, user_image_url),
            }
        )
        messages.append(
            {
                "role": "assistant",
                "content": assistant_text,
            }
        )
    return messages


def create_response(history, user_text, user_image_path):
    """Send the new user turn to the model and record the exchange.

    Args:
        history: Conversation so far, as a list of
            ``((user_text, user_image_url), assistant_text)`` tuples. On
            success the new turn is appended to this list in place.
        user_text: Raw textbox content. If it starts with an ``http://`` or
            ``https://`` URL, that first token is used as the image URL and
            the rest as the question.
        user_image_path: Path of an uploaded image file, or ``None``. An
            upload takes precedence over a URL typed in the message.

    Returns:
        ``(history, html)`` where ``html`` is the assistant reply rendered
        from Markdown to HTML. When neither text nor an image was supplied,
        ``history`` is returned unchanged together with a plain-text notice.
    """
    user_text = (user_text or "").strip()
    user_image_url, user_text = _split_leading_url(user_text)

    if user_image_path is not None:
        user_image_url = _encode_image_as_data_url(user_image_path)

    new_user_content = _build_user_content(user_text, user_image_url)
    if not new_user_content:
        return history, EMPTY_INPUT_NOTICE

    messages = build_messages_from_history(history)
    messages.append({"role": "user", "content": new_user_content})

    completion = client.chat.completions.create(
        model="grok-2-vision-1212",
        messages=messages,
        stream=False,
        temperature=0.01,
    )
    # The SDK types message content as optional; normalise to a string so the
    # Markdown conversion and later .strip() calls cannot fail on None.
    assistant_response = completion.choices[0].message.content or ""

    md = markdown.Markdown(extensions=["fenced_code"])
    converted = md.convert(assistant_response)

    history.append(((user_text, user_image_url), assistant_response))
    return history, converted


def chat(user_message, image, history):
    """Gradio click handler: run one turn and rebuild the chatbot display.

    Args:
        user_message: Text typed by the user.
        image: File path of the uploaded image, or ``None``.
        history: Conversation state (see ``create_response``).

    Returns:
        ``(display_chat, history)`` where ``display_chat`` is a list of
        ``(user_display, assistant_text)`` pairs for ``gr.Chatbot``.
    """
    turns_before = len(history)
    history, assistant_output = create_response(history, user_message, image)

    display_chat = []
    for (u_txt, u_img_url), a_txt in history:
        user_display = u_txt
        if u_img_url and u_img_url.startswith("data:image"):
            # Never echo the base64 payload back into the chat window.
            user_display += "\n\n[User uploaded an image]"
        elif u_img_url and u_img_url.startswith("http"):
            user_display += f"\n\n[User provided image URL: {u_img_url}]"
        display_chat.append((user_display.strip(), a_txt.strip()))

    if len(history) == turns_before:
        # No turn was recorded, so create_response rejected the input. Show
        # its notice as a transient bot message instead of dropping it; it is
        # deliberately not stored in the history sent to the model.
        display_chat.append((None, assistant_output))

    return display_chat, history


with gr.Blocks() as demo:
    gr.Markdown(
        "# Grok 2 Vision Chatbot\n"
        "Welcome! "
        "You can do following things with Grok:\n"
        "- Upload an image and ask a question about it.\n"
        "- Provide an image URL in your message "
        "(e.g. `http://example.com/image.jpg What is in this image?`).\n"
        "- Or just ask a text question without any image.\n\n"
        "Also it remembers previous messages too."
    )

    chatbot = gr.Chatbot(label="Conversation")

    with gr.Row():
        image_input = gr.Image(
            type="filepath",
            label="Upload an image (optional)",
            interactive=True,
        )
        user_message_input = gr.Textbox(
            label="Your message:",
            placeholder="Type your text or paste an image URL (e.g. http://... ). You can also combine them.",
        )
    submit_button = gr.Button("Send")

    state = gr.State([])

    submit_button.click(
        chat,
        inputs=[user_message_input, image_input, state],
        outputs=[chatbot, state],
    )

if __name__ == "__main__":
    demo.launch()