import gradio as gr import spaces import torch from transformers import AutoTokenizer, AutoModelForCausalLM from PIL import Image # Load model and tokenizer model_name = "mistral-community/pixtral-12b-240910" tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype=torch.float16, device_map="auto" ) @spaces.GPU(duration=120) def generate_response(image, prompt, max_length, temperature): messages = [ {"role": "system", "content": "You are a helpful assistant that can analyze images and text."}, {"role": "user", "content": prompt} ] formatted_prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) # Preprocess the image if image is not None: image = Image.open(image).convert("RGB") inputs = tokenizer(formatted_prompt, images=[image], return_tensors="pt", padding=True).to(model.device) else: inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True).to(model.device) # Generate with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=max_length, do_sample=True, temperature=temperature, top_k=100, top_p=0.95, ) # Decode and return the response response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True) return response # Custom CSS css = """ body { background-color: #1a1a2e; color: #e0e0e0; font-family: 'Arial', sans-serif; } .container { max-width: 900px; margin: auto; padding: 20px; } .gradio-container { background-color: #16213e; border-radius: 15px; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); } .header { background-color: #0f3460; padding: 20px; border-radius: 15px 15px 0 0; text-align: center; margin-bottom: 20px; } .header h1 { color: #e94560; font-size: 2.5em; margin-bottom: 10px; } .header p { color: #a0a0a0; } .input-group, .output-group { background-color: #1a1a2e; padding: 20px; border-radius: 10px; margin-bottom: 20px; } .input-group label, .output-group label { color: #e94560; font-weight: bold; } .generate-btn { background-color: #e94560 !important; color: white !important; border: none !important; border-radius: 5px !important; padding: 10px 20px !important; font-size: 16px !important; cursor: pointer !important; transition: background-color 0.3s ease !important; } .generate-btn:hover { background-color: #c81e45 !important; } .example-prompts { background-color: #1f2b47; padding: 15px; border-radius: 10px; margin-bottom: 20px; } .example-prompts h3 { color: #e94560; margin-bottom: 10px; } .example-prompts ul { list-style-type: none; padding-left: 0; } .example-prompts li { margin-bottom: 5px; cursor: pointer; transition: color 0.3s ease; } .example-prompts li:hover { color: #e94560; } """ # Example prompts example_prompts = [ "Describe this image in detail.", "What emotions does this image evoke?", "Imagine a story based on this image.", "What technical aspects of photography are demonstrated in this image?", "How might this image be used in advertising?" ] # Gradio interface with gr.Blocks(css=css) as iface: gr.HTML( """
Generate text responses based on images and prompts using the powerful Pixtral-12B model.