"""Gradio demo: ask a free-form question about an uploaded image using SmolVLM."""

import gradio as gr
import numpy as np
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor
from transformers.image_utils import load_image

# Hugging Face checkpoint used for both the processor and the model.
MODEL_ID = "HuggingFaceTB/SmolVLM-Instruct"

# Upper bound on the number of tokens generated per answer. Without an
# explicit budget, generate() falls back to the generation config's default
# length limit, which is too small for a prompt that contains an image.
MAX_NEW_TOKENS = 500

# Set the device (GPU or CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize processor and model
try:
    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModelForVision2Seq.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2" if DEVICE == "cuda" else "eager",
    ).to(DEVICE)
except Exception as e:
    print(f"Error loading model or processor: {e}")
    # `exit` is injected by the `site` module for interactive use and may be
    # absent; raising SystemExit is the portable way to stop with status 1.
    raise SystemExit(1)


def answer_question(image, question):
    """Answer a natural-language question about an image.

    Args:
        image: The uploaded image, as a NumPy array (what the Gradio image
            input below delivers) or an already-built PIL image. ``None``
            means nothing was uploaded.
        question: The user's question text. ``None`` or a blank string is
            rejected.

    Returns:
        The model's answer as a string, or a message starting with
        ``"Error:"`` describing what went wrong. Failures are returned rather
        than raised so they show up in the UI's text output.
    """
    # Check if the image is provided
    if image is None:
        return "Error: Please upload an image."

    # Convert NumPy array to PIL Image if necessary
    if isinstance(image, np.ndarray):
        try:
            image = Image.fromarray(image)
        except Exception as e:
            return f"Error: Unable to process the image. {e}"

    # Ensure question is provided (guard against None as well as blank text)
    if not question or not question.strip():
        return "Error: Please provide a question."

    # Create input message for the model
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": question},
            ],
        },
    ]

    # Apply chat template (this assumes the processor has a chat-based input format)
    try:
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = processor(text=prompt, images=[image], return_tensors="pt").to(DEVICE)
    except Exception as e:
        return f"Error: Failed to prepare inputs. {e}"

    # Generate the answer
    try:
        outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
        # The generated sequence starts with the prompt tokens; drop them so
        # only the newly generated answer is decoded and shown to the user.
        prompt_length = inputs["input_ids"].shape[1]
        answer = processor.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        return answer.strip()
    except Exception as e:
        return f"Error: Failed to generate answer. {e}"


# Create Gradio interface
iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.Image(type="numpy"),
        gr.Textbox(lines=2, placeholder="Enter your question here..."),
    ],
    outputs="text",
    title="Image Question Answering",
    description="Upload an image and ask a question about it.",
)

if __name__ == "__main__":
    iface.launch()