import gradio as gr
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch
import spaces

processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
).to("cuda")

# Prefix prepended to every user query to nudge the model toward step-by-step reasoning.
DEFAULT_ASSISTANT_PREFIX = "Let's think step by step:"
DEFAULT_DECODING_STRATEGY = "Top P Sampling"
DEFAULT_TEMPERATURE = 0.4
DEFAULT_MAX_NEW_TOKENS = 512
DEFAULT_REPETITION_PENALTY = 1.2
DEFAULT_TOP_P = 0.8


@spaces.GPU
def model_inference(images, text):
    # Validate the text and image queries coming from the user.
    if text == "" and not images:
        return "Please input a query and optionally image(s)."
    if text == "" and images:
        return "Please input a text query along with the image(s)."
    if not images:
        # Without this guard, iterating over `images` below raises a TypeError
        # when the user submits text but no image.
        return "Please upload an image along with your question."

    if isinstance(images, Image.Image):
        images = [images]

    text = f"{DEFAULT_ASSISTANT_PREFIX} {text}"

    # Build a chat-style message with one image placeholder per input image,
    # followed by the text query.
    resulting_messages = [
        {
            "role": "user",
            "content": [{"type": "image"} for _ in images]
            + [{"type": "text", "text": text}],
        }
    ]

    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=images, return_tensors="pt")
    inputs = {k: v.to("cuda") for k, v in inputs.items()}

    generation_args = {
        "max_new_tokens": DEFAULT_MAX_NEW_TOKENS,
        "repetition_penalty": DEFAULT_REPETITION_PENALTY,
        "temperature": DEFAULT_TEMPERATURE,
        "do_sample": DEFAULT_DECODING_STRATEGY == "Top P Sampling",
        "top_p": DEFAULT_TOP_P if DEFAULT_DECODING_STRATEGY == "Top P Sampling" else None,
    }
    generation_args.update(inputs)

    generated_ids = model.generate(**generation_args)
    # Slice off the prompt tokens so only the newly generated answer is decoded.
    generated_texts = processor.batch_decode(
        generated_ids[:, generation_args["input_ids"].size(1):],
        skip_special_tokens=True,
    )
    return generated_texts[0]


examples = [
    ["image1.jpeg", "What does this painting tell us? Explain in detail."],
    ["image2.jpg", "What does this painting tell us? Explain in detail."],
    ["image3.jpg", "Describe the scene in this picture."],
]

with gr.Blocks() as demo:
    gr.Markdown("## SmolVLM Vision Instruct Demo with Example Inputs")
    with gr.Row():
        with gr.Column():
            image_input = gr.Image(label="Input Picture", type="pil", interactive=True)
            query_input = gr.Textbox(label="Question", interactive=True)
            submit_btn = gr.Button("Submit")
        with gr.Column():
            output_text = gr.Textbox(label="Output Text", interactive=False)

    gr.Examples(
        examples=examples,
        inputs=[image_input, query_input],
        outputs=output_text,
        fn=model_inference,
    )

    submit_btn.click(
        model_inference,
        inputs=[image_input, query_input],
        outputs=output_text,
    )

demo.launch(debug=True)
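
# Usage note (a sketch, not part of the original app): the @spaces.GPU decorator
# assumes this script runs as a Hugging Face ZeroGPU Space. To try it locally on
# a CUDA machine, you could drop the decorator and call the function directly:
#
#   from PIL import Image
#   img = Image.open("image1.jpeg")  # hypothetical local file
#   print(model_inference(img, "Describe the scene in this picture."))
#
# The example paths in `examples` (image1.jpeg, image2.jpg, image3.jpg) are
# assumed to sit next to this script; replace them with your own images.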