import spaces
import torch
import gradio as gr
from PIL import Image
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor

# Model and processor initialization.
# QVQ-72B-Preview uses the Qwen2-VL architecture, so it is loaded with
# Qwen2VLForConditionalGeneration (AutoModelForCausalLM has no mapping for
# this model type). The processor bundles the tokenizer and the image
# preprocessor, so no separate AutoTokenizer is needed.
model_name = "Qwen/QVQ-72B-Preview"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)

processor = AutoProcessor.from_pretrained(model_name)

# Footer
footer = """

<div style="text-align: center;">Powered by QVQ-72B Model</div>

""" # Vision model function @spaces.GPU() def process_image(image, text_input=None): try: # Convert image to PIL format if needed if not isinstance(image, Image.Image): image = Image.fromarray(image).convert("RGB") # Prepare messages if not text_input: text_input = "Please describe this image in detail." messages = [ { "role": "system", "content": "You are a helpful and harmless assistant." }, { "role": "user", "content": [ {"image": image}, {"text": text_input} ] } ] # Process inputs response = model.chat(tokenizer, messages) return response except Exception as e: return f"Error processing image: {str(e)}" # CSS styling css = """ footer { visibility: hidden; } """ # Gradio interface with gr.Blocks(theme="Yntec/HaleyCH_Theme_Orange", css=css) as demo: with gr.Row(): input_img = gr.Image(label="Input Image") with gr.Row(): text_input = gr.Textbox(label="Question (Optional)") with gr.Row(): submit_btn = gr.Button(value="Submit") with gr.Row(): output_text = gr.Textbox(label="Response") submit_btn.click(process_image, [input_img, text_input], [output_text]) gr.HTML(footer) # Launch the app demo.launch(debug=True)