"""SpaceLlama3.1 demo gradio app."""

import datetime
import logging
import os

import gradio as gr
import torch
import PIL.Image
from prismatic import load
from huggingface_hub import login

# Authenticate with the Hugging Face Hub
def authenticate_huggingface():
    hf_token = os.getenv("HF_TOKEN")
    if hf_token:
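        # login() registers the token with huggingface_hub so gated model downloads succeed.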
        login(token=hf_token)
    else:
        raise ValueError("Hugging Face API token not found. Please set it as an environment variable named 'HF_TOKEN'.")

# Call the authentication function once at the start
authenticate_huggingface()

INTRO_TEXT = """SpaceLlama3.1 demo\n\n
| [Model](https://huggingface.co/remyxai/SpaceLlama3.1) 
| [GitHub](https://github.com/remyxai/VQASynth/tree/main) 
| [Demo](https://huggingface.co/spaces/remyxai/SpaceLlama3.1) 
| [Discord](https://discord.gg/DAy3P5wYJk) 
\n\n
**This is an experimental research model.** Make sure to add appropriate guardrails when using the model for applications.
"""

# Set model location as a constant outside the function
MODEL_LOCATION = "remyxai/SpaceLlama3.1"  # Update as needed

# Global model variable
global_model = None

def load_model():
    """Loads the model globally."""
    global global_model
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    global_model = load(MODEL_LOCATION)
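    # bfloat16 halves memory use relative to float32; note that CPU inference will be slow.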
    global_model.to(device, dtype=torch.bfloat16)
    logging.info("Model loaded successfully.")

def compute(image, prompt):
    """Runs model inference."""
    if image is None:
        raise gr.Error("Image required")
    if global_model is None:
        raise gr.Error("Model not loaded; call load_model() before launching the app.")

    logging.info('prompt="%s"', prompt)

    # Open the image file
    if isinstance(image, str):
        image = PIL.Image.open(image).convert("RGB")

    # Use the globally loaded model
    vlm = global_model

    # Prepare prompt
    prompt_builder = vlm.get_prompt_builder()
    prompt_builder.add_turn(role="human", message=prompt)
    prompt_text = prompt_builder.get_prompt()
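    # prompt_text now carries the user message rendered in the model's chat template.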

    # Generate the text based on image and prompt
    generated_text = vlm.generate(
        image,
        prompt_text,
        do_sample=True,
        temperature=0.1,
        max_new_tokens=512,
        min_length=1,
    )
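    # Keep only the text before the end-of-sequence marker left in the decoded output.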
    output = generated_text.split("</s>")[0]

    logging.info('output="%s"', output)

    return output  # Plain string for the Textbox component.

def reset():
    """Resets the input fields."""
    return "", None

def create_app():
    """Creates demo UI."""

    with gr.Blocks() as demo:
        # Main UI structure
        gr.Markdown(INTRO_TEXT)
        with gr.Row():
            image = gr.Image(value=None, label="Image", type="filepath", visible=True)  # input
            with gr.Column():
                prompt = gr.Textbox(value="", label="Prompt", visible=True)
                model_info = gr.Markdown(label="Model Info")
                run = gr.Button("Run", variant="primary")
                clear = gr.Button("Clear")
                output_text = gr.Textbox(value="", label="Output", visible=True)

        # Button event handlers
        run.click(
            fn=compute,
            inputs=[image, prompt],
            outputs=output_text,
        )
        clear.click(fn=reset, inputs=None, outputs=[prompt, image])

        # Status
        status = gr.Markdown(f"Startup: {datetime.datetime.now()}")
        gpu_kind = gr.Markdown(f"GPU=?")
        demo.load(
            fn=lambda: f"Model `{MODEL_LOCATION}` loaded.",  # Ensure the output is a string
            inputs=None,
            outputs=model_info,
        )

    return demo

if __name__ == "__main__":

    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
    )

    # Log the environment for debugging, redacting anything that looks like a secret.
    for k, v in os.environ.items():
        if any(s in k.upper() for s in ("TOKEN", "SECRET", "KEY", "PASSWORD")):
            v = "<redacted>"
        logging.info('environ["%s"] = %r', k, v)

    # Load the model once globally
    load_model()

    create_app().queue().launch()
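
# Usage (the token value is a placeholder):
#   HF_TOKEN=<your-token> python app.py
# Gradio serves the app on http://127.0.0.1:7860 by default.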