# Pheye-x2 672x672 pixels

import gradio as gr
from huggingface_hub import InferenceClient
import json
from pheye_builder import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch
from PIL import Image
import os
import requests
import spaces

def get_config(hf_model_path: str) -> dict:
    """Download ``config.json`` from a Hugging Face Hub repo and parse it.

    Args:
        hf_model_path: Hub repository id handed to ``hf_hub_download``.

    Returns:
        The decoded JSON content of the repository's ``config.json``.
    """
    config_path = hf_hub_download(hf_model_path, "config.json")

    # JSON text is UTF-8; being explicit avoids depending on the platform's
    # default locale encoding when reading the downloaded file.
    with open(config_path, encoding="utf-8") as f:
        return json.load(f)


def get_model_path(hf_model_path):
    """Fetch ``checkpoint.pt`` from the given Hub repo.

    Returns whatever ``hf_hub_download`` returns for that file.
    """
    checkpoint_path = hf_hub_download(hf_model_path, "checkpoint.pt")
    return checkpoint_path


# Hub repository that provides both config.json and checkpoint.pt.
HF_MODEL = "miguelcarv/Pheye-x2-672"
config = get_config(HF_MODEL)

print("Got config")

# NOTE(security): the three *_dtype entries come straight from the downloaded
# config and are evaluated with eval(), so whatever expression the repo's
# config.json contains gets executed. Only point HF_MODEL at a trusted repo.
# (Presumably the values are strings such as "torch.float16" -- confirm.)
_builder_kwargs = {
    "clip_vision_encoder_path": config["encoder"],
    "lang_decoder_path": config["decoder"],
    "tokenizer_path": config["tokenizer"],
    "cross_attn_every_n_layers": config["cross_interval"],
    "level": config["level"],
    "reduce_factor": config["reduce"],
    "from_layer": config["from_layer"],
    "encoder_dtype": eval(config["encoder_dtype"]),
    "decoder_dtype": eval(config["decoder_dtype"]),
    "dtype": eval(config["other_params_dtype"]),
}
model, tokenizer = create_model_and_transforms(**_builder_kwargs)

# The adapter is optional and driven by the config's "first_level" flag.
if config["first_level"]:
    model.vision_encoder.add_first_level_adapter()

print("Created model")

DEVICE = 'cuda'
model_path = get_model_path(HF_MODEL)
# The checkpoint is deserialized onto the CPU first; the populated model is
# moved to DEVICE afterwards.
# NOTE(review): torch.load unpickles the file, so the checkpoint must come
# from a trusted source.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model = model.to(DEVICE)

print("Loaded model")

# Fixed preamble placed in front of every user instruction.
SYSTEM_PROMPT = "You are an AI visual assistant and you are seeing a single image. You will receive an instruction regarding that image. Your goal is to follow the instruction as faithfully as you can."


def _load_example_image(url, timeout=30):
    """Download ``url`` and return its content as an RGB PIL image.

    Args:
        url: HTTP(S) location of the image file.
        timeout: Seconds to wait for the server before giving up, so a dead
            host cannot hang application start-up indefinitely.

    Returns:
        The downloaded image converted to RGB mode.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    from io import BytesIO

    response = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of handing an error page to PIL.
    response.raise_for_status()
    # response.content is fully read and content-decoded, unlike the raw
    # socket stream, and reading it releases the connection.
    return Image.open(BytesIO(response.content)).convert('RGB')


# Example images shown in the demo UI.
whiteboard = _load_example_image("https://c1.staticflickr.com/7/6168/6207108414_a8833f410e_o.jpg")
taxi_image = _load_example_image("https://llava.hliu.cc/file=/nobackup/haotian/tmp/gradio/ca10383cc943e99941ecffdc4d34c51afb2da472/extreme_ironing.jpg")

@spaces.GPU
def generate_answer(img, question, max_new_tokens, num_beams):
    """Answer ``question`` about ``img`` with the loaded model.

    Args:
        img: The image supplied by the UI (the interface requests PIL images).
        question: Instruction text inserted into the prompt.
        max_new_tokens: Upper bound on generated tokens; coerced to ``int``.
        num_beams: Beam-search width; coerced to ``int``.

    Returns:
        The decoded text that follows the prompt's final ``Output:`` marker,
        with leading whitespace removed.

    Raises:
        gr.Error: If no image was provided.
    """
    if img is None:
        # Surface a readable message in the UI instead of failing deep inside
        # the model with a None image.
        raise gr.Error("Please provide an image before asking a question.")

    # UI slider values may arrive as floats; generation expects whole numbers.
    max_new_tokens = int(max_new_tokens)
    num_beams = int(num_beams)

    image = [img]
    prompt = f"{SYSTEM_PROMPT}\n\nInstruction: {question}\nOutput:"
    inputs = tokenizer([prompt], padding='longest', return_tensors='pt')
    print("Generating a response with the following parameters:")
    print(f"""Question: {question}\nMax New Tokens: {max_new_tokens}\nNum Beams: {num_beams}""")

    model.eval()
    with torch.no_grad():
        outputs = model.generate(vision_x=image,
                                 lang_x=inputs.input_ids.to(DEVICE),
                                 device=DEVICE,
                                 max_new_tokens=max_new_tokens,
                                 num_beams=num_beams,
                                 eos_token_id=tokenizer.eos_token_id,
                                 pad_token_id=tokenizer.pad_token_id,
                                 attention_mask=inputs.attention_mask.to(DEVICE))

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    # Split only as many times as "Output:" occurs in the prompt itself, so an
    # answer that happens to contain "Output:" is returned whole rather than
    # being cut at its last occurrence.
    marker_count = prompt.count("Output:")
    answer = decoded.split("Output:", marker_count)[-1].lstrip()

    return answer


# Assemble the Gradio UI. Components are created in the same order as the
# arguments of generate_answer: image, question, max new tokens, num beams.
_input_components = [
    gr.Image(type="pil", label="Image"),
    gr.Textbox(label="Question"),
    gr.Slider(minimum=5, maximum=500, step=1, value=50, label="Max New Tokens"),
    gr.Slider(minimum=1, maximum=5, step=1, value=3, label="Num Beams"),
]
_answer_box = gr.Textbox(label="Answer")
_page_title = "<h1 style='text-align: center; display: block;'>Pheye-x2 672x672 pixels</h1>"
# Each example row lists one value per input component, in input order.
_example_rows = [
    [taxi_image, "What is unusual about this image?", 500, 3],
    [whiteboard, "What is the main topic of the whiteboard?", 500, 3],
]

iface = gr.Interface(
    fn=generate_answer,
    inputs=_input_components,
    outputs=_answer_box,
    title=_page_title,
    examples=_example_rows,
)


if __name__ == "__main__":
    # Start the web app only when this file is executed as a script.
    iface.launch()