# app.py
import spaces
import os
import gradio as gr
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModelForCausalLM
import timm
from torchvision import transforms
from peft import PeftModel


# 1. Model Definitions (same as in the training script)
class SigLIPImageEncoder(torch.nn.Module):
    def __init__(self, model_name='resnet50', embed_dim=512, pretrained_path=None):
        super().__init__()
        # pretrained=False: weights are restored from the checkpoint below
        self.model = timm.create_model(model_name, pretrained=False, num_classes=0, global_pool='avg')
        self.embed_dim = embed_dim
        self.projection = torch.nn.Linear(self.model.num_features, embed_dim)
        if pretrained_path:
            # Load to CPU first; the module is moved to the target device by the caller
            self.load_state_dict(torch.load(pretrained_path, map_location=torch.device('cpu')))
            print(f"Loaded SigLIP image encoder from {pretrained_path}")
        else:
            print("Initialized SigLIP image encoder without pretrained weights.")

    def forward(self, image):
        features = self.model(image)
        embedding = self.projection(features)
        return embedding


# 2. Load Models and Tokenizer
peft_model_path = "./qlora_phi3_model"
image_model_name = 'resnet50'
image_embed_dim = 512
siglip_pretrained_path = "image_encoder.pth"  # Path to the pretrained SigLIP image encoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load tokenizer
text_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)
text_tokenizer.pad_token = text_tokenizer.eos_token  # Phi-3 has no dedicated pad token

# Image transformations (ImageNet normalization statistics)
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load SigLIP image encoder
image_encoder = SigLIPImageEncoder(model_name=image_model_name, embed_dim=image_embed_dim,
                                   pretrained_path=siglip_pretrained_path).to(device)
image_encoder.eval()  # Evaluation mode for inference

# Load the Phi-3 base model with transformers. An earlier llama.cpp path using the
# QuantFactory/Phi-3-mini-4k-instruct-GGUF quantized model was dropped in favor of
# transformers + PEFT, so the QLoRA adapter can be merged into the base weights.
base_model_name = "microsoft/Phi-3-mini-4k-instruct"
base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float32, device_map="auto")

# Load the QLoRA adapter and merge it into the base model
model = PeftModel.from_pretrained(base_model, peft_model_path, offload_dir='./offload')
model = model.merge_and_unload()
print("Phi-3 model loaded successfully")
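# Optional sanity check (a minimal sketch; uncomment to confirm the merged model
# generates text before launching the Gradio app):
#
#   test_inputs = text_tokenizer("Hello, world.", return_tensors="pt").to(model.device)
#   with torch.no_grad():
#       test_ids = model.generate(**test_inputs, max_new_tokens=16)
#   print(text_tokenizer.decode(test_ids[0], skip_special_tokens=True))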
# 3. Inference Function
@spaces.GPU
def predict(image, question):
    """
    Takes an image and a question as input and returns an answer.
    """
    if image is None or not question:
        return "Please provide both an image and a question."
    try:
        image = Image.fromarray(image).convert("RGB")
        image = image_transform(image).unsqueeze(0).to(device)

        # Get image embeddings
        with torch.no_grad():
            image_embeddings = image_encoder(image)

        # Flatten the embeddings and serialize them into the prompt text,
        # rounded so the prompt stays within Phi-3's 4k context window
        image_embeddings = [round(v, 3) for v in image_embeddings.flatten().tolist()]

        # Create the prompt with image embeddings
        prompt = f"Question: {question}\nImage Embeddings: {image_embeddings}\nAnswer:"

        # Generate an answer with the merged transformers model
        # (the previous llama.cpp-style call does not work on a transformers model)
        inputs = text_tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=128,
                pad_token_id=text_tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens
        answer = text_tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:],
                                       skip_special_tokens=True).strip()
        return answer
    except Exception as e:
        return f"An error occurred: {str(e)}"


# 4. Gradio Interface
iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(label="Upload an Image"),
        gr.Textbox(label="Ask a Question about the Image", placeholder="What is in the image?")
    ],
    outputs=gr.Textbox(label="Answer"),
    title="Image Question Answering with Phi-3 and SigLIP",
    description="Ask questions about an image and get answers powered by Phi-3 (transformers + QLoRA) and SigLIP.",
    examples=[
        ["cat_0006.png", "Create an interesting story about this image."],
        ["bird_0004.png", "Can you describe this image?"],
        ["truck_0003.png", "Elaborate on the setting of the image."],
        ["ship_0007.png", "Explain the purpose of the image."]
    ]
)

# 5. Launch the App
if __name__ == "__main__":
    iface.launch()
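# Example of querying the running app programmatically (a hedged sketch; assumes
# gradio_client is installed and the app is serving on the default local port):
#
#   from gradio_client import Client, handle_file
#   client = Client("http://127.0.0.1:7860")
#   answer = client.predict(
#       handle_file("cat_0006.png"),
#       "Can you describe this image?",
#       api_name="/predict",
#   )
#   print(answer)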