import os
import sys

import torch
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Load environment variables from a .env file, if present
load_dotenv()

# Point the Hugging Face cache at a writable directory
cache_dir = os.getenv("HF_HOME", "/tmp/huggingface_cache")
os.makedirs(cache_dir, exist_ok=True)
os.environ["HF_HOME"] = cache_dir

# Retrieve the Hugging Face API token from the environment
api_token = os.getenv("HF_API_TOKEN")
if not api_token:
    print("API token is not set. Please set the 'HF_API_TOKEN' environment variable.")
    sys.exit(1)

# Log in to Hugging Face with the token
try:
    login(api_token)
    print("Successfully logged in to Hugging Face.")
except Exception as e:
    print(f"Failed to log in to Hugging Face: {e}")
    sys.exit(1)

# Model and tokenizer name
model_name = "Ouiam123/Llama-2-7b-chat-finetune-tourism"

# Check whether a CUDA GPU is available (informational only; device_map="auto"
# below handles the actual placement of the model weights)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"CUDA available: {device == 'cuda'}")

try:
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load the model with 4-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        device_map="auto",
    )

    # Tokenize the prompt and move it to the same device as the model
    input_text = "What should I do if I get lost in Morocco?"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate a response with beam search
    outputs = model.generate(
        **inputs,
        max_length=100,
        num_beams=5,
        early_stopping=True,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Response:", response)

except Exception as e:
    print(f"An error occurred: {e}")
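
# For reference, a minimal .env file that load_dotenv() above could pick up
# might look like the sketch below. The values are placeholders, not real
# credentials, and HF_HOME is optional (it defaults to /tmp/huggingface_cache
# in this script):
#
#   HF_API_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxx
#   HF_HOME=/tmp/huggingface_cache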