import os
import sys

import torch
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Load environment variables from a .env file, if present
load_dotenv()

# Point the Hugging Face cache at a writable directory
cache_dir = os.getenv("HF_HOME", "/tmp/huggingface_cache")
os.makedirs(cache_dir, exist_ok=True)
os.environ["HF_HOME"] = cache_dir

# Retrieve the Hugging Face API token from the environment
api_token = os.getenv("HF_API_TOKEN")
if not api_token:
    print("API token is not set. Please set the 'HF_API_TOKEN' environment variable.")
    sys.exit(1)

# Log in to Hugging Face with the token
try:
    login(api_token)
    print("Successfully logged in to Hugging Face.")
except Exception as e:
    print(f"Failed to log in to Hugging Face: {e}")
    sys.exit(1)

# Model and tokenizer name
model_name = "Ouiam123/Llama-2-7b-chat-finetune-tourism"

# Check whether a CUDA GPU is available (informational only; device_map="auto"
# below handles the actual placement of the model weights)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"CUDA available: {device == 'cuda'}")

try:
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Load the model with 4-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        device_map="auto",
    )

    # Tokenize the prompt and move it to the same device as the model
    input_text = "What should I do if I get lost in Morocco?"
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate a response with beam search
    outputs = model.generate(
        **inputs,
        max_length=100,
        num_beams=5,
        early_stopping=True,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Response:", response)

except Exception as e:
    print(f"An error occurred: {e}")
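
# For reference, a minimal .env file that load_dotenv() above could pick up
# might look like the sketch below. The values are placeholders, not real
# credentials, and HF_HOME is optional (it defaults to /tmp/huggingface_cache
# in this script):
#
#   HF_API_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxx
#   HF_HOME=/tmp/huggingface_cache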